Update to libjpeg_turbo 1.4.90

(Duplicate of https://codereview.chromium.org/1939823002/ for landing.)

TBR=noel@chromium.org,thakis@chromium.org
BUG=608347, 398235, 591927

Review URL: https://codereview.chromium.org/1953443002 .
diff --git a/BUILDING.md b/BUILDING.md
new file mode 100644
index 0000000..727d728
--- /dev/null
+++ b/BUILDING.md
@@ -0,0 +1,867 @@
+Building on Un*x Platforms (including Cygwin and OS X)
+=======================================================
+
+
+Build Requirements
+------------------
+
+- autoconf 2.56 or later
+
+- automake 1.7 or later
+
+- libtool 1.4 or later
+  * If using Xcode 4.3 or later on OS X, autoconf and automake are no longer
+    provided.  The easiest way to obtain them is from
+    [MacPorts](http://www.MacPorts.org).
+
+- NASM or YASM (if building x86 or x86-64 SIMD extensions)
+  * NASM 0.98, or 2.01 or later is required for an x86 build (0.99 and 2.00 do
+    not work properly with libjpeg-turbo's x86 SIMD code.)
+  * NASM 2.00 or later is required for an x86-64 build.
+  * NASM 2.07, or 2.11.09 or later is required for an x86-64 Mac build
+    (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD code when
+    building macho64 objects.)  NASM or YASM can be obtained from
+    [MacPorts](http://www.MacPorts.org).
+
+  The binary RPMs released by the NASM project do not work on older Linux
+  systems, such as Red Hat Enterprise Linux 4.  On such systems, you can
+  easily build and install NASM from a source RPM by downloading one of the
+  SRPMs from
+
+  http://www.nasm.us/pub/nasm/releasebuilds
+
+  and executing the following as root:
+
+        ARCH=`uname -m`
+        rpmbuild --rebuild nasm-{version}.src.rpm
+        rpm -Uvh /usr/src/redhat/RPMS/$ARCH/nasm-{version}.$ARCH.rpm
+
+  NOTE: the NASM build will fail if texinfo is not installed.
+
+- GCC v4.1 (or later) or clang recommended for best performance
+
+- If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
+  required.  Some systems, such as Solaris 10 and later and Red Hat Enterprise
+  Linux 5 and later, have this pre-installed.  On OS X 10.5 and 10.6, it will
+  be necessary to install the Java Developer Package, which can be downloaded
+  from http://developer.apple.com/downloads (Apple ID required.)  For other
+  systems, you can obtain the Oracle Java Development Kit from
+  http://www.java.com.
+
+
+Out-of-Tree Builds
+------------------
+
+Binary objects, libraries, and executables are generated in the same directory
+from which `configure` was executed (the "binary directory"), and this
+directory need not necessarily be the same as the libjpeg-turbo source
+directory.  You can create multiple independent binary directories, in which
+different versions of libjpeg-turbo can be built from the same source tree
+using different compilers or settings.  In the sections below,
+*{build_directory}* refers to the binary directory, whereas
+*{source_directory}* refers to the libjpeg-turbo source directory.  For in-tree
+builds, these directories are the same.
+
+
+Building libjpeg-turbo
+----------------------
+
+The following procedure will build libjpeg-turbo on Linux, FreeBSD, Cygwin, and
+Solaris/x86 systems (on Solaris, this generates a 32-bit library.  See below
+for 64-bit build instructions.)
+
+    cd {source_directory}
+    autoreconf -fiv
+    cd {build_directory}
+    sh {source_directory}/configure [additional configure flags]
+    make
+
+NOTE: Running autoreconf in the source directory is not necessary if building
+libjpeg-turbo from one of the official release tarballs.
+
+This will generate the following files under .libs/:
+
+**libjpeg.a**
+Static link library for the libjpeg API
+
+**libjpeg.so.{version}** (Linux, Unix)
+**libjpeg.{version}.dylib** (OS X)
+**cygjpeg-{version}.dll** (Cygwin)
+Shared library for the libjpeg API
+
+By default, *{version}* is 62.1.0, 7.1.0, or 8.0.2, depending on whether
+libjpeg v6b (default), v7, or v8 emulation is enabled.  If using Cygwin,
+*{version}* is 62, 7, or 8.
+
+**libjpeg.so** (Linux, Unix)
+**libjpeg.dylib** (OS X)
+Development symlink for the libjpeg API
+
+**libjpeg.dll.a** (Cygwin)
+Import library for the libjpeg API
+
+**libturbojpeg.a**
+Static link library for the TurboJPEG API
+
+**libturbojpeg.so.0.1.0** (Linux, Unix)
+**libturbojpeg.0.1.0.dylib** (OS X)
+**cygturbojpeg-0.dll** (Cygwin)
+Shared library for the TurboJPEG API
+
+**libturbojpeg.so** (Linux, Unix)
+**libturbojpeg.dylib** (OS X)
+Development symlink for the TurboJPEG API
+
+**libturbojpeg.dll.a** (Cygwin)
+Import library for the TurboJPEG API
+
+
+### libjpeg v7 or v8 API/ABI Emulation
+
+Add `--with-jpeg7` to the `configure` command line to build a version of
+libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add `--with-jpeg8`
+to the `configure` command to build a version of libjpeg-turbo that is
+API/ABI-compatible with libjpeg v8.  See [README.md](README.md) for more
+information on libjpeg v7 and v8 emulation.
+
+
+### In-Memory Source/Destination Managers
+
+When using libjpeg v6b or v7 API/ABI emulation, add `--without-mem-srcdst` to
+the `configure` command line to build a version of libjpeg-turbo that lacks the
+`jpeg_mem_src()` and `jpeg_mem_dest()` functions.  These functions were not
+part of the original libjpeg v6b and v7 APIs, so removing them ensures strict
+conformance with those APIs.  See [README.md](README.md) for more information.
+
+
+### Arithmetic Coding Support
+
+Since the patent on arithmetic coding has expired, this functionality has been
+included in this release of libjpeg-turbo.  libjpeg-turbo's implementation is
+based on the implementation in libjpeg v8, but it works when emulating libjpeg
+v7 or v6b as well.  The default is to enable both arithmetic encoding and
+decoding, but those who have philosophical objections to arithmetic coding can
+add `--without-arith-enc` or `--without-arith-dec` to the `configure` command
+line to disable encoding or decoding (respectively.)
+
+
+### TurboJPEG Java Wrapper
+
+Add `--with-java` to the `configure` command line to incorporate an optional
+Java Native Interface wrapper into the TurboJPEG shared library and build the
+Java front-end classes to support it.  This allows the TurboJPEG shared library
+to be used directly from Java applications.  See [java/README](java/README) for
+more details.
+
+You can set the `JAVAC`, `JAR`, and `JAVA` configure variables to specify
+alternate commands for javac, jar, and java (respectively.)  You can also
+set the `JAVACFLAGS` configure variable to specify arguments that should be
+passed to the Java compiler when building the front-end classes, and
+`JNI_CFLAGS` to specify arguments that should be passed to the C compiler when
+building the JNI wrapper.  Run `configure --help` for more details.
+
+
+Installing libjpeg-turbo
+------------------------
+
+If you intend to install these libraries and the associated header files, then
+replace 'make' in the instructions above with
+
+    make install prefix={base dir} libdir={library directory}
+
+For example,
+
+    make install prefix=/usr/local libdir=/usr/local/lib64
+
+will install the header files in /usr/local/include and the library files in
+/usr/local/lib64.  If `prefix` and `libdir` are not specified, then the default
+is to install the header files in /opt/libjpeg-turbo/include and the library
+files in /opt/libjpeg-turbo/lib32 (32-bit) or /opt/libjpeg-turbo/lib64
+(64-bit.)
+
+NOTE: You can specify a prefix of /usr and a libdir of, for instance,
+/usr/lib64 to overwrite the system's version of libjpeg.  If you do this,
+however, then be sure to BACK UP YOUR SYSTEM'S INSTALLATION OF LIBJPEG before
+overwriting it.  It is recommended that you instead install libjpeg-turbo into
+a non-system directory and manipulate the `LD_LIBRARY_PATH` or create symlinks
+to force applications to use libjpeg-turbo instead of libjpeg.  See
+[README.md](README.md) for more information.
+
+
+Build Recipes
+-------------
+
+
+### 32-bit Build on 64-bit Linux
+
+Add
+
+    --host i686-pc-linux-gnu CFLAGS='-O3 -m32' LDFLAGS=-m32
+
+to the `configure` command line.
+
+
+### 64-bit Build on 64-bit OS X
+
+Add
+
+    --host x86_64-apple-darwin NASM=/opt/local/bin/nasm
+
+to the `configure` command line.  NASM 2.07 or later from MacPorts must be
+installed.
+
+
+### 32-bit Build on 64-bit OS X
+
+Add
+
+    --host i686-apple-darwin CFLAGS='-O3 -m32' LDFLAGS=-m32
+
+to the `configure` command line.
+
+
+### 64-bit Backward-Compatible Build on 64-bit OS X
+
+Add
+
+    --host x86_64-apple-darwin NASM=/opt/local/bin/nasm \
+      CFLAGS='-mmacosx-version-min=10.5 -O3' \
+      LDFLAGS='-mmacosx-version-min=10.5'
+
+to the `configure` command line.  NASM 2.07 or later from MacPorts must be
+installed.
+
+
+### 32-bit Backward-Compatible Build on OS X
+
+Add
+
+    --host i686-apple-darwin \
+      CFLAGS='-mmacosx-version-min=10.5 -O3 -m32' \
+      LDFLAGS='-mmacosx-version-min=10.5 -m32'
+
+to the `configure` command line.
+
+
+### 64-bit Build on 64-bit Solaris
+
+Add
+
+    --host x86_64-pc-solaris CFLAGS='-O3 -m64' LDFLAGS=-m64
+
+to the `configure` command line.
+
+
+### 32-bit Build on 64-bit FreeBSD
+
+Add
+
+    --host i386-unknown-freebsd CC='gcc -B /usr/lib32' CFLAGS='-O3 -m32' \
+      LDFLAGS='-B/usr/lib32'
+
+to the `configure` command line.  NASM 2.07 or later from FreeBSD ports must be
+installed.
+
+
+### Oracle Solaris Studio
+
+Add
+
+    CC=cc
+
+to the `configure` command line.  libjpeg-turbo will automatically be built
+with the maximum optimization level (-xO5) unless you override `CFLAGS`.
+
+To build a 64-bit version of libjpeg-turbo using Oracle Solaris Studio, add
+
+    --host x86_64-pc-solaris CC=cc CFLAGS='-xO5 -m64' LDFLAGS=-m64
+
+to the `configure` command line.
+
+
+### MinGW Build on Cygwin
+
+Use CMake (see the MinGW-on-Cygwin entries under "Build Recipes" in the Windows section below.)
+
+
+ARM Support
+-----------
+
+This release of libjpeg-turbo can use ARM NEON SIMD instructions to accelerate
+JPEG compression/decompression by approximately 2-4x on ARMv7 and later
+platforms.  If libjpeg-turbo is configured on an ARM Linux platform, then the
+build system will automatically include the NEON SIMD routines, if they are
+supported.  Build instructions for other ARM-based platforms follow.
+
+
+### Building libjpeg-turbo for iOS
+
+iOS platforms, such as the iPhone and iPad, use ARM processors, some of which
+support NEON instructions.  Additional steps are required in order to build
+libjpeg-turbo for these platforms.
+
+
+#### Additional build requirements
+
+- [gas-preprocessor.pl](https://raw.githubusercontent.com/libjpeg-turbo/gas-preprocessor/master/gas-preprocessor.pl)
+  should be installed in a directory
+  that is in your `PATH`.
+
+
+#### ARM 32-bit Build (Xcode 4.6.x and earlier, LLVM-GCC)
+
+Set the following shell variables for simplicity:
+
+  *Xcode 4.2 and earlier*
+
+    IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
+
+  *Xcode 4.3 and later*
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+
+  *All Xcode versions*
+
+    IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+    IOS_GCC=$IOS_PLATFORMDIR/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
+
+  *ARMv6 (code will run on all iOS devices, not SIMD-accelerated)*
+  [NOTE: Requires Xcode 4.4.x or earlier]
+
+    IOS_CFLAGS="-march=armv6 -mcpu=arm1176jzf-s -mfpu=vfp"
+
+  *ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer)*
+
+    IOS_CFLAGS="-march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon"
+
+  *ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer)*
+  [NOTE: Requires Xcode 4.5 or later]
+
+    IOS_CFLAGS="-march=armv7s -mcpu=swift -mtune=swift -mfpu=neon"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+    --host arm-apple-darwin10 \
+      CC="$IOS_GCC" LD="$IOS_GCC" \
+      CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+      LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS"
+
+to the `configure` command line.
+
+
+#### ARM 32-bit Build (Xcode 5.0.x and later, Clang)
+
+Set the following shell variables for simplicity:
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+    IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+
+  *ARMv7 (code will run on iPhone 3GS-4S/iPad 1st-3rd Generation and newer)*
+
+    IOS_CFLAGS="-arch armv7"
+
+  *ARMv7s (code will run on iPhone 5/iPad 4th Generation and newer)*
+
+    IOS_CFLAGS="-arch armv7s"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+    --host arm-apple-darwin10 \
+      CC="$IOS_GCC" LD="$IOS_GCC" \
+      CFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+      LDFLAGS="-mfloat-abi=softfp -isysroot $IOS_SYSROOT $IOS_CFLAGS" \
+      CCASFLAGS="-no-integrated-as $IOS_CFLAGS"
+
+to the `configure` command line.
+
+
+#### ARMv8 64-bit Build (Xcode 5.0.x and later, Clang)
+
+Code will run on iPhone 5S/iPad Mini 2/iPad Air and newer.
+
+Set the following shell variables for simplicity:
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=$IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk
+    IOS_GCC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+    IOS_CFLAGS="-arch arm64"
+
+Follow the procedure under "Building libjpeg-turbo" above, adding
+
+    --host aarch64-apple-darwin \
+      CC="$IOS_GCC" LD="$IOS_GCC" \
+      CFLAGS="-isysroot $IOS_SYSROOT -O3 $IOS_CFLAGS" \
+      LDFLAGS="-isysroot $IOS_SYSROOT $IOS_CFLAGS"
+
+to the `configure` command line.
+
+
+NOTE:  You can also add `-miphoneos-version-min={version}` to `$IOS_CFLAGS`
+above in order to support older versions of iOS than the default version
+supported by the SDK.
+
+Once built, lipo can be used to combine the ARMv6, v7, v7s, and/or v8 variants
+into a universal library.
+
+
+### Building libjpeg-turbo for Android
+
+Building libjpeg-turbo for Android platforms requires the Android NDK
+(https://developer.android.com/tools/sdk/ndk) and autotools.  The following is
+a general recipe script that can be modified for your specific needs.
+
+    # Set these variables to suit your needs
+    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/ndk}
+    BUILD_PLATFORM={the platform name for the NDK package you installed--
+      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
+    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
+      toolchain directory under ${NDK_PATH}/toolchains/.}
+    ANDROID_VERSION={The minimum version of Android to support-- for example,
+      "16", "19", etc.  "21" or later is required for a 64-bit build.}
+
+    # 32-bit ARMv7 build
+    HOST=arm-linux-androideabi
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm
+    ANDROID_CFLAGS="-march=armv7-a -mfloat-abi=softfp -fprefetch-loop-arrays \
+      --sysroot=${SYSROOT}"
+
+    # 64-bit ARMv8 build
+    HOST=aarch64-linux-android
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm64
+    ANDROID_CFLAGS="--sysroot=${SYSROOT}"
+
+    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
+    ANDROID_INCLUDES="-I${SYSROOT}/usr/include -I${TOOLCHAIN}/include"
+    export CPP=${TOOLCHAIN}/bin/${HOST}-cpp
+    export AR=${TOOLCHAIN}/bin/${HOST}-ar
+    export AS=${TOOLCHAIN}/bin/${HOST}-as
+    export NM=${TOOLCHAIN}/bin/${HOST}-nm
+    export CC=${TOOLCHAIN}/bin/${HOST}-gcc
+    export LD=${TOOLCHAIN}/bin/${HOST}-ld
+    export RANLIB=${TOOLCHAIN}/bin/${HOST}-ranlib
+    export OBJDUMP=${TOOLCHAIN}/bin/${HOST}-objdump
+    export STRIP=${TOOLCHAIN}/bin/${HOST}-strip
+    cd {build_directory}
+    sh {source_directory}/configure --host=${HOST} \
+      CFLAGS="${ANDROID_INCLUDES} ${ANDROID_CFLAGS} -O3 -fPIE" \
+      CPPFLAGS="${ANDROID_INCLUDES} ${ANDROID_CFLAGS}" \
+      LDFLAGS="${ANDROID_CFLAGS} -pie" --with-simd ${1+"$@"}
+    make
+
+If building for Android 4.0.x (API level < 16) or earlier, remove `-fPIE` from
+`CFLAGS` and `-pie` from `LDFLAGS`.
+
+
+Building on Windows (Visual C++ or MinGW)
+=========================================
+
+
+Build Requirements
+------------------
+
+- [CMake](http://www.cmake.org) v2.8.8 or later
+
+- Microsoft Visual C++ 2005 or later
+
+  If you don't already have Visual C++, then the easiest way to get it is by
+  installing the Windows SDK:
+
+  http://msdn.microsoft.com/en-us/windows/bb980924.aspx
+
+  The Windows SDK includes both 32-bit and 64-bit Visual C++ compilers and
+  everything necessary to build libjpeg-turbo.
+
+  * You can also use Microsoft Visual Studio Express Edition, which is a free
+    download.  (NOTE: versions prior to 2012 can only be used to build 32-bit
+    code.)
+  * If you intend to build libjpeg-turbo from the command line, then add the
+    appropriate compiler and SDK directories to the `INCLUDE`, `LIB`, and
+    `PATH` environment variables.  This is generally accomplished by
+    executing `vcvars32.bat` or `vcvars64.bat` and `SetEnv.cmd`.
+    `vcvars32.bat` and `vcvars64.bat` are part of Visual C++ and are located in
+    the same directory as the compiler.  `SetEnv.cmd` is part of the Windows
+    SDK.  You can pass optional arguments to `SetEnv.cmd` to specify a 32-bit
+    or 64-bit build environment.
+
+... OR ...
+
+- MinGW
+
+  MinGW-builds (http://sourceforge.net/projects/mingwbuilds/) or
+  tdm-gcc (http://tdm-gcc.tdragon.net/) recommended if building on a Windows
+  machine.  Both distributions install a Start Menu link that can be used to
+  launch a command prompt with the appropriate compiler paths automatically
+  set.
+
+- [NASM](http://www.nasm.us/) 0.98 or later (NASM 2.05 or later is required for
+  a 64-bit build)
+
+- If building the TurboJPEG Java wrapper, JDK 1.5 or later is required.  This
+  can be downloaded from http://www.java.com.
+
+
+Out-of-Tree Builds
+------------------
+
+Binary objects, libraries, and executables are generated in the same directory
+from which `cmake` was executed (the "binary directory"), and this directory
+need not necessarily be the same as the libjpeg-turbo source directory.  You
+can create multiple independent binary directories, in which different versions
+of libjpeg-turbo can be built from the same source tree using different
+compilers or settings.  In the sections below, *{build_directory}* refers to
+the binary directory, whereas *{source_directory}* refers to the libjpeg-turbo
+source directory.  For in-tree builds, these directories are the same.
+
+
+Building libjpeg-turbo
+----------------------
+
+
+### Visual C++ (Command Line)
+
+    cd {build_directory}
+    cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release {source_directory}
+    nmake
+
+This will build either a 32-bit or a 64-bit version of libjpeg-turbo, depending
+on which version of cl.exe is in the `PATH`.
+
+The following files will be generated under *{build_directory}*:
+
+**jpeg-static.lib**
+Static link library for the libjpeg API
+
+**sharedlib/jpeg{version}.dll**
+DLL for the libjpeg API
+
+**sharedlib/jpeg.lib**
+Import library for the libjpeg API
+
+**turbojpeg-static.lib**
+Static link library for the TurboJPEG API
+
+**turbojpeg.dll**
+DLL for the TurboJPEG API
+
+**turbojpeg.lib**
+Import library for the TurboJPEG API
+
+*{version}* is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
+v8 emulation is enabled.
+
+
+### Visual C++ (IDE)
+
+Choose the appropriate CMake generator option for your version of Visual Studio
+(run `cmake` with no arguments for a list of available generators.)  For
+instance:
+
+    cd {build_directory}
+    cmake -G "Visual Studio 10" {source_directory}
+
+NOTE:  Add "Win64" to the generator name (for example, "Visual Studio 10
+Win64") to build a 64-bit version of libjpeg-turbo.  Recent versions of CMake
+no longer document that.  A separate build directory must be used for 32-bit
+and 64-bit builds.
+
+You can then open ALL_BUILD.vcproj in Visual Studio and build one of the
+configurations in that project ("Debug", "Release", etc.) to generate a full
+build of libjpeg-turbo.
+
+This will generate the following files under *{build_directory}*:
+
+**{configuration}/jpeg-static.lib**
+Static link library for the libjpeg API
+
+**sharedlib/{configuration}/jpeg{version}.dll**
+DLL for the libjpeg API
+
+**sharedlib/{configuration}/jpeg.lib**
+Import library for the libjpeg API
+
+**{configuration}/turbojpeg-static.lib**
+Static link library for the TurboJPEG API
+
+**{configuration}/turbojpeg.dll**
+DLL for the TurboJPEG API
+
+**{configuration}/turbojpeg.lib**
+Import library for the TurboJPEG API
+
+*{configuration}* is Debug, Release, RelWithDebInfo, or MinSizeRel, depending
+on the configuration you built in the IDE, and *{version}* is 62, 7, or 8,
+depending on whether libjpeg v6b (default), v7, or v8 emulation is enabled.
+
+
+### MinGW
+
+NOTE: This assumes that you are building on a Windows machine.  If you are
+cross-compiling on a Linux/Unix machine, then see "Build Recipes" below.
+
+    cd {build_directory}
+    cmake -G "MinGW Makefiles" {source_directory}
+    mingw32-make
+
+This will generate the following files under *{build_directory}*:
+
+**libjpeg.a**
+Static link library for the libjpeg API
+
+**sharedlib/libjpeg-{version}.dll**
+DLL for the libjpeg API
+
+**sharedlib/libjpeg.dll.a**
+Import library for the libjpeg API
+
+**libturbojpeg.a**
+Static link library for the TurboJPEG API
+
+**libturbojpeg.dll**
+DLL for the TurboJPEG API
+
+**libturbojpeg.dll.a**
+Import library for the TurboJPEG API
+
+*{version}* is 62, 7, or 8, depending on whether libjpeg v6b (default), v7, or
+v8 emulation is enabled.
+
+
+### Debug Build
+
+Add `-DCMAKE_BUILD_TYPE=Debug` to the `cmake` command line.  Or, if building
+with NMake, remove `-DCMAKE_BUILD_TYPE=Release` (Debug builds are the default
+with NMake.)
+
+
+### libjpeg v7 or v8 API/ABI Emulation
+
+Add `-DWITH_JPEG7=1` to the `cmake` command line to build a version of
+libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add `-DWITH_JPEG8=1`
+to the `cmake` command line to build a version of libjpeg-turbo that is
+API/ABI-compatible with libjpeg v8.  See [README.md](README.md) for more
+information on libjpeg v7 and v8 emulation.
+
+
+### In-Memory Source/Destination Managers
+
+When using libjpeg v6b or v7 API/ABI emulation, add `-DWITH_MEM_SRCDST=0` to
+the `cmake` command line to build a version of libjpeg-turbo that lacks the
+`jpeg_mem_src()` and `jpeg_mem_dest()` functions.  These functions were not
+part of the original libjpeg v6b and v7 APIs, so removing them ensures strict
+conformance with those APIs.  See [README.md](README.md) for more information.
+
+
+### Arithmetic Coding Support
+
+Since the patent on arithmetic coding has expired, this functionality has been
+included in this release of libjpeg-turbo.  libjpeg-turbo's implementation is
+based on the implementation in libjpeg v8, but it works when emulating libjpeg
+v7 or v6b as well.  The default is to enable both arithmetic encoding and
+decoding, but those who have philosophical objections to arithmetic coding can
+add `-DWITH_ARITH_ENC=0` or `-DWITH_ARITH_DEC=0` to the `cmake` command line to
+disable encoding or decoding (respectively.)
+
+
+### TurboJPEG Java Wrapper
+
+Add `-DWITH_JAVA=1` to the `cmake` command line to incorporate an optional Java
+Native Interface wrapper into the TurboJPEG shared library and build the Java
+front-end classes to support it.  This allows the TurboJPEG shared library to
+be used directly from Java applications.  See [java/README](java/README) for
+more details.
+
+If you are using CMake 2.8, you can set the `Java_JAVAC_EXECUTABLE`,
+`Java_JAVA_EXECUTABLE`, and `Java_JAR_EXECUTABLE` CMake variables to specify
+alternate commands or locations for javac, java, and jar (respectively.)  You
+can also set the `JAVACFLAGS` CMake variable to specify arguments that should
+be passed to the Java compiler when building the front-end classes.
+
+
+Installing libjpeg-turbo
+------------------------
+
+You can use the build system to install libjpeg-turbo into a directory of your
+choosing (as opposed to creating an installer.)  To do this, add:
+
+    -DCMAKE_INSTALL_PREFIX={install_directory}
+
+to the cmake command line.
+
+For example,
+
+    cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_INSTALL_PREFIX=c:\libjpeg-turbo {source_directory}
+    nmake install
+
+will install the header files in c:\libjpeg-turbo\include, the library files
+in c:\libjpeg-turbo\lib, the DLL's in c:\libjpeg-turbo\bin, and the
+documentation in c:\libjpeg-turbo\doc.
+
+
+Build Recipes
+-------------
+
+
+### 64-bit MinGW Build on Cygwin
+
+    cd {build_directory}
+    CC=/usr/bin/x86_64-w64-mingw32-gcc \
+      cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
+      -DCMAKE_RC_COMPILER=/usr/bin/x86_64-w64-mingw32-windres.exe \
+      {source_directory}
+    make
+
+This produces a 64-bit build of libjpeg-turbo that does not depend on
+cygwin1.dll or other Cygwin DLL's.  The mingw64-x86\_64-gcc-core and
+mingw64-x86\_64-gcc-g++ packages (and their dependencies) must be installed.
+
+
+### 32-bit MinGW Build on Cygwin
+
+     cd {build_directory}
+     CC=/usr/bin/i686-w64-mingw32-gcc \
+       cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
+       -DCMAKE_RC_COMPILER=/usr/bin/i686-w64-mingw32-windres.exe \
+       {source_directory}
+     make
+
+This produces a 32-bit build of libjpeg-turbo that does not depend on
+cygwin1.dll or other Cygwin DLL's.  The mingw64-i686-gcc-core and
+mingw64-i686-gcc-g++ packages (and their dependencies) must be installed.
+
+
+### MinGW Build on Linux
+
+    cd {build_directory}
+    CC={mingw_binary_path}/i686-pc-mingw32-gcc \
+      cmake -G "Unix Makefiles" -DCMAKE_SYSTEM_NAME=Windows \
+      -DCMAKE_RC_COMPILER={mingw_binary_path}/i686-pc-mingw32-windres \
+      -DCMAKE_AR={mingw_binary_path}/i686-pc-mingw32-ar \
+      -DCMAKE_RANLIB={mingw_binary_path}/i686-pc-mingw32-ranlib \
+      {source_directory}
+    make
+
+
+Creating Release Packages
+=========================
+
+The following commands can be used to create various types of release packages:
+
+
+Unix/Linux
+----------
+
+    make rpm
+
+Create Red Hat-style binary RPM package.  Requires RPM v4 or later.
+
+    make srpm
+
+This runs `make dist` to create a pristine source tarball, then creates a
+Red Hat-style source RPM package from the tarball.  Requires RPM v4 or later.
+
+    make deb
+
+Create Debian-style binary package.  Requires dpkg.
+
+    make dmg
+
+Create Macintosh package/disk image.  This requires pkgbuild and
+productbuild, which are installed by default on OS X 10.7 and later and which
+can be obtained by installing Xcode 3.2.6 (with the "Unix Development"
+option) on OS X 10.6.  Packages built in this manner can be installed on OS X
+10.5 and later, but they must be built on OS X 10.6 or later.
+
+    make udmg [BUILDDIR32={32-bit build directory}]
+
+On 64-bit OS X systems, this creates a Macintosh package and disk image that
+contains universal i386/x86-64 binaries.  You should first configure a 32-bit
+out-of-tree build of libjpeg-turbo, then configure a 64-bit out-of-tree
+build, then run `make udmg` from the 64-bit build directory.  The build
+system will look for the 32-bit build under *{source_directory}*/osxx86 by
+default, but you can override this by setting the `BUILDDIR32` variable on the
+make command line as shown above.
+
+    make iosdmg [BUILDDIR32={32-bit build directory}] \
+      [BUILDDIRARMV6={ARMv6 build directory}] \
+      [BUILDDIRARMV7={ARMv7 build directory}] \
+      [BUILDDIRARMV7S={ARMv7s build directory}] \
+      [BUILDDIRARMV8={ARMv8 build directory}]
+
+On OS X systems, this creates a Macintosh package and disk image in which the
+libjpeg-turbo static libraries contain ARM architectures necessary to build
+iOS applications.  If building on an x86-64 system, the binaries will also
+contain the i386 architecture, as with `make udmg` above.  You should first
+configure ARMv6, ARMv7, ARMv7s, and/or ARMv8 out-of-tree builds of
+libjpeg-turbo (see "Building libjpeg-turbo for iOS" above.)  If you are
+building an x86-64 version of libjpeg-turbo, you should configure a 32-bit
+out-of-tree build as well.  Next, build libjpeg-turbo as you would normally,
+using an out-of-tree build.  When it is built, run `make iosdmg` from the
+build directory.  The build system will look for the ARMv6 build under
+*{source_directory}*/iosarmv6 by default, the ARMv7 build under
+*{source_directory}*/iosarmv7 by default, the ARMv7s build under
+*{source_directory}*/iosarmv7s by default, the ARMv8 build under
+*{source_directory}*/iosarmv8 by default, and (if applicable) the 32-bit build
+under *{source_directory}*/osxx86 by default, but you can override this by
+setting the `BUILDDIR32`, `BUILDDIRARMV6`, `BUILDDIRARMV7`, `BUILDDIRARMV7S`,
+and/or `BUILDDIRARMV8` variables on the `make` command line as shown above.
+
+NOTE: If including an ARMv8 build in the package, then you may need to use
+Xcode's version of lipo instead of the operating system's.  To do this, pass
+an argument of `LIPO="xcrun lipo"` on the make command line.
+
+    make cygwinpkg
+
+Build a Cygwin binary package.
+
+
+Windows
+-------
+
+If using NMake:
+
+    cd {build_directory}
+    nmake installer
+
+If using MinGW:
+
+    cd {build_directory}
+    make installer
+
+If using the Visual Studio IDE, build the "installer" project.
+
+The installer package (libjpeg-turbo[-gcc][64].exe) will be located under
+*{build_directory}*.  If building using the Visual Studio IDE, then the
+installer package will be located in a subdirectory with the same name as the
+configuration you built (such as *{build_directory}*\Debug\ or
+*{build_directory}*\Release\).
+
+Building a Windows installer requires the Nullsoft Install System
+(http://nsis.sourceforge.net/.)  makensis.exe should be in your `PATH`.
+
+
+Regression testing
+==================
+
+The most common way to test libjpeg-turbo is by invoking `make test` on
+Unix/Linux platforms or `ctest` on Windows platforms, once the build has
+completed.  This runs a series of tests to ensure that mathematical
+compatibility has been maintained between libjpeg-turbo and libjpeg v6b.  This
+also invokes the TurboJPEG unit tests, which ensure that the colorspace
+extensions, YUV encoding, decompression scaling, and other features of the
+TurboJPEG C and Java APIs are working properly (and, by extension, that the
+equivalent features of the underlying libjpeg API are also working.)
+
+Invoking `make testclean` or `nmake testclean` (if using NMake) or building
+the 'testclean' target (if using the Visual Studio IDE) will clean up the
+output images generated by `make test`.
+
+On Unix/Linux platforms, more extensive tests of the TurboJPEG C and Java
+wrappers can be run by invoking `make tjtest`.  These extended TurboJPEG tests
+essentially iterate through all of the available features of the TurboJPEG APIs
+that are not covered by the TurboJPEG unit tests (this includes the lossless
+transform options) and compare the images generated by each feature to images
+generated using the equivalent feature in the libjpeg API.  The extended
+TurboJPEG tests are meant to test for regressions in the TurboJPEG wrappers,
+not in the underlying libjpeg API library.
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..bfb7661
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,920 @@
+#
+# Setup
+#
+
+cmake_minimum_required(VERSION 2.8.8)
+# Use LINK_INTERFACE_LIBRARIES instead of INTERFACE_LINK_LIBRARIES
+if(POLICY CMP0022)
+  cmake_policy(SET CMP0022 OLD)
+endif()
+
+project(libjpeg-turbo C)
+set(VERSION 1.4.90)
+
+if(CYGWIN OR NOT CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")  # Unix-like host
+  execute_process(COMMAND "date" "+%Y%m%d" OUTPUT_VARIABLE BUILD)
+  string(REGEX REPLACE "\n" "" BUILD "${BUILD}")  # quoted: BUILD may be empty
+elseif(WIN32)
+  execute_process(COMMAND "wmic.exe" "os" "get" "LocalDateTime" OUTPUT_VARIABLE
+    BUILD)
+  string(REGEX REPLACE "[^0-9]" "" BUILD "${BUILD}")  # keep digits only
+  if (BUILD STREQUAL "")  # wmic produced no digits; fall back to DATE /T
+    execute_process(COMMAND "cmd.exe" "/C" "DATE" "/T" OUTPUT_VARIABLE BUILD)
+    string(REGEX REPLACE ".*[ ]([0-9]*)[/.]([0-9]*)[/.]([0-9]*).*" "\\3\\2\\1" BUILD "${BUILD}")
+  else()
+    string(SUBSTRING "${BUILD}" 0 8 BUILD)  # first 8 digits of the timestamp
+  endif()
+else()
+  message(FATAL_ERROR "Platform not supported by this build system.  Use autotools instead.")
+endif()
+
+# This does nothing except when using MinGW.  CMAKE_BUILD_TYPE has no meaning
+# in Visual Studio, and it always defaults to Debug when using NMake.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+message(STATUS "CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}")
+
+# This only works if building from the command line.  There is currently no way
+# to set a variable's value based on the build type when using Visual Studio.
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  set(BUILD "${BUILD}d")
+endif()
+
+message(STATUS "VERSION = ${VERSION}, BUILD = ${BUILD}")
+
+option(WITH_SIMD "Include SIMD extensions" TRUE)
+option(WITH_ARITH_ENC "Include arithmetic encoding support" TRUE)
+option(WITH_ARITH_DEC "Include arithmetic decoding support" TRUE)
+option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b)" FALSE)
+option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b)" FALSE)
+option(WITH_MEM_SRCDST "Include in-memory source/destination manager functions when emulating the libjpeg v6b or v7 API/ABI" TRUE)
+option(WITH_TURBOJPEG "Include the TurboJPEG wrapper library and associated test programs" TRUE)
+option(WITH_JAVA "Build Java wrapper for the TurboJPEG library" FALSE)
+option(WITH_12BIT "Encode/decode JPEG images with 12-bit samples (implies WITH_SIMD=0 WITH_TURBOJPEG=0 WITH_ARITH_ENC=0 WITH_ARITH_DEC=0)" FALSE)
+option(ENABLE_STATIC "Build static libraries" TRUE)
+option(ENABLE_SHARED "Build shared libraries" TRUE)
+
+if(WITH_12BIT)
+  set(WITH_SIMD FALSE)
+  set(WITH_TURBOJPEG FALSE)
+  set(WITH_JAVA FALSE)
+  set(WITH_ARITH_ENC FALSE)
+  set(WITH_ARITH_DEC FALSE)
+  set(BITS_IN_JSAMPLE 12)
+  message(STATUS "12-bit JPEG support enabled")
+else()
+  set(BITS_IN_JSAMPLE 8)
+endif()
+
+if(WITH_ARITH_ENC)
+  set(C_ARITH_CODING_SUPPORTED 1)
+  message(STATUS "Arithmetic encoding support enabled")
+else()
+  message(STATUS "Arithmetic encoding support disabled")
+endif()
+
+if(WITH_ARITH_DEC)
+  set(D_ARITH_CODING_SUPPORTED 1)
+  message(STATUS "Arithmetic decoding support enabled")
+else()
+  message(STATUS "Arithmetic decoding support disabled")
+endif()
+
+if(WITH_TURBOJPEG)
+  message(STATUS "TurboJPEG C wrapper enabled")
+else()
+  message(STATUS "TurboJPEG C wrapper disabled")
+endif()
+
+if(WITH_JAVA)
+  message(STATUS "TurboJPEG Java wrapper enabled")
+else()
+  message(STATUS "TurboJPEG Java wrapper disabled")
+endif()
+
+set(SO_AGE 0)
+if(WITH_MEM_SRCDST)
+  set(SO_AGE 1)
+endif()
+
+set(JPEG_LIB_VERSION 62)
+set(DLL_VERSION ${JPEG_LIB_VERSION})
+set(FULLVERSION ${DLL_VERSION}.${SO_AGE}.0)
+if(WITH_JPEG8)
+  set(JPEG_LIB_VERSION 80)
+  set(DLL_VERSION 8)
+  set(FULLVERSION ${DLL_VERSION}.0.2)
+  message(STATUS "Emulating libjpeg v8 API/ABI")
+elseif(WITH_JPEG7)
+  set(JPEG_LIB_VERSION 70)
+  set(DLL_VERSION 7)
+  set(FULLVERSION ${DLL_VERSION}.${SO_AGE}.0)
+  message(STATUS "Emulating libjpeg v7 API/ABI")
+endif(WITH_JPEG8)
+
+if(WITH_MEM_SRCDST)
+  set(MEM_SRCDST_SUPPORTED 1)
+  message(STATUS "In-memory source/destination managers enabled")
+else()
+  message(STATUS "In-memory source/destination managers disabled")
+endif()
+
+if(MSVC)
+  option(WITH_CRT_DLL
+    "Link all libjpeg-turbo libraries and executables with the C run-time DLL (msvcr*.dll) instead of the static C run-time library (libcmt*.lib.)  The default is to use the C run-time DLL only with the libraries and executables that need it."
+    FALSE)
+  if(NOT WITH_CRT_DLL)
+    # Use the static C run-time for all build types by rewriting /MD to /MT
+    foreach(var CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${var} MATCHES "/MD")  # only touch flag sets that select the CRT DLL
+        string(REGEX REPLACE "/MD" "/MT" ${var} "${${var}}")
+      endif()
+    endforeach()
+  endif()
+  add_definitions(-W3 -wd4996)  # warning level 3; disable MSVC warning C4996
+endif()
+
+# Detect whether compiler is 64-bit
+if(MSVC AND CMAKE_CL_64)
+  set(SIMD_X86_64 1)
+  set(64BIT 1)
+elseif(CMAKE_SIZEOF_VOID_P MATCHES 8)
+  set(SIMD_X86_64 1)
+  set(64BIT 1)
+endif()
+
+if(64BIT)
+  message(STATUS "64-bit build")
+else()
+  message(STATUS "32-bit build")
+endif()
+
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  if(MSVC)
+    set(CMAKE_INSTALL_PREFIX_DEFAULT ${CMAKE_PROJECT_NAME})
+  else()
+    set(CMAKE_INSTALL_PREFIX_DEFAULT ${CMAKE_PROJECT_NAME}-gcc)
+  endif()
+  if(64BIT)
+    set(CMAKE_INSTALL_PREFIX_DEFAULT ${CMAKE_INSTALL_PREFIX_DEFAULT}64)
+  endif()
+  set(CMAKE_INSTALL_PREFIX "c:/${CMAKE_INSTALL_PREFIX_DEFAULT}" CACHE PATH
+    "Directory into which to install libjpeg-turbo (default: c:/${CMAKE_INSTALL_PREFIX_DEFAULT})"
+    FORCE)
+endif()
+
+message(STATUS "Install directory = ${CMAKE_INSTALL_PREFIX}")
+
+configure_file(win/jconfig.h.in jconfig.h)
+configure_file(win/jconfigint.h.in jconfigint.h)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR})
+
+string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
+
+set(EFFECTIVE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "Compiler flags = ${EFFECTIVE_C_FLAGS}")
+
+set(EFFECTIVE_LD_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "Linker flags = ${EFFECTIVE_LD_FLAGS}")
+
+if(WITH_JAVA)
+  find_package(Java)
+  find_package(JNI)
+  if(DEFINED JAVACFLAGS)
+    message(STATUS "Java compiler flags = ${JAVACFLAGS}")
+  endif()
+endif()
+
+
+#
+# Targets
+#
+
+set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
+  jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c
+  jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c
+  jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c
+  jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c
+  jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c
+  jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
+
+if(WITH_ARITH_ENC OR WITH_ARITH_DEC)  # needed by either arithmetic codec
+  list(APPEND JPEG_SOURCES jaricom.c)
+endif()
+
+if(WITH_ARITH_ENC)  # arithmetic encoder
+  list(APPEND JPEG_SOURCES jcarith.c)
+endif()
+
+if(WITH_ARITH_DEC)  # arithmetic decoder
+  list(APPEND JPEG_SOURCES jdarith.c)
+endif()
+
+if(WITH_SIMD)
+  add_definitions(-DWITH_SIMD)
+  add_subdirectory(simd)
+  if(SIMD_X86_64)
+    set(JPEG_SOURCES ${JPEG_SOURCES} simd/jsimd_x86_64.c)
+  else()
+    set(JPEG_SOURCES ${JPEG_SOURCES} simd/jsimd_i386.c)
+  endif()
+  # This tells CMake that the "source" files haven't been generated yet
+  set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
+else()
+  set(JPEG_SOURCES ${JPEG_SOURCES} jsimd_none.c)
+  message(STATUS "Not using SIMD acceleration")
+endif()
+
+if(WITH_JAVA)
+  add_subdirectory(java)
+  set(ENABLE_SHARED TRUE)
+endif()
+
+if(ENABLE_SHARED)
+  add_subdirectory(sharedlib)
+endif()
+
+if(ENABLE_STATIC OR WITH_TURBOJPEG)
+  add_library(jpeg-static STATIC ${JPEG_SOURCES} ${SIMD_OBJS})
+  if(NOT MSVC)
+    set_target_properties(jpeg-static PROPERTIES OUTPUT_NAME jpeg)
+  endif()
+  if(WITH_SIMD)
+    add_dependencies(jpeg-static simd)
+  endif()
+endif()
+
+if(WITH_TURBOJPEG)
+  set(TURBOJPEG_SOURCES turbojpeg.c transupp.c jdatadst-tj.c jdatasrc-tj.c)
+  if(WITH_JAVA)
+    set(TURBOJPEG_SOURCES ${TURBOJPEG_SOURCES} turbojpeg-jni.c)
+    include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})
+  endif()
+
+  if(ENABLE_SHARED)
+    add_library(turbojpeg SHARED ${TURBOJPEG_SOURCES})
+    set_target_properties(turbojpeg PROPERTIES DEFINE_SYMBOL DLLDEFINE)
+    if(MINGW)
+      set_target_properties(turbojpeg PROPERTIES LINK_FLAGS -Wl,--kill-at)
+    endif()
+    target_link_libraries(turbojpeg jpeg-static)
+    set_target_properties(turbojpeg PROPERTIES LINK_INTERFACE_LIBRARIES "")
+
+    add_executable(tjunittest tjunittest.c tjutil.c)
+    target_link_libraries(tjunittest turbojpeg)
+
+    add_executable(tjbench tjbench.c bmp.c tjutil.c rdbmp.c rdppm.c wrbmp.c
+      wrppm.c)
+    target_link_libraries(tjbench turbojpeg jpeg-static)
+    set_property(TARGET tjbench PROPERTY COMPILE_FLAGS
+      "-DBMP_SUPPORTED -DPPM_SUPPORTED")
+  endif()
+
+  if(ENABLE_STATIC)
+    add_library(turbojpeg-static STATIC ${JPEG_SOURCES} ${SIMD_OBJS}
+      turbojpeg.c transupp.c jdatadst-tj.c jdatasrc-tj.c)
+    if(NOT MSVC)
+      set_target_properties(turbojpeg-static PROPERTIES OUTPUT_NAME turbojpeg)
+    endif()
+    if(WITH_SIMD)
+      add_dependencies(turbojpeg-static simd)
+    endif()
+
+    add_executable(tjunittest-static tjunittest.c tjutil.c)
+    target_link_libraries(tjunittest-static turbojpeg-static)
+
+    add_executable(tjbench-static tjbench.c bmp.c tjutil.c rdbmp.c rdppm.c
+      wrbmp.c wrppm.c)
+    target_link_libraries(tjbench-static turbojpeg-static jpeg-static)
+    set_property(TARGET tjbench-static PROPERTY COMPILE_FLAGS
+      "-DBMP_SUPPORTED -DPPM_SUPPORTED")
+  endif()
+endif()
+
+if(WITH_12BIT)
+  set(COMPILE_FLAGS "-DGIF_SUPPORTED -DPPM_SUPPORTED -DUSE_SETMODE")
+else()
+  set(COMPILE_FLAGS "-DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED -DTARGA_SUPPORTED -DUSE_SETMODE")
+  set(CJPEG_BMP_SOURCES rdbmp.c rdtarga.c)
+  set(DJPEG_BMP_SOURCES wrbmp.c wrtarga.c)
+endif()
+
+if(ENABLE_STATIC)
+  add_executable(cjpeg-static cjpeg.c cdjpeg.c rdgif.c rdppm.c rdswitch.c
+    ${CJPEG_BMP_SOURCES})
+  set_property(TARGET cjpeg-static PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
+  target_link_libraries(cjpeg-static jpeg-static)
+
+  add_executable(djpeg-static djpeg.c cdjpeg.c rdcolmap.c rdswitch.c wrgif.c
+    wrppm.c ${DJPEG_BMP_SOURCES})
+  set_property(TARGET djpeg-static PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
+  target_link_libraries(djpeg-static jpeg-static)
+
+  add_executable(jpegtran-static jpegtran.c cdjpeg.c rdswitch.c transupp.c)
+  target_link_libraries(jpegtran-static jpeg-static)
+  set_property(TARGET jpegtran-static PROPERTY COMPILE_FLAGS "-DUSE_SETMODE")
+endif()
+
+add_executable(rdjpgcom rdjpgcom.c)
+
+add_executable(wrjpgcom wrjpgcom.c)
+
+
+#
+# Tests
+#
+
+add_subdirectory(md5)
+
+if(MSVC_IDE)
+  set(OBJDIR "\${CTEST_CONFIGURATION_TYPE}/")
+else()
+  set(OBJDIR "")
+endif()
+
+enable_testing()
+
+if(WITH_12BIT)
+  set(TESTORIG testorig12.jpg)
+  set(MD5_JPEG_RGB_ISLOW 9620f424569594bb9242b48498ad801f)
+  set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
+  set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
+  set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
+  set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
+  set(MD5_JPEG_420_IFAST_Q100_PROG a1da220b5604081863a504297ed59e55)
+  set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
+  set(MD5_PPM_420M_Q100_IFAST 980a1a3c5bf9510022869d30b7d26566)
+  set(MD5_JPEG_GRAY_ISLOW 235c90707b16e2e069f37c888b2636d9)
+  set(MD5_PPM_GRAY_ISLOW 7213c10af507ad467da5578ca5ee1fca)
+  set(MD5_PPM_GRAY_ISLOW_RGB e96ee81c30a6ed422d466338bd3de65d)
+  set(MD5_JPEG_420S_IFAST_OPT 7af8e60be4d9c227ec63ac9b6630855e)
+  set(MD5_JPEG_3x2_FLOAT_PROG a8c17daf77b457725ec929e215b603f8)
+  set(MD5_PPM_3x2_FLOAT 42876ab9e5c2f76a87d08db5fbd57956)
+  set(MD5_PPM_420M_ISLOW_2_1 4ca6be2a6f326ff9eaab63e70a8259c0)
+  set(MD5_PPM_420M_ISLOW_15_8 12aa9f9534c1b3d7ba047322226365eb)
+  set(MD5_PPM_420M_ISLOW_13_8 f7e22817c7b25e1393e4ec101e9d4e96)
+  set(MD5_PPM_420M_ISLOW_11_8 800a16f9f4dc9b293197bfe11be10a82)
+  set(MD5_PPM_420M_ISLOW_9_8 06b7a92a9bc69f4dc36ec40f1937d55c)
+  set(MD5_PPM_420M_ISLOW_7_8 3ec444a14a4ab4eab88ffc49c48eca43)
+  set(MD5_PPM_420M_ISLOW_3_4 3e726b7ea872445b19437d1c1d4f0d93)
+  set(MD5_PPM_420M_ISLOW_5_8 a8a771abdc94301d20ffac119b2caccd)
+  set(MD5_PPM_420M_ISLOW_1_2 b419124dd5568b085787234866102866)
+  set(MD5_PPM_420M_ISLOW_3_8 343d19015531b7bbe746124127244fa8)
+  set(MD5_PPM_420M_ISLOW_1_4 35fd59d866e44659edfa3c18db2a3edb)
+  set(MD5_PPM_420M_ISLOW_1_8 ccaed48ac0aedefda5d4abe4013f4ad7)
+  set(MD5_PPM_420_ISLOW_SKIP15_31 86664cd9dc956536409e44e244d20a97)
+  set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 452a21656115a163029cfba5c04fa76a)
+  set(MD5_PPM_444_ISLOW_SKIP1_6 ef63901f71ef7a75cd78253fc0914f84)
+  set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 15b173fb5872d9575572fbcc1b05956f)
+  set(MD5_JPEG_CROP cdb35ff4b4519392690ea040c56ea99c)
+else()
+  set(TESTORIG testorig.jpg)
+  set(MD5_JPEG_RGB_ISLOW 768e970dd57b340ff1b83c9d3d47c77b)
+  set(MD5_PPM_RGB_ISLOW 00a257f5393fef8821f2b88ac7421291)
+  set(MD5_BMP_RGB_ISLOW_565 f07d2e75073e4bb10f6c6f4d36e2e3be)
+  set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
+  set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
+  set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
+  set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
+  set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
+  set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
+  set(MD5_JPEG_420_IFAST_Q100_PROG 990cbe0329c882420a2094da7e5adade)
+  set(MD5_PPM_420_Q100_IFAST 5a732542015c278ff43635e473a8a294)
+  set(MD5_PPM_420M_Q100_IFAST ff692ee9323a3b424894862557c092f1)
+  set(MD5_JPEG_GRAY_ISLOW 72b51f894b8f4a10b3ee3066770aa38d)
+  set(MD5_PPM_GRAY_ISLOW 8d3596c56eace32f205deccc229aa5ed)
+  set(MD5_PPM_GRAY_ISLOW_RGB 116424ac07b79e5e801f00508eab48ec)
+  set(MD5_BMP_GRAY_ISLOW_565 12f78118e56a2f48b966f792fedf23cc)
+  set(MD5_BMP_GRAY_ISLOW_565D bdbbd616441a24354c98553df5dc82db)
+  set(MD5_JPEG_420S_IFAST_OPT 388708217ac46273ca33086b22827ed8)
+  if(WITH_SIMD)
+    set(MD5_JPEG_3x2_FLOAT_PROG 343e3f8caf8af5986ebaf0bdc13b5c71)
+    set(MD5_PPM_3x2_FLOAT 1a75f36e5904d6fc3a85a43da9ad89bb)
+  else()
+    set(MD5_JPEG_3x2_FLOAT_PROG 9bca803d2042bd1eb03819e2bf92b3e5)
+    set(MD5_PPM_3x2_FLOAT f6bfab038438ed8f5522fbd33595dcdc)
+  endif()
+  set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
+  set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
+  set(MD5_PPM_420M_IFAST_ARI 72b59a99bcf1de24c5b27d151bde2437)
+  set(MD5_JPEG_420_ISLOW 9a68f56bc76e466aa7e52f415d0f4a5f)
+  set(MD5_PPM_420M_ISLOW_2_1 9f9de8c0612f8d06869b960b05abf9c9)
+  set(MD5_PPM_420M_ISLOW_15_8 b6875bc070720b899566cc06459b63b7)
+  set(MD5_PPM_420M_ISLOW_13_8 bc3452573c8152f6ae552939ee19f82f)
+  set(MD5_PPM_420M_ISLOW_11_8 d8cc73c0aaacd4556569b59437ba00a5)
+  set(MD5_PPM_420M_ISLOW_9_8 d25e61bc7eac0002f5b393aa223747b6)
+  set(MD5_PPM_420M_ISLOW_7_8 ddb564b7c74a09494016d6cd7502a946)
+  set(MD5_PPM_420M_ISLOW_3_4 8ed8e68808c3fbc4ea764fc9d2968646)
+  set(MD5_PPM_420M_ISLOW_5_8 a3363274999da2366a024efae6d16c9b)
+  set(MD5_PPM_420M_ISLOW_1_2 e692a315cea26b988c8e8b29a5dbcd81)
+  set(MD5_PPM_420M_ISLOW_3_8 79eca9175652ced755155c90e785a996)
+  set(MD5_PPM_420M_ISLOW_1_4 79cd778f8bf1a117690052cacdd54eca)
+  set(MD5_PPM_420M_ISLOW_1_8 391b3d4aca640c8567d6f8745eb2142f)
+  set(MD5_BMP_420_ISLOW_256 4980185e3776e89bd931736e1cddeee6)
+  set(MD5_BMP_420_ISLOW_565 bf9d13e16c4923b92e1faa604d7922cb)
+  set(MD5_BMP_420_ISLOW_565D 6bde71526acc44bcff76f696df8638d2)
+  set(MD5_BMP_420M_ISLOW_565 8dc0185245353cfa32ad97027342216f)
+  set(MD5_BMP_420M_ISLOW_565D d1be3a3339166255e76fa50a0d70d73e)
+  set(MD5_PPM_420_ISLOW_SKIP15_31 c4c65c1e43d7275cd50328a61e6534f0)
+  set(MD5_PPM_420_ISLOW_ARI_SKIP16_139 087c6b123db16ac00cb88c5b590bb74a)
+  set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 26eb36ccc7d1f0cb80cdabb0ac8b5d99)
+  set(MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4 886c6775af22370257122f8b16207e6d)
+  set(MD5_PPM_444_ISLOW_SKIP1_6 5606f86874cf26b8fcee1117a0a436a6)
+  set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 db87dc7ce26bcdc7a6b56239ce2b9d6c)
+  set(MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0 cb57b32bd6d03e35432362f7bf184b6d)
+  set(MD5_JPEG_CROP b4197f377e621c4e9b1d20471432610d)
+endif()
+
+if(WITH_JAVA)
+  add_test(TJUnitTest
+    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest)
+  add_test(TJUnitTest-yuv
+    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -yuv)
+  add_test(TJUnitTest-yuv-nopad
+    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -yuv -noyuvpad)
+  add_test(TJUnitTest-bi
+    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -bi)
+  add_test(TJUnitTest-bi-yuv
+    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -bi -yuv)
+  add_test(TJUnitTest-bi-yuv-nopad
+    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -bi -yuv -noyuvpad)
+endif()
+
+set(TEST_LIBTYPES "")  # library flavors the regression tests iterate over
+if(ENABLE_SHARED)
+  list(APPEND TEST_LIBTYPES shared)
+endif()
+if(ENABLE_STATIC)
+  list(APPEND TEST_LIBTYPES static)
+endif()
+
+set(TESTIMAGES ${CMAKE_SOURCE_DIR}/testimages)
+set(MD5CMP ${CMAKE_CURRENT_BINARY_DIR}/md5/md5cmp)
+if(CMAKE_CROSSCOMPILING)
+  file(RELATIVE_PATH TESTIMAGES ${CMAKE_CURRENT_BINARY_DIR} ${TESTIMAGES})
+  file(RELATIVE_PATH MD5CMP ${CMAKE_CURRENT_BINARY_DIR} ${MD5CMP})
+endif()
+
+foreach(libtype ${TEST_LIBTYPES})
+  set(dir "")  # static executables live in the top-level build directory
+  set(suffix -static)  # static executables/tests carry a -static suffix
+  if(libtype STREQUAL "shared")
+    set(dir sharedlib/)
+    set(suffix "")  # set explicitly; do not rely on suffix being unset
+  endif()
+  if(WITH_TURBOJPEG)
+    add_test(tjunittest${suffix} tjunittest${suffix})
+    add_test(tjunittest${suffix}-alloc tjunittest${suffix} -alloc)
+    add_test(tjunittest${suffix}-yuv tjunittest${suffix} -yuv)
+    add_test(tjunittest${suffix}-yuv-alloc tjunittest${suffix} -yuv -alloc)
+    add_test(tjunittest${suffix}-yuv-nopad tjunittest${suffix} -yuv -noyuvpad)
+  endif()
+
+  # These tests are carefully chosen to provide full coverage of as many of the
+  # underlying algorithms as possible (including all of the SIMD-accelerated
+  # ones.)
+
+  # CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
+  add_test(cjpeg${suffix}-rgb-islow
+    ${dir}cjpeg${suffix} -rgb -dct int
+      -outfile testout_rgb_islow.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(cjpeg${suffix}-rgb-islow-cmp
+    ${MD5CMP} ${MD5_JPEG_RGB_ISLOW} testout_rgb_islow.jpg)
+
+  # CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
+  add_test(djpeg${suffix}-rgb-islow
+    ${dir}djpeg${suffix} -dct int -ppm
+      -outfile testout_rgb_islow.ppm testout_rgb_islow.jpg)
+  add_test(djpeg${suffix}-rgb-islow-cmp
+    ${MD5CMP} ${MD5_PPM_RGB_ISLOW} testout_rgb_islow.ppm)
+
+  if(NOT WITH_12BIT)
+    # CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-rgb-islow-565
+      ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
+        -outfile testout_rgb_islow_565.bmp testout_rgb_islow.jpg)
+    add_test(djpeg${suffix}-rgb-islow-565-cmp
+      ${MD5CMP} ${MD5_BMP_RGB_ISLOW_565} testout_rgb_islow_565.bmp)
+
+    # CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-rgb-islow-565D
+      ${dir}djpeg${suffix} -dct int -rgb565 -bmp
+        -outfile testout_rgb_islow_565D.bmp testout_rgb_islow.jpg)
+    add_test(djpeg${suffix}-rgb-islow-565D-cmp
+      ${MD5CMP} ${MD5_BMP_RGB_ISLOW_565D} testout_rgb_islow_565D.bmp)
+  endif()
+
+  # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
+  add_test(cjpeg${suffix}-422-ifast-opt
+    ${dir}cjpeg${suffix} -sample 2x1 -dct fast -opt
+      -outfile testout_422_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(cjpeg${suffix}-422-ifast-opt-cmp
+    ${MD5CMP} ${MD5_JPEG_422_IFAST_OPT} testout_422_ifast_opt.jpg)
+
+  # CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
+  add_test(djpeg${suffix}-422-ifast
+    ${dir}djpeg${suffix} -dct fast
+      -outfile testout_422_ifast.ppm testout_422_ifast_opt.jpg)
+  add_test(djpeg${suffix}-422-ifast-cmp
+    ${MD5CMP} ${MD5_PPM_422_IFAST} testout_422_ifast.ppm)
+
+  # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
+  add_test(djpeg${suffix}-422m-ifast
+    ${dir}djpeg${suffix} -dct fast -nosmooth
+      -outfile testout_422m_ifast.ppm testout_422_ifast_opt.jpg)
+  add_test(djpeg${suffix}-422m-ifast-cmp
+    ${MD5CMP} ${MD5_PPM_422M_IFAST} testout_422m_ifast.ppm)
+
+  if(NOT WITH_12BIT)
+    # CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-422m-ifast-565
+      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -dither none -bmp
+        -outfile testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg)
+    add_test(djpeg${suffix}-422m-ifast-565-cmp
+      ${MD5CMP} ${MD5_BMP_422M_IFAST_565} testout_422m_ifast_565.bmp)
+
+    # CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-422m-ifast-565D
+      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -bmp
+        -outfile testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg)
+    add_test(djpeg${suffix}-422m-ifast-565D-cmp
+      ${MD5CMP} ${MD5_BMP_422M_IFAST_565D} testout_422m_ifast_565D.bmp)
+  endif()
+
+  # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
+  add_test(cjpeg${suffix}-420-q100-ifast-prog
+    ${dir}cjpeg${suffix} -sample 2x2 -quality 100 -dct fast -prog
+      -outfile testout_420_q100_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(cjpeg${suffix}-420-q100-ifast-prog-cmp
+    ${MD5CMP} ${MD5_JPEG_420_IFAST_Q100_PROG} testout_420_q100_ifast_prog.jpg)
+
+  # CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
+  add_test(djpeg${suffix}-420-q100-ifast-prog
+    ${dir}djpeg${suffix} -dct fast
+      -outfile testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg)
+  add_test(djpeg${suffix}-420-q100-ifast-prog-cmp
+    ${MD5CMP} ${MD5_PPM_420_Q100_IFAST} testout_420_q100_ifast.ppm)
+
+  # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
+  add_test(djpeg${suffix}-420m-q100-ifast-prog
+    ${dir}djpeg${suffix} -dct fast -nosmooth
+      -outfile testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg)
+  add_test(djpeg${suffix}-420m-q100-ifast-prog-cmp
+    ${MD5CMP} ${MD5_PPM_420M_Q100_IFAST} testout_420m_q100_ifast.ppm)
+
+  # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
+  add_test(cjpeg${suffix}-gray-islow
+    ${dir}cjpeg${suffix} -gray -dct int
+      -outfile testout_gray_islow.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(cjpeg${suffix}-gray-islow-cmp
+    ${MD5CMP} ${MD5_JPEG_GRAY_ISLOW} testout_gray_islow.jpg)
+
+  # CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
+  add_test(djpeg${suffix}-gray-islow
+    ${dir}djpeg${suffix} -dct int
+      -outfile testout_gray_islow.ppm testout_gray_islow.jpg)
+  add_test(djpeg${suffix}-gray-islow-cmp
+    ${MD5CMP} ${MD5_PPM_GRAY_ISLOW} testout_gray_islow.ppm)
+
+  # CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
+  add_test(djpeg${suffix}-gray-islow-rgb
+    ${dir}djpeg${suffix} -dct int -rgb
+      -outfile testout_gray_islow_rgb.ppm testout_gray_islow.jpg)
+  add_test(djpeg${suffix}-gray-islow-rgb-cmp
+    ${MD5CMP} ${MD5_PPM_GRAY_ISLOW_RGB} testout_gray_islow_rgb.ppm)
+
+  if(NOT WITH_12BIT)
+    # CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-gray-islow-565
+      ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
+        -outfile testout_gray_islow_565.bmp testout_gray_islow.jpg)
+    add_test(djpeg${suffix}-gray-islow-565-cmp
+      ${MD5CMP} ${MD5_BMP_GRAY_ISLOW_565} testout_gray_islow_565.bmp)
+
+    # CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-gray-islow-565D
+      ${dir}djpeg${suffix} -dct int -rgb565 -bmp
+        -outfile testout_gray_islow_565D.bmp testout_gray_islow.jpg)
+    add_test(djpeg${suffix}-gray-islow-565D-cmp
+      ${MD5CMP} ${MD5_BMP_GRAY_ISLOW_565D} testout_gray_islow_565D.bmp)
+  endif()
+
+  # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
+  # ENT: 2-pass huff
+  add_test(cjpeg${suffix}-420s-ifast-opt
+    ${dir}cjpeg${suffix} -sample 2x2 -smooth 1 -dct int -opt
+      -outfile testout_420s_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(cjpeg${suffix}-420s-ifast-opt-cmp
+    ${MD5CMP} ${MD5_JPEG_420S_IFAST_OPT} testout_420s_ifast_opt.jpg)
+
+  # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
+  add_test(cjpeg${suffix}-3x2-float-prog
+    ${dir}cjpeg${suffix} -sample 3x2 -dct float -prog
+      -outfile testout_3x2_float_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(cjpeg${suffix}-3x2-float-prog-cmp
+    ${MD5CMP} ${MD5_JPEG_3x2_FLOAT_PROG} testout_3x2_float_prog.jpg)
+
+  # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
+  add_test(djpeg${suffix}-3x2-float-prog
+    ${dir}djpeg${suffix} -dct float
+      -outfile testout_3x2_float.ppm testout_3x2_float_prog.jpg)
+  add_test(djpeg${suffix}-3x2-float-prog-cmp
+    ${MD5CMP} ${MD5_PPM_3x2_FLOAT} testout_3x2_float.ppm)
+
+  if(WITH_ARITH_ENC)
+    # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
+    add_test(cjpeg${suffix}-420-islow-ari
+      ${dir}cjpeg${suffix} -dct int -arithmetic
+        -outfile testout_420_islow_ari.jpg ${TESTIMAGES}/testorig.ppm)
+    add_test(cjpeg${suffix}-420-islow-ari-cmp
+      ${MD5CMP} ${MD5_JPEG_420_ISLOW_ARI} testout_420_islow_ari.jpg)
+
+    add_test(jpegtran${suffix}-420-islow-ari
+      ${dir}jpegtran${suffix} -arithmetic
+        -outfile testout_420_islow_ari.jpg ${TESTIMAGES}/testimgint.jpg)
+    add_test(jpegtran${suffix}-420-islow-ari-cmp
+      ${MD5CMP} ${MD5_JPEG_420_ISLOW_ARI} testout_420_islow_ari.jpg)
+
+    # CC: RGB->YCC  SAMP: fullsize  FDCT: islow  ENT: prog arith
+    add_test(cjpeg${suffix}-444-islow-progari
+      ${dir}cjpeg${suffix} -sample 1x1 -dct int -prog -arithmetic
+        -outfile testout_444_islow_progari.jpg ${TESTIMAGES}/testorig.ppm)
+    add_test(cjpeg${suffix}-444-islow-progari-cmp
+      ${MD5CMP} ${MD5_JPEG_444_ISLOW_PROGARI} testout_444_islow_progari.jpg)
+  endif()
+
+  if(WITH_ARITH_DEC)
+    # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
+    add_test(djpeg${suffix}-420m-ifast-ari
+      ${dir}djpeg${suffix} -fast -ppm
+        -outfile testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg)
+    add_test(djpeg${suffix}-420m-ifast-ari-cmp
+      ${MD5CMP} ${MD5_PPM_420M_IFAST_ARI} testout_420m_ifast_ari.ppm)
+
+    add_test(jpegtran${suffix}-420-islow
+      ${dir}jpegtran${suffix}
+        -outfile testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg)
+    add_test(jpegtran${suffix}-420-islow-cmp
+      ${MD5CMP} ${MD5_JPEG_420_ISLOW} testout_420_islow.jpg)
+  endif()
+
+  # 2/1--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
+  # 15/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 15x15 islow  ENT: huff
+  # 13/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 13x13 islow  ENT: huff
+  # 11/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 11x11 islow  ENT: huff
+  # 9/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 9x9 islow  ENT: huff
+  # 7/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 7x7 islow/14x14 islow
+  #         ENT: huff
+  # 3/4--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 6x6 islow/12x12 islow
+  #         ENT: huff
+  # 5/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 5x5 islow/10x10 islow
+  #         ENT: huff
+  # 1/2--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 4x4 islow/8x8 islow
+  #         ENT: huff
+  # 3/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 3x3 islow/6x6 islow
+  #         ENT: huff
+  # 1/4--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 2x2 islow/4x4 islow
+  #         ENT: huff
+  # 1/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 1x1 islow/2x2 islow
+  #         ENT: huff
+  foreach(scale 2_1 15_8 13_8 11_8 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8)
+    string(REGEX REPLACE "_" "/" scalearg ${scale})
+    add_test(djpeg${suffix}-420m-islow-${scale}
+      ${dir}djpeg${suffix} -dct int -scale ${scalearg} -nosmooth -ppm
+        -outfile testout_420m_islow_${scale}.ppm ${TESTIMAGES}/${TESTORIG})
+    add_test(djpeg${suffix}-420m-islow-${scale}-cmp
+      ${MD5CMP} ${MD5_PPM_420M_ISLOW_${scale}} testout_420m_islow_${scale}.ppm)
+  endforeach()
+
+  if(NOT WITH_12BIT)
+    # CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-420-islow-256
+      ${dir}djpeg${suffix} -dct int -colors 256 -bmp
+        -outfile testout_420_islow_256.bmp ${TESTIMAGES}/${TESTORIG})
+    add_test(djpeg${suffix}-420-islow-256-cmp
+      ${MD5CMP} ${MD5_BMP_420_ISLOW_256} testout_420_islow_256.bmp)
+
+    # CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-420-islow-565
+      ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
+        -outfile testout_420_islow_565.bmp ${TESTIMAGES}/${TESTORIG})
+    add_test(djpeg${suffix}-420-islow-565-cmp
+      ${MD5CMP} ${MD5_BMP_420_ISLOW_565} testout_420_islow_565.bmp)
+
+    # CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-420-islow-565D
+      ${dir}djpeg${suffix} -dct int -rgb565 -bmp
+        -outfile testout_420_islow_565D.bmp ${TESTIMAGES}/${TESTORIG})
+    add_test(djpeg${suffix}-420-islow-565D-cmp
+      ${MD5CMP} ${MD5_BMP_420_ISLOW_565D} testout_420_islow_565D.bmp)
+
+    # CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-420m-islow-565
+      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -dither none -bmp
+        -outfile testout_420m_islow_565.bmp ${TESTIMAGES}/${TESTORIG})
+    add_test(djpeg${suffix}-420m-islow-565-cmp
+      ${MD5CMP} ${MD5_BMP_420M_ISLOW_565} testout_420m_islow_565.bmp)
+
+    # CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
+    add_test(djpeg${suffix}-420m-islow-565D
+      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -bmp
+        -outfile testout_420m_islow_565D.bmp ${TESTIMAGES}/${TESTORIG})
+    add_test(djpeg${suffix}-420m-islow-565D-cmp
+      ${MD5CMP} ${MD5_BMP_420M_ISLOW_565D} testout_420m_islow_565D.bmp)
+  endif()
+
+  # Partial decode tests.  These tests are designed to cover all of the
+  # possible code paths in jpeg_skip_scanlines().
+
+  # Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
+  add_test(djpeg${suffix}-420-islow-skip15_31
+    ${dir}djpeg${suffix} -dct int -skip 15,31 -ppm
+      -outfile testout_420_islow_skip15,31.ppm ${TESTIMAGES}/${TESTORIG})
+  add_test(djpeg${suffix}-420-islow-skip15_31-cmp
+    ${MD5CMP} ${MD5_PPM_420_ISLOW_SKIP15_31} testout_420_islow_skip15,31.ppm)
+
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
+  if(WITH_ARITH_DEC)  # the input image is arithmetic-coded
+    add_test(djpeg${suffix}-420-islow-ari-skip16_139
+      ${dir}djpeg${suffix} -dct int -skip 16,139 -ppm
+        -outfile testout_420_islow_ari_skip16,139.ppm
+        ${TESTIMAGES}/testimgari.jpg)
+    add_test(djpeg${suffix}-420-islow-ari-skip16_139-cmp
+      ${MD5CMP} ${MD5_PPM_420_ISLOW_ARI_SKIP16_139}
+        testout_420_islow_ari_skip16,139.ppm)  # named <producer>-cmp
+  endif()
+
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
+  add_test(cjpeg${suffix}-420-islow-prog
+    ${dir}cjpeg${suffix} -dct int -prog
+      -outfile testout_420_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(djpeg${suffix}-420-islow-prog-crop62x62_71_71
+    ${dir}djpeg${suffix} -dct int -crop 62x62+71+71 -ppm
+      -outfile testout_420_islow_prog_crop62x62,71,71.ppm
+      testout_420_islow_prog.jpg)
+  add_test(djpeg${suffix}-420-islow-prog-crop62x62_71_71-cmp
+    ${MD5CMP} ${MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71}
+      testout_420_islow_prog_crop62x62,71,71.ppm)
+
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
+  if(WITH_ARITH_DEC)
+    add_test(djpeg${suffix}-420-islow-ari-crop53x53_4_4
+      ${dir}djpeg${suffix} -dct int -crop 53x53+4+4 -ppm
+        -outfile testout_420_islow_ari_crop53x53,4,4.ppm
+        ${TESTIMAGES}/testimgari.jpg)
+    add_test(djpeg${suffix}-420-islow-ari-crop53x53_4_4-cmp
+      ${MD5CMP} ${MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4}
+        testout_420_islow_ari_crop53x53,4,4.ppm)
+  endif()
+
+  # Context rows: No   Intra-iMCU row: Yes  ENT: huff
+  add_test(cjpeg${suffix}-444-islow
+    ${dir}cjpeg${suffix} -dct int -sample 1x1
+      -outfile testout_444_islow.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(djpeg${suffix}-444-islow-skip1_6
+    ${dir}djpeg${suffix} -dct int -skip 1,6 -ppm
+      -outfile testout_444_islow_skip1,6.ppm testout_444_islow.jpg)
+  add_test(djpeg${suffix}-444-islow-skip1_6-cmp
+    ${MD5CMP} ${MD5_PPM_444_ISLOW_SKIP1_6} testout_444_islow_skip1,6.ppm)
+
+  # Context rows: No   Intra-iMCU row: No   ENT: prog huff
+  add_test(cjpeg${suffix}-444-islow-prog
+    ${dir}cjpeg${suffix} -dct int -prog -sample 1x1
+      -outfile testout_444_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_test(djpeg${suffix}-444-islow-prog-crop98x98_13_13
+    ${dir}djpeg${suffix} -dct int -crop 98x98+13+13 -ppm
+      -outfile testout_444_islow_prog_crop98x98,13,13.ppm
+      testout_444_islow_prog.jpg)
+  add_test(djpeg${suffix}-444-islow-prog-crop98x98_13_13-cmp
+    ${MD5CMP} ${MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13}
+      testout_444_islow_prog_crop98x98,13,13.ppm)
+
+  # Context rows: No   Intra-iMCU row: No   ENT: arith
+  if(WITH_ARITH_ENC)
+    add_test(cjpeg${suffix}-444-islow-ari
+      ${dir}cjpeg${suffix} -dct int -arithmetic -sample 1x1
+        -outfile testout_444_islow_ari.jpg ${TESTIMAGES}/testorig.ppm)
+    if(WITH_ARITH_DEC)
+      add_test(djpeg${suffix}-444-islow-ari-crop37x37_0_0
+        ${dir}djpeg${suffix} -dct int -crop 37x37+0+0 -ppm
+          -outfile testout_444_islow_ari_crop37x37,0,0.ppm
+          testout_444_islow_ari.jpg)
+      add_test(djpeg${suffix}-444-islow-ari-crop37x37_0_0-cmp
+        ${MD5CMP} ${MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0}
+          testout_444_islow_ari_crop37x37,0,0.ppm)
+    endif()
+  endif()
+
+  add_test(jpegtran${suffix}-crop
+    ${dir}jpegtran${suffix} -crop 120x90+20+50 -transpose -perfect
+      -outfile testout_crop.jpg ${TESTIMAGES}/${TESTORIG})
+  add_test(jpegtran${suffix}-crop-cmp
+    ${MD5CMP} ${MD5_JPEG_CROP} testout_crop.jpg)
+
+endforeach()
+
+# Use CMake's script mode (-P) to run the test-output cleanup script.
+add_custom_target(testclean COMMAND ${CMAKE_COMMAND} -P
+  ${CMAKE_SOURCE_DIR}/cmakescripts/testclean.cmake)
+
+#
+# Installer
+#
+
+if(MSVC)
+  set(INST_PLATFORM "Visual C++")
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-vc)
+  set(INST_REG_NAME ${CMAKE_PROJECT_NAME})
+elseif(MINGW)
+  set(INST_PLATFORM GCC)
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-gcc)
+  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-gcc)
+  set(INST_DEFS -DGCC)
+endif()
+
+if(64BIT)
+  set(INST_PLATFORM "${INST_PLATFORM} 64-bit")
+  set(INST_NAME ${INST_NAME}64)
+  set(INST_REG_NAME ${INST_REG_NAME}64)  # keep the per-toolchain name set above
+  set(INST_DEFS ${INST_DEFS} -DWIN64)
+endif()
+
+if(WITH_JAVA)
+  set(INST_DEFS ${INST_DEFS} -DJAVA)
+endif()
+
+if(MSVC_IDE)
+  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=${CMAKE_CFG_INTDIR}\\")
+else()
+  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=")
+endif()
+
+STRING(REGEX REPLACE "/" "\\\\" INST_DIR ${CMAKE_INSTALL_PREFIX})
+
+configure_file(release/libjpeg-turbo.nsi.in libjpeg-turbo.nsi @ONLY)
+
+if(WITH_JAVA)
+  set(JAVA_DEPEND java)
+endif()
+add_custom_target(installer
+  makensis -nocd ${INST_DEFS} libjpeg-turbo.nsi
+  DEPENDS jpeg jpeg-static turbojpeg turbojpeg-static rdjpgcom wrjpgcom
+    cjpeg djpeg jpegtran tjbench ${JAVA_DEPEND}
+  SOURCES libjpeg-turbo.nsi)
+
+if(WITH_TURBOJPEG)
+  if(ENABLE_SHARED)
+    install(TARGETS turbojpeg tjbench
+      ARCHIVE DESTINATION lib
+      LIBRARY DESTINATION lib
+      RUNTIME DESTINATION bin)
+  endif()
+  if(ENABLE_STATIC)
+    install(TARGETS turbojpeg-static ARCHIVE DESTINATION lib)
+    if(NOT ENABLE_SHARED)
+      install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/tjbench-static.exe
+        DESTINATION bin RENAME tjbench.exe)
+    endif()
+  endif()
+  install(FILES ${CMAKE_SOURCE_DIR}/turbojpeg.h DESTINATION include)
+endif()
+
+if(ENABLE_STATIC)
+  install(TARGETS jpeg-static ARCHIVE DESTINATION lib)
+  if(NOT ENABLE_SHARED)
+    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/cjpeg-static.exe
+      DESTINATION bin RENAME cjpeg.exe)
+    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/djpeg-static.exe
+      DESTINATION bin RENAME djpeg.exe)
+    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/jpegtran-static.exe
+      DESTINATION bin RENAME jpegtran.exe)
+  endif()
+endif()
+
+install(TARGETS rdjpgcom wrjpgcom RUNTIME DESTINATION bin)
+
+install(FILES ${CMAKE_SOURCE_DIR}/README.ijg ${CMAKE_SOURCE_DIR}/README.md
+  ${CMAKE_SOURCE_DIR}/example.c ${CMAKE_SOURCE_DIR}/libjpeg.txt
+  ${CMAKE_SOURCE_DIR}/structure.txt ${CMAKE_SOURCE_DIR}/usage.txt
+  ${CMAKE_SOURCE_DIR}/wizard.txt
+  DESTINATION doc)
+
+install(FILES ${CMAKE_BINARY_DIR}/jconfig.h ${CMAKE_SOURCE_DIR}/jerror.h
+  ${CMAKE_SOURCE_DIR}/jmorecfg.h ${CMAKE_SOURCE_DIR}/jpeglib.h
+  DESTINATION include)
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 3ec6c18..4062c69 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -1,3 +1,407 @@
+1.5 beta1
+=========
+
+[1] Added full SIMD acceleration for PowerPC platforms using AltiVec VMX
+(128-bit SIMD) instructions.  Although the performance of libjpeg-turbo on
+PowerPC was already good, due to the increased number of registers available
+to the compiler vs. x86, it was still possible to speed up compression by about
+3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the
+use of AltiVec instructions.
+
+[2] Added two new libjpeg API functions (jpeg_skip_scanlines() and
+jpeg_crop_scanline()) that can be used to partially decode a JPEG image.  See
+libjpeg.txt for more details.
+
+[3] The TJCompressor and TJDecompressor classes in the TurboJPEG Java API now
+implement the Closeable interface, so those classes can be used with a
+try-with-resources statement.
+
+[4] The TurboJPEG Java classes now throw unchecked idiomatic exceptions
+(IllegalArgumentException, IllegalStateException) for unrecoverable errors
+caused by incorrect API usage, and those classes throw a new checked exception
+type (TJException) for errors that are passed through from the C library.
+
+[5] Source buffers for the TurboJPEG C API functions, as well as the
+jpeg_mem_src() function in the libjpeg API, are now declared as const pointers.
+This facilitates passing read-only buffers to those functions and assures the
+caller that the source buffer will not be modified.  This should not create any
+backward API or ABI incompatibilities with prior libjpeg-turbo releases.
+
+[6] The MIPS DSPr2 SIMD code can now be compiled to support either FR=0 or FR=1
+FPUs.
+
+[7] Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers.  Most of these issues affected only
+32-bit code, and none of them was known to pose a security threat, but removing
+the warnings makes it easier to detect actual security issues, should they
+arise in the future.
+
+[8] Removed the unnecessary .arch directive from the ARM64 NEON SIMD code.
+This directive was preventing the code from assembling using the clang
+integrated assembler.
+
+[9] Fixed a regression caused by 1.4.1[6] that prevented 32-bit and 64-bit
+libjpeg-turbo RPMs from being installed simultaneously on recent Red Hat/Fedora
+distributions.  This was due to the addition of a macro in jconfig.h that
+allows the Huffman codec to determine the word size at compile time.  Since
+that macro differs between 32-bit and 64-bit builds, this caused a conflict
+between the i386 and x86_64 RPMs (any differing files, other than executables,
+are not allowed when 32-bit and 64-bit RPMs are installed simultaneously.)
+Since the macro is used only internally, it has been moved into jconfigint.h.
+
+[10] The x86-64 SIMD code can now be disabled at run time by setting the
+JSIMD_FORCENONE environment variable to 1 (the other SIMD implementations
+already had this capability.)
+
+[11] Added a new command-line argument to TJBench (-nowrite) that prevents the
+benchmark from outputting any images.  This removes any potential operating
+system overhead that might be caused by lazy writes to disk and thus improves
+the consistency of the performance measurements.
+
+[12] Added SIMD acceleration for Huffman encoding on SSE2-capable x86 and
+x86-64 platforms.  This speeds up the compression of full-color JPEGs by about
+10-15% on average (relative to libjpeg-turbo 1.4.x) when using modern Intel and
+AMD CPUs.  Additionally, this works around an issue in the clang optimizer that
+prevents it (as of this writing) from achieving the same performance as GCC
+when compiling the C version of the Huffman encoder
+(https://llvm.org/bugs/show_bug.cgi?id=16035). For the purposes of benchmarking
+or regression testing, SIMD-accelerated Huffman encoding can be disabled by
+setting the JSIMD_NOHUFFENC environment variable to 1.
+
+[13] Added ARM 64-bit (ARMv8) NEON SIMD implementations of the commonly-used
+compression algorithms (including the slow integer forward DCT and h2v2 & h2v1
+downsampling algorithms, which are not accelerated in the 32-bit NEON
+implementation.)  This speeds up the compression of full-color JPEGs by about
+75% on average on a Cavium ThunderX processor and by about 2-2.5x on average on
+Cortex-A53 and Cortex-A57 cores.
+
+[14] Added SIMD acceleration for Huffman encoding on NEON-capable ARM 32-bit
+and 64-bit platforms.
+
+For 32-bit code, this speeds up the compression of full-color JPEGs by about
+30% on average on a typical iOS device (iPhone 4S, Cortex-A9) and by about 6-7%
+on average on a typical Android device (Nexus 5X, Cortex-A53 and Cortex-A57),
+relative to libjpeg-turbo 1.4.x.  Note that the larger speedup under iOS is due
+to the fact that iOS builds use LLVM, which does not optimize the C Huffman
+encoder as well as GCC does.
+
+For 64-bit code, NEON-accelerated Huffman encoding speeds up the compression of
+full-color JPEGs by about 40% on average on a typical iOS device (iPhone 5S,
+Apple A7) and by about 7-8% on average on a typical Android device (Nexus 5X,
+Cortex-A53 and Cortex-A57), in addition to the speedup described in [13] above.
+
+For the purposes of benchmarking or regression testing, SIMD-accelerated
+Huffman encoding can be disabled by setting the JSIMD_NOHUFFENC environment
+variable to 1.
+
+[15] pkg-config (.pc) scripts are now included for both the libjpeg and
+TurboJPEG API libraries on Un*x systems.  Note that if a project's build system
+relies on these scripts, then it will not be possible to build that project
+with libjpeg or with a prior version of libjpeg-turbo.
+
+[16] Optimized the ARM 64-bit (ARMv8) NEON SIMD decompression routines to
+improve performance on CPUs with in-order pipelines.  This speeds up the
+decompression of full-color JPEGs by nearly 2x on average on a Cavium ThunderX
+processor and by about 15% on average on a Cortex-A53 core.
+
+[17] Fixed an issue in the accelerated Huffman decoder that could have caused
+the decoder to read past the end of the input buffer when a malformed,
+specially-crafted JPEG image was being decompressed.  In prior versions of
+libjpeg-turbo, the accelerated Huffman decoder was invoked (in most cases) only
+if there were > 128 bytes of data in the input buffer.  However, it is possible
+to construct a JPEG image in which a single Huffman block is over 430 bytes
+long, so this version of libjpeg-turbo activates the accelerated Huffman
+decoder only if there are > 512 bytes of data in the input buffer.
+
+[18] Fixed a memory leak in tjunittest encountered when running the program
+with the -yuv option.
+
+
+1.4.2
+=====
+
+[1] Fixed an issue whereby cjpeg would segfault if a Windows bitmap with a
+negative width or height was used as an input image (Windows bitmaps can have
+a negative height if they are stored in top-down order, but such files are
+rare and not supported by libjpeg-turbo.)
+
+[2] Fixed an issue whereby, under certain circumstances, libjpeg-turbo would
+incorrectly encode certain JPEG images when quality=100 and the fast integer
+forward DCT were used.  This was known to cause 'make test' to fail when the
+library was built with '-march=haswell' on x86 systems.
+
+[3] Fixed an issue whereby libjpeg-turbo would crash when built with the latest
+& greatest development version of the Clang/LLVM compiler.  This was caused by
+an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
+routines.  Those routines were incorrectly using a 64-bit mov instruction to
+transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined.  The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+[4] Fixed a bug in the MIPS DSPr2 4:2:0 "plain" (non-fancy and non-merged)
+upsampling routine that caused a buffer overflow (and subsequent segfault) when
+decompressing a 4:2:0 JPEG image whose scaled output width was less than 16
+pixels.  The "plain" upsampling routines are normally only used when
+decompressing a non-YCbCr JPEG image, but they are also used when decompressing
+a JPEG image whose scaled output height is 1.
+
+[5] Fixed various negative left shifts and other issues reported by the GCC and
+Clang undefined behavior sanitizers.  None of these was known to pose a
+security threat, but removing the warnings makes it easier to detect actual
+security issues, should they arise in the future.
+
+
+1.4.1
+=====
+
+[1] tjbench now properly handles CMYK/YCCK JPEG files.  Passing an argument of
+-cmyk (instead of, for instance, -rgb) will cause tjbench to internally convert
+the source bitmap to CMYK prior to compression, to generate YCCK JPEG files,
+and to internally convert the decompressed CMYK pixels back to RGB after
+decompression (the latter is done automatically if a CMYK or YCCK JPEG is
+passed to tjbench as a source image.)  The CMYK<->RGB conversion operation is
+not benchmarked.  NOTE: The quick & dirty CMYK<->RGB conversions that tjbench
+uses are suitable for testing only.  Proper conversion between CMYK and RGB
+requires a color management system.
+
+[2] 'make test' now performs additional bitwise regression tests using tjbench,
+mainly for the purpose of testing compression from/decompression to a subregion
+of a larger image buffer.
+
+[3] 'make test' no longer tests the regression of the floating point DCT/IDCT
+by default, since the results of those tests can vary if the algorithms in
+question are not implemented using SIMD instructions on a particular platform.
+See the comments in Makefile.am for information on how to re-enable the tests
+and to specify an expected result for them based on the particulars of your
+platform.
+
+[4] The NULL color conversion routines have been significantly optimized,
+which speeds up the compression of RGB and CMYK JPEGs by 5-20% when using
+64-bit code and 0-3% when using 32-bit code, and the decompression of those
+images by 10-30% when using 64-bit code and 3-12% when using 32-bit code.
+
+[5] Fixed an "illegal instruction" error that occurred when djpeg from a
+SIMD-enabled libjpeg-turbo MIPS build was executed with the -nosmooth option on
+a MIPS machine that lacked DSPr2 support.  The MIPS SIMD routines for h2v1 and
+h2v2 merged upsampling were not properly checking for the existence of DSPr2.
+
+[6] Performance has been improved significantly on 64-bit non-Linux and
+non-Windows platforms (generally 10-20% faster compression and 5-10% faster
+decompression.)  Due to an oversight, the 64-bit version of the accelerated
+Huffman codec was not being compiled in when libjpeg-turbo was built on
+platforms other than Windows or Linux.  Oops.
+
+[7] Fixed an extremely rare bug in the Huffman encoder that caused 64-bit
+builds of libjpeg-turbo to incorrectly encode a few specific test images when
+quality=98, an optimized Huffman table, and the slow integer forward DCT were
+used.
+
+[8] The Windows (CMake) build system now supports building only static or only
+shared libraries.  This is accomplished by adding either -DENABLE_STATIC=0 or
+-DENABLE_SHARED=0 to the CMake command line.
+
+[9] TurboJPEG API functions will now return an error code if a warning is
+triggered in the underlying libjpeg API.  For instance, if a JPEG file is
+corrupt, the TurboJPEG decompression functions will attempt to decompress
+as much of the image as possible, but those functions will now return -1 to
+indicate that the decompression was not entirely successful.
+
+[10] Fixed a bug in the MIPS DSPr2 4:2:2 fancy upsampling routine that caused a
+buffer overflow (and subsequent segfault) when decompressing a 4:2:2 JPEG image
+in which the right-most MCU was 5 or 6 pixels wide.
+
+
+1.4.0
+=====
+
+[1] Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
+because OS X does not provide the le32toh() and htole32() functions.)
+
+[2] The non-SIMD RGB565 color conversion code did not work correctly on big
+endian machines.  This has been fixed.
+
+[3] Fixed an issue in tjPlaneSizeYUV() whereby it would erroneously return 1
+instead of -1 if componentID was > 0 and subsamp was TJSAMP_GRAY.
+
+[4] Fixed an issue in tjBufSizeYUV2() whereby it would erroneously return 0
+instead of -1 if width was < 1.
+
+[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
+ARM64 platforms (see 1.4 beta1 [5].)
+
+[6] The close() method in the TJCompressor and TJDecompressor Java classes is
+now idempotent.  Previously, that method would call the native tjDestroy()
+function even if the TurboJPEG instance had already been destroyed.  This
+caused an exception to be thrown during finalization, if the close() method had
+already been called.  The exception was caught, but it was still an expensive
+operation.
+
+[7] The TurboJPEG API previously generated an error ("Could not determine
+subsampling type for JPEG image") when attempting to decompress grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with 'cjpeg -grayscale -sample 2x2').  Subsampling technically has no meaning
+with grayscale JPEGs, and thus the horizontal and vertical sampling factors
+for such images are ignored by the decompressor.  However, the TurboJPEG API
+was being too rigid and was expecting the sampling factors to be equal to 1
+before it treated the image as a grayscale JPEG.
+
+[8] cjpeg, djpeg, and jpegtran now accept an argument of -version, which will
+print the library version and exit.
+
+[9] Referring to 1.4 beta1 [15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun.  Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks.  Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
+[10] The new tjPlaneSizeYUV(), tjPlaneWidth(), and tjPlaneHeight() functions
+were not actually usable on any platform except OS X and Windows, because
+those functions were not included in the libturbojpeg mapfile.  This has been
+fixed.
+
+[11] Restored the JPP(), JMETHOD(), and FAR macros in the libjpeg-turbo header
+files.  The JPP() and JMETHOD() macros were originally implemented in libjpeg
+as a way of supporting non-ANSI compilers that lacked support for prototype
+parameters.  libjpeg-turbo has never supported such compilers, but some
+software packages still use the macros to define their own prototypes.
+Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
+have far symbols, but some software packages still use the FAR macro.  A pretty
+good argument can be made that this is a bad practice on the part of the
+software in question, but since this affects more than one package, it's just
+easier to fix it here.
+
+[12] Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
+for iOS, and included an ARMv8 architecture in all of the binaries installed by
+the "official" libjpeg-turbo SDK for OS X.
+
+
+1.3.90 (1.4 beta1)
+==================
+
+[1] New features in the TurboJPEG API:
+-- YUV planar images can now be generated with an arbitrary line padding
+(previously only 4-byte padding, which was compatible with X Video, was
+supported.)
+-- The decompress-to-YUV function has been extended to support image scaling.
+-- JPEG images can now be compressed from YUV planar source images.
+-- YUV planar images can now be decoded into RGB or grayscale images.
+-- 4:1:1 subsampling is now supported.  This is mainly included for
+compatibility, since 4:1:1 is not fully accelerated in libjpeg-turbo and has no
+significant advantages relative to 4:2:0.
+-- CMYK images are now supported.  This feature allows CMYK source images to be
+compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be decompressed to CMYK
+destination images.  Conversion between CMYK/YCCK and RGB or YUV images is not
+supported.  Such conversion requires a color management system and is thus out
+of scope for a codec library.
+-- The handling of YUV images in the Java API has been significantly refactored
+and should now be much more intuitive.
+-- The Java API now supports encoding a YUV image from an arbitrary position in
+a large image buffer.
+-- All of the YUV functions now have a corresponding function that operates on
+separate image planes instead of a unified image buffer.  This allows for
+compressing/decoding from or decompressing/encoding to a subregion of a larger
+YUV image.  It also allows for handling YUV formats that swap the order of the
+U and V planes.
+
+[2] Added SIMD acceleration for DSPr2-capable MIPS platforms.  This speeds up
+the compression of full-color JPEGs by 70-80% on such platforms and
+decompression by 25-35%.
+
+[3] If an application attempts to decompress a Huffman-coded JPEG image whose
+header does not contain Huffman tables, libjpeg-turbo will now insert the
+default Huffman tables.  In order to save space, many motion JPEG video frames
+are encoded without the default Huffman tables, so these frames can now be
+successfully decompressed by libjpeg-turbo without additional work on the part
+of the application.  An application can still override the Huffman tables, for
+instance to re-use tables from a previous frame of the same video.
+
+[4] The Mac packaging system now uses pkgbuild and productbuild rather than
+PackageMaker (which is obsolete and no longer supported.)  This means that
+OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
+although the packages produced can be installed on OS X 10.5 "Leopard" or
+later.  OS X 10.4 "Tiger" is no longer supported.
+
+[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
+ARM platforms rather than a lookup table.  This reduces the memory footprint
+by 64k, which may be important for some mobile applications.  Out of four
+Android devices that were tested, two demonstrated a small overall performance
+loss (~3-4% on average) with ARMv6 code and a small gain (also ~3-4%) with
+ARMv7 code when enabling this new feature, but the other two devices
+demonstrated a significant overall performance gain with both ARMv6 and ARMv7
+code (~10-20%) when enabling the feature.  Actual mileage may vary.
+
+[6] Worked around an issue with Visual C++ 2010 and later that caused incorrect
+pixels to be generated when decompressing a JPEG image to a 256-color bitmap,
+if compiler optimization was enabled when libjpeg-turbo was built.  This caused
+the regression tests to fail when doing a release build under Visual C++ 2010
+and later.
+
+[7] Improved the accuracy and performance of the non-SIMD implementation of the
+floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
+The accuracy of this implementation now matches the accuracy of the SSE/SSE2
+implementation.  Note, however, that the floating point DCT/IDCT algorithms are
+mainly a legacy feature.  They generally do not produce significantly better
+accuracy than the slow integer DCT/IDCT algorithms, and they are quite a bit
+slower.
+
+[8] Added a new output colorspace (JCS_RGB565) to the libjpeg API that allows
+for decompressing JPEG images into RGB565 (16-bit) pixels.  If dithering is not
+used, then this code path is SIMD-accelerated on ARM platforms.
+
+[9] Numerous obsolete features, such as support for non-ANSI compilers and
+support for the MS-DOS memory model, were removed from the libjpeg code,
+greatly improving its readability and making it easier to maintain and extend.
+
+[10] Fixed a segfault that occurred when calling output_message() with msg_code
+set to JMSG_COPYRIGHT.
+
+[11] Fixed an issue whereby wrjpgcom was allowing comments longer than 65k
+characters to be passed on the command line, which was causing it to generate
+incorrect JPEG files.
+
+[12] Fixed a bug in the build system that was causing the Windows version of
+wrjpgcom to be built using the rdjpgcom source code.
+
+[13] Restored 12-bit-per-component JPEG support.  A 12-bit version of
+libjpeg-turbo can now be built by passing an argument of --with-12bit to
+configure (Unix) or -DWITH_12BIT=1 to cmake (Windows.)  12-bit JPEG support is
+included only for convenience.  Enabling this feature disables all of the
+performance features in libjpeg-turbo, as well as arithmetic coding and the
+TurboJPEG API.  The resulting library still contains the other libjpeg-turbo
+features (such as the colorspace extensions), but in general, it performs no
+faster than libjpeg v6b.
+
+[14] Added ARM 64-bit SIMD acceleration for the YCC-to-RGB color conversion
+and IDCT algorithms (both are used during JPEG decompression.)  For unknown
+reasons (probably related to clang), this code cannot currently be compiled for
+iOS.
+
+[15] Fixed an extremely rare bug that could cause the Huffman encoder's local
+buffer to overrun when a very high-frequency MCU is compressed using quality
+100 and no subsampling, and when the JPEG output buffer is being dynamically
+resized by the destination manager.  This issue was so rare that, even with a
+test program specifically designed to make the bug occur (by injecting random
+high-frequency YUV data into the compressor), it was reproducible only once in
+about every 25 million iterations.
+
+[16] Fixed an oversight in the TurboJPEG C wrapper:  if any of the JPEG
+compression functions was called repeatedly with the same
+automatically-allocated destination buffer, then TurboJPEG would erroneously
+assume that the jpegSize parameter was equal to the size of the buffer, when in
+fact that parameter was probably equal to the size of the most recently
+compressed JPEG image.  If the size of the previous JPEG image was not as large
+as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
+destination buffer.
+
+
 1.3.1
 =====
 
@@ -128,9 +532,9 @@
 incremented by 1 to reflect this.  You can disable this feature with a
 configure/CMake switch in order to retain strict API/ABI compatibility with the
 libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.)  See
-README-turbo.txt for more details.
+README.md for more details.
 
-[13] Added ARM v7s architecture to libjpeg.a and libturbojpeg.a in the official
+[13] Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
 libjpeg-turbo binary package for OS X, so that those libraries can be used to
 build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
 
@@ -213,7 +617,7 @@
 are in violation of the spec, other JPEG implementations handle them
 correctly.
 
-[7] Added ARM v6 and ARM v7 architectures to libjpeg.a and libturbojpeg.a in
+[7] Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
 the official libjpeg-turbo binary package for OS X, so that those libraries can
 be used to build both OS X and iOS applications.
 
@@ -364,7 +768,7 @@
 ==================
 
 [1] Added emulation of the libjpeg v7 and v8 APIs and ABIs.  See
-README-turbo.txt for more details.  This feature was sponsored by CamTrace SAS.
+README.md for more details.  This feature was sponsored by CamTrace SAS.
 
 [2] Created a new CMake-based build system for the Visual C++ and MinGW builds.
 
diff --git a/LGPL.txt b/LGPL.txt
deleted file mode 100644
index b1e3f5a..0000000
--- a/LGPL.txt
+++ /dev/null
@@ -1,504 +0,0 @@
-		  GNU LESSER GENERAL PUBLIC LICENSE
-		       Version 2.1, February 1999
-
- Copyright (C) 1991, 1999 Free Software Foundation, Inc.
-     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-[This is the first released version of the Lesser GPL.  It also counts
- as the successor of the GNU Library Public License, version 2, hence
- the version number 2.1.]
-
-			    Preamble
-
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-Licenses are intended to guarantee your freedom to share and change
-free software--to make sure the software is free for all its users.
-
-  This license, the Lesser General Public License, applies to some
-specially designated software packages--typically libraries--of the
-Free Software Foundation and other authors who decide to use it.  You
-can use it too, but we suggest you first think carefully about whether
-this license or the ordinary General Public License is the better
-strategy to use in any particular case, based on the explanations below.
-
-  When we speak of free software, we are referring to freedom of use,
-not price.  Our General Public Licenses are designed to make sure that
-you have the freedom to distribute copies of free software (and charge
-for this service if you wish); that you receive source code or can get
-it if you want it; that you can change the software and use pieces of
-it in new free programs; and that you are informed that you can do
-these things.
-
-  To protect your rights, we need to make restrictions that forbid
-distributors to deny you these rights or to ask you to surrender these
-rights.  These restrictions translate to certain responsibilities for
-you if you distribute copies of the library or if you modify it.
-
-  For example, if you distribute copies of the library, whether gratis
-or for a fee, you must give the recipients all the rights that we gave
-you.  You must make sure that they, too, receive or can get the source
-code.  If you link other code with the library, you must provide
-complete object files to the recipients, so that they can relink them
-with the library after making changes to the library and recompiling
-it.  And you must show them these terms so they know their rights.
-
-  We protect your rights with a two-step method: (1) we copyright the
-library, and (2) we offer you this license, which gives you legal
-permission to copy, distribute and/or modify the library.
-
-  To protect each distributor, we want to make it very clear that
-there is no warranty for the free library.  Also, if the library is
-modified by someone else and passed on, the recipients should know
-that what they have is not the original version, so that the original
-author's reputation will not be affected by problems that might be
-introduced by others.
-
-  Finally, software patents pose a constant threat to the existence of
-any free program.  We wish to make sure that a company cannot
-effectively restrict the users of a free program by obtaining a
-restrictive license from a patent holder.  Therefore, we insist that
-any patent license obtained for a version of the library must be
-consistent with the full freedom of use specified in this license.
-
-  Most GNU software, including some libraries, is covered by the
-ordinary GNU General Public License.  This license, the GNU Lesser
-General Public License, applies to certain designated libraries, and
-is quite different from the ordinary General Public License.  We use
-this license for certain libraries in order to permit linking those
-libraries into non-free programs.
-
-  When a program is linked with a library, whether statically or using
-a shared library, the combination of the two is legally speaking a
-combined work, a derivative of the original library.  The ordinary
-General Public License therefore permits such linking only if the
-entire combination fits its criteria of freedom.  The Lesser General
-Public License permits more lax criteria for linking other code with
-the library.
-
-  We call this license the "Lesser" General Public License because it
-does Less to protect the user's freedom than the ordinary General
-Public License.  It also provides other free software developers Less
-of an advantage over competing non-free programs.  These disadvantages
-are the reason we use the ordinary General Public License for many
-libraries.  However, the Lesser license provides advantages in certain
-special circumstances.
-
-  For example, on rare occasions, there may be a special need to
-encourage the widest possible use of a certain library, so that it becomes
-a de-facto standard.  To achieve this, non-free programs must be
-allowed to use the library.  A more frequent case is that a free
-library does the same job as widely used non-free libraries.  In this
-case, there is little to gain by limiting the free library to free
-software only, so we use the Lesser General Public License.
-
-  In other cases, permission to use a particular library in non-free
-programs enables a greater number of people to use a large body of
-free software.  For example, permission to use the GNU C Library in
-non-free programs enables many more people to use the whole GNU
-operating system, as well as its variant, the GNU/Linux operating
-system.
-
-  Although the Lesser General Public License is Less protective of the
-users' freedom, it does ensure that the user of a program that is
-linked with the Library has the freedom and the wherewithal to run
-that program using a modified version of the Library.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.  Pay close attention to the difference between a
-"work based on the library" and a "work that uses the library".  The
-former contains code derived from the library, whereas the latter must
-be combined with the library in order to run.
-
-		  GNU LESSER GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License Agreement applies to any software library or other
-program which contains a notice placed by the copyright holder or
-other authorized party saying it may be distributed under the terms of
-this Lesser General Public License (also called "this License").
-Each licensee is addressed as "you".
-
-  A "library" means a collection of software functions and/or data
-prepared so as to be conveniently linked with application programs
-(which use some of those functions and data) to form executables.
-
-  The "Library", below, refers to any such software library or work
-which has been distributed under these terms.  A "work based on the
-Library" means either the Library or any derivative work under
-copyright law: that is to say, a work containing the Library or a
-portion of it, either verbatim or with modifications and/or translated
-straightforwardly into another language.  (Hereinafter, translation is
-included without limitation in the term "modification".)
-
-  "Source code" for a work means the preferred form of the work for
-making modifications to it.  For a library, complete source code means
-all the source code for all modules it contains, plus any associated
-interface definition files, plus the scripts used to control compilation
-and installation of the library.
-
-  Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running a program using the Library is not restricted, and output from
-such a program is covered only if its contents constitute a work based
-on the Library (independent of the use of the Library in a tool for
-writing it).  Whether that is true depends on what the Library does
-and what the program that uses the Library does.
-  
-  1. You may copy and distribute verbatim copies of the Library's
-complete source code as you receive it, in any medium, provided that
-you conspicuously and appropriately publish on each copy an
-appropriate copyright notice and disclaimer of warranty; keep intact
-all the notices that refer to this License and to the absence of any
-warranty; and distribute a copy of this License along with the
-Library.
-
-  You may charge a fee for the physical act of transferring a copy,
-and you may at your option offer warranty protection in exchange for a
-fee.
-
-  2. You may modify your copy or copies of the Library or any portion
-of it, thus forming a work based on the Library, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) The modified work must itself be a software library.
-
-    b) You must cause the files modified to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    c) You must cause the whole of the work to be licensed at no
-    charge to all third parties under the terms of this License.
-
-    d) If a facility in the modified Library refers to a function or a
-    table of data to be supplied by an application program that uses
-    the facility, other than as an argument passed when the facility
-    is invoked, then you must make a good faith effort to ensure that,
-    in the event an application does not supply such function or
-    table, the facility still operates, and performs whatever part of
-    its purpose remains meaningful.
-
-    (For example, a function in a library to compute square roots has
-    a purpose that is entirely well-defined independent of the
-    application.  Therefore, Subsection 2d requires that any
-    application-supplied function or table used by this function must
-    be optional: if the application does not supply it, the square
-    root function must still compute square roots.)
-
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Library,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Library, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote
-it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Library.
-
-In addition, mere aggregation of another work not based on the Library
-with the Library (or with a work based on the Library) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may opt to apply the terms of the ordinary GNU General Public
-License instead of this License to a given copy of the Library.  To do
-this, you must alter all the notices that refer to this License, so
-that they refer to the ordinary GNU General Public License, version 2,
-instead of to this License.  (If a newer version than version 2 of the
-ordinary GNU General Public License has appeared, then you can specify
-that version instead if you wish.)  Do not make any other change in
-these notices.
-
-  Once this change is made in a given copy, it is irreversible for
-that copy, so the ordinary GNU General Public License applies to all
-subsequent copies and derivative works made from that copy.
-
-  This option is useful when you wish to copy part of the code of
-the Library into a program that is not a library.
-
-  4. You may copy and distribute the Library (or a portion or
-derivative of it, under Section 2) in object code or executable form
-under the terms of Sections 1 and 2 above provided that you accompany
-it with the complete corresponding machine-readable source code, which
-must be distributed under the terms of Sections 1 and 2 above on a
-medium customarily used for software interchange.
-
-  If distribution of object code is made by offering access to copy
-from a designated place, then offering equivalent access to copy the
-source code from the same place satisfies the requirement to
-distribute the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
-  5. A program that contains no derivative of any portion of the
-Library, but is designed to work with the Library by being compiled or
-linked with it, is called a "work that uses the Library".  Such a
-work, in isolation, is not a derivative work of the Library, and
-therefore falls outside the scope of this License.
-
-  However, linking a "work that uses the Library" with the Library
-creates an executable that is a derivative of the Library (because it
-contains portions of the Library), rather than a "work that uses the
-library".  The executable is therefore covered by this License.
-Section 6 states terms for distribution of such executables.
-
-  When a "work that uses the Library" uses material from a header file
-that is part of the Library, the object code for the work may be a
-derivative work of the Library even though the source code is not.
-Whether this is true is especially significant if the work can be
-linked without the Library, or if the work is itself a library.  The
-threshold for this to be true is not precisely defined by law.
-
-  If such an object file uses only numerical parameters, data
-structure layouts and accessors, and small macros and small inline
-functions (ten lines or less in length), then the use of the object
-file is unrestricted, regardless of whether it is legally a derivative
-work.  (Executables containing this object code plus portions of the
-Library will still fall under Section 6.)
-
-  Otherwise, if the work is a derivative of the Library, you may
-distribute the object code for the work under the terms of Section 6.
-Any executables containing that work also fall under Section 6,
-whether or not they are linked directly with the Library itself.
-
-  6. As an exception to the Sections above, you may also combine or
-link a "work that uses the Library" with the Library to produce a
-work containing portions of the Library, and distribute that work
-under terms of your choice, provided that the terms permit
-modification of the work for the customer's own use and reverse
-engineering for debugging such modifications.
-
-  You must give prominent notice with each copy of the work that the
-Library is used in it and that the Library and its use are covered by
-this License.  You must supply a copy of this License.  If the work
-during execution displays copyright notices, you must include the
-copyright notice for the Library among them, as well as a reference
-directing the user to the copy of this License.  Also, you must do one
-of these things:
-
-    a) Accompany the work with the complete corresponding
-    machine-readable source code for the Library including whatever
-    changes were used in the work (which must be distributed under
-    Sections 1 and 2 above); and, if the work is an executable linked
-    with the Library, with the complete machine-readable "work that
-    uses the Library", as object code and/or source code, so that the
-    user can modify the Library and then relink to produce a modified
-    executable containing the modified Library.  (It is understood
-    that the user who changes the contents of definitions files in the
-    Library will not necessarily be able to recompile the application
-    to use the modified definitions.)
-
-    b) Use a suitable shared library mechanism for linking with the
-    Library.  A suitable mechanism is one that (1) uses at run time a
-    copy of the library already present on the user's computer system,
-    rather than copying library functions into the executable, and (2)
-    will operate properly with a modified version of the library, if
-    the user installs one, as long as the modified version is
-    interface-compatible with the version that the work was made with.
-
-    c) Accompany the work with a written offer, valid for at
-    least three years, to give the same user the materials
-    specified in Subsection 6a, above, for a charge no more
-    than the cost of performing this distribution.
-
-    d) If distribution of the work is made by offering access to copy
-    from a designated place, offer equivalent access to copy the above
-    specified materials from the same place.
-
-    e) Verify that the user has already received a copy of these
-    materials or that you have already sent this user a copy.
-
-  For an executable, the required form of the "work that uses the
-Library" must include any data and utility programs needed for
-reproducing the executable from it.  However, as a special exception,
-the materials to be distributed need not include anything that is
-normally distributed (in either source or binary form) with the major
-components (compiler, kernel, and so on) of the operating system on
-which the executable runs, unless that component itself accompanies
-the executable.
-
-  It may happen that this requirement contradicts the license
-restrictions of other proprietary libraries that do not normally
-accompany the operating system.  Such a contradiction means you cannot
-use both them and the Library together in an executable that you
-distribute.
-
-  7. You may place library facilities that are a work based on the
-Library side-by-side in a single library together with other library
-facilities not covered by this License, and distribute such a combined
-library, provided that the separate distribution of the work based on
-the Library and of the other library facilities is otherwise
-permitted, and provided that you do these two things:
-
-    a) Accompany the combined library with a copy of the same work
-    based on the Library, uncombined with any other library
-    facilities.  This must be distributed under the terms of the
-    Sections above.
-
-    b) Give prominent notice with the combined library of the fact
-    that part of it is a work based on the Library, and explaining
-    where to find the accompanying uncombined form of the same work.
-
-  8. You may not copy, modify, sublicense, link with, or distribute
-the Library except as expressly provided under this License.  Any
-attempt otherwise to copy, modify, sublicense, link with, or
-distribute the Library is void, and will automatically terminate your
-rights under this License.  However, parties who have received copies,
-or rights, from you under this License will not have their licenses
-terminated so long as such parties remain in full compliance.
-
-  9. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Library or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Library (or any work based on the
-Library), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Library or works based on it.
-
-  10. Each time you redistribute the Library (or any work based on the
-Library), the recipient automatically receives a license from the
-original licensor to copy, distribute, link with or modify the Library
-subject to these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties with
-this License.
-
-  11. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Library at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Library by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Library.
-
-If any portion of this section is held invalid or unenforceable under any
-particular circumstance, the balance of the section is intended to apply,
-and the section as a whole is intended to apply in other circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
-  12. If the distribution and/or use of the Library is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Library under this License may add
-an explicit geographical distribution limitation excluding those countries,
-so that distribution is permitted only in or among countries not thus
-excluded.  In such case, this License incorporates the limitation as if
-written in the body of this License.
-
-  13. The Free Software Foundation may publish revised and/or new
-versions of the Lesser General Public License from time to time.
-Such new versions will be similar in spirit to the present version,
-but may differ in detail to address new problems or concerns.
-
-Each version is given a distinguishing version number.  If the Library
-specifies a version number of this License which applies to it and
-"any later version", you have the option of following the terms and
-conditions either of that version or of any later version published by
-the Free Software Foundation.  If the Library does not specify a
-license version number, you may choose any version ever published by
-the Free Software Foundation.
-
-  14. If you wish to incorporate parts of the Library into other free
-programs whose distribution conditions are incompatible with these,
-write to the author to ask for permission.  For software which is
-copyrighted by the Free Software Foundation, write to the Free
-Software Foundation; we sometimes make exceptions for this.  Our
-decision will be guided by the two goals of preserving the free status
-of all derivatives of our free software and of promoting the sharing
-and reuse of software generally.
-
-			    NO WARRANTY
-
-  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
-WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
-EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
-OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
-KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
-LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
-THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
-WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
-AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
-FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
-LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
-RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
-FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
-SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES.
-
-		     END OF TERMS AND CONDITIONS
-
-           How to Apply These Terms to Your New Libraries
-
-  If you develop a new library, and you want it to be of the greatest
-possible use to the public, we recommend making it free software that
-everyone can redistribute and change.  You can do so by permitting
-redistribution under these terms (or, alternatively, under the terms of the
-ordinary General Public License).
-
-  To apply these terms, attach the following notices to the library.  It is
-safest to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least the
-"copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the library's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-Also add information on how to contact you by electronic and paper mail.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the library, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the
-  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
-
-  <signature of Ty Coon>, 1 April 1990
-  Ty Coon, President of Vice
-
-That's all there is to it!
-
-
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..4623e29
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,88 @@
+libjpeg-turbo Licenses
+======================
+
+libjpeg-turbo is covered by three compatible BSD-style open source licenses:
+
+- The IJG (Independent JPEG Group) License, which is listed in
+  [README.ijg](README.ijg)
+
+  This license applies to the libjpeg API library and associated programs
+  (any code inherited from libjpeg, and any modifications to that code.)
+
+- The Modified (3-clause) BSD License, which is listed in
+  [turbojpeg.c](turbojpeg.c)
+
+  This license covers the TurboJPEG API library and associated programs.
+
+- The zlib License, which is listed in [simd/jsimdext.inc](simd/jsimdext.inc)
+
+  This license is a subset of the other two, and it covers the libjpeg-turbo
+  SIMD extensions.
+
+
+Complying with the libjpeg-turbo Licenses
+=========================================
+
+This section provides a roll-up of the libjpeg-turbo licensing terms, to the
+best of our understanding.
+
+1.  If you are distributing a modified version of the libjpeg-turbo source,
+    then:
+
+    1.  You cannot alter or remove any existing copyright or license notices
+        from the source.
+
+        **Origin**
+        - Clause 1 of the IJG License
+        - Clause 1 of the Modified BSD License
+        - Clauses 1 and 3 of the zlib License
+
+    2.  You must add your own copyright notice to the header of each source
+        file you modified, so others can tell that you modified that file (if
+        there is not an existing copyright header in that file, then you can
+        simply add a notice stating that you modified the file.)
+
+        **Origin**
+        - Clause 1 of the IJG License
+        - Clause 2 of the zlib License
+
+    3.  You must include the IJG README file, and you must not alter any of the
+        copyright or license text in that file.
+
+        **Origin**
+        - Clause 1 of the IJG License
+
+2.  If you are distributing only libjpeg-turbo binaries without the source, or
+    if you are distributing an application that statically links with
+    libjpeg-turbo, then:
+
+    1.  Your product documentation must include a message stating:
+
+        This software is based in part on the work of the Independent JPEG
+        Group.
+
+        **Origin**
+        - Clause 2 of the IJG License
+
+    2.  If your binary distribution includes or uses the TurboJPEG API, then
+        your product documentation must include the text of the Modified BSD
+        License.
+
+        **Origin**
+        - Clause 2 of the Modified BSD License
+
+3.  You cannot use the name of the IJG or The libjpeg-turbo Project or the
+    contributors thereof in advertising, publicity, etc.
+
+    **Origin**
+    - IJG License
+    - Clause 3 of the Modified BSD License
+
+4.  The IJG and The libjpeg-turbo Project do not warrant libjpeg-turbo to be
+    free of defects, nor do we accept any liability for undesirable
+    consequences resulting from your use of the software.
+
+    **Origin**
+    - IJG License
+    - Modified BSD License
+    - zlib License
diff --git a/LICENSE.txt b/LICENSE.txt
deleted file mode 100644
index dbb810e..0000000
--- a/LICENSE.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-libjpeg-turbo is licensed under a non-restrictive, BSD-style license
-(see README.)  The TurboJPEG/OSS wrapper (both C and Java versions) and
-associated test programs bear a similar license, which is reproduced below:
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-- Redistributions of source code must retain the above copyright notice,
-  this list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-- Neither the name of the libjpeg-turbo Project nor the names of its
-  contributors may be used to endorse or promote products derived from this
-  software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..b29edde
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,778 @@
+lib_LTLIBRARIES = libjpeg.la
+libjpeg_la_LDFLAGS = -version-info ${LIBTOOL_CURRENT}:${SO_MINOR_VERSION}:${SO_AGE} -no-undefined
+include_HEADERS = jerror.h jmorecfg.h jpeglib.h
+
+if WITH_TURBOJPEG
+lib_LTLIBRARIES += libturbojpeg.la
+libturbojpeg_la_LDFLAGS = -version-info 1:0:1 -no-undefined
+include_HEADERS += turbojpeg.h
+endif
+
+nodist_include_HEADERS = jconfig.h
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = pkgscripts/libjpeg.pc pkgscripts/libturbojpeg.pc
+
+HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+	jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h \
+	jpeg_nbits_table.h
+
+libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
+	jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
+	jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \
+	jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
+	jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \
+	jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c \
+	jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
+	jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
+
+if WITH_ARITH
+libjpeg_la_SOURCES += jaricom.c
+endif
+
+if WITH_ARITH_ENC
+libjpeg_la_SOURCES += jcarith.c
+endif
+
+if WITH_ARITH_DEC
+libjpeg_la_SOURCES += jdarith.c
+endif
+
+
+SUBDIRS = java
+
+
+if WITH_TURBOJPEG
+
+libturbojpeg_la_SOURCES = $(libjpeg_la_SOURCES) turbojpeg.c turbojpeg.h \
+	transupp.c transupp.h jdatadst-tj.c jdatasrc-tj.c
+
+if WITH_JAVA
+
+libturbojpeg_la_SOURCES += turbojpeg-jni.c
+libturbojpeg_la_CFLAGS = ${JNI_CFLAGS}
+TJMAPFILE = turbojpeg-mapfile.jni
+
+else
+
+TJMAPFILE = turbojpeg-mapfile
+
+endif
+
+libturbojpeg_la_SOURCES += $(TJMAPFILE)
+
+if VERSION_SCRIPT
+libturbojpeg_la_LDFLAGS += $(VERSION_SCRIPT_FLAG)$(srcdir)/$(TJMAPFILE)
+endif
+
+endif
+
+
+if VERSION_SCRIPT
+libjpeg_la_LDFLAGS += $(VERSION_SCRIPT_FLAG)libjpeg.map
+endif
+
+
+if WITH_SIMD
+
+SUBDIRS += simd
+libjpeg_la_LIBADD = simd/libsimd.la
+libturbojpeg_la_LIBADD = simd/libsimd.la
+
+else
+
+libjpeg_la_SOURCES += jsimd_none.c
+
+endif
+
+
+bin_PROGRAMS = cjpeg djpeg jpegtran rdjpgcom wrjpgcom
+noinst_PROGRAMS = jcstest
+
+
+if WITH_TURBOJPEG
+
+bin_PROGRAMS += tjbench
+
+noinst_PROGRAMS += tjunittest
+
+tjbench_SOURCES = tjbench.c bmp.h bmp.c tjutil.h tjutil.c rdbmp.c rdppm.c \
+	wrbmp.c wrppm.c
+
+tjbench_LDADD = libturbojpeg.la libjpeg.la -lm
+
+tjbench_CFLAGS = -DBMP_SUPPORTED -DPPM_SUPPORTED
+
+tjunittest_SOURCES = tjunittest.c tjutil.h tjutil.c
+
+tjunittest_LDADD = libturbojpeg.la
+
+endif
+
+
+cjpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c cjpeg.c rdgif.c rdppm.c rdswitch.c
+if WITH_12BIT
+else
+cjpeg_SOURCES += rdbmp.c rdtarga.c
+endif
+
+cjpeg_LDADD = libjpeg.la
+
+cjpeg_CFLAGS = -DGIF_SUPPORTED -DPPM_SUPPORTED
+if WITH_12BIT
+else
+cjpeg_CFLAGS += -DBMP_SUPPORTED -DTARGA_SUPPORTED
+endif
+
+djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
+	wrgif.c wrppm.c
+if WITH_12BIT
+else
+djpeg_SOURCES += wrbmp.c wrtarga.c
+endif
+
+djpeg_LDADD = libjpeg.la
+
+djpeg_CFLAGS = -DGIF_SUPPORTED -DPPM_SUPPORTED
+if WITH_12BIT
+else
+djpeg_CFLAGS += -DBMP_SUPPORTED -DTARGA_SUPPORTED
+endif
+
+jpegtran_SOURCES = jpegtran.c rdswitch.c cdjpeg.c transupp.c transupp.h
+
+jpegtran_LDADD = libjpeg.la
+
+rdjpgcom_SOURCES = rdjpgcom.c
+
+rdjpgcom_LDADD = libjpeg.la
+
+wrjpgcom_SOURCES = wrjpgcom.c
+
+wrjpgcom_LDADD = libjpeg.la
+
+jcstest_SOURCES = jcstest.c
+
+jcstest_LDADD = libjpeg.la
+
+dist_man1_MANS = cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 wrjpgcom.1
+
+DOCS= coderules.txt jconfig.txt change.log rdrle.c wrrle.c BUILDING.md \
+	ChangeLog.txt
+
+dist_doc_DATA = README.ijg README.md libjpeg.txt structure.txt usage.txt \
+	wizard.txt LICENSE.md
+
+exampledir = $(docdir)
+dist_example_DATA = example.c
+
+
+EXTRA_DIST = win release $(DOCS) testimages CMakeLists.txt \
+	sharedlib/CMakeLists.txt cmakescripts libjpeg.map.in doc doxygen.config \
+	doxygen-extra.css jccolext.c jdcolext.c jdcol565.c jdmrgext.c jdmrg565.c \
+	jstdhuff.c jdcoefct.h jdmainct.h jdmaster.h jdsample.h wrppm.h \
+	md5/CMakeLists.txt
+
+dist-hook:
+	rm -rf `find $(distdir) -name .svn`
+
+
+SUBDIRS += md5
+
+if WITH_12BIT
+
+TESTORIG = testorig12.jpg
+MD5_JPEG_RGB_ISLOW = 9620f424569594bb9242b48498ad801f
+MD5_PPM_RGB_ISLOW = f3301d2219783b8b3d942b7239fa50c0
+MD5_JPEG_422_IFAST_OPT = 7322e3bd2f127f7de4b40d4480ce60e4
+MD5_PPM_422_IFAST = 79807fa552899e66a04708f533e16950
+MD5_PPM_422M_IFAST = 07737bfe8a7c1c87aaa393a0098d16b0
+MD5_JPEG_420_IFAST_Q100_PROG = a1da220b5604081863a504297ed59e55
+MD5_PPM_420_Q100_IFAST = 1b3730122709f53d007255e8dfd3305e
+MD5_PPM_420M_Q100_IFAST = 980a1a3c5bf9510022869d30b7d26566
+MD5_JPEG_GRAY_ISLOW = 235c90707b16e2e069f37c888b2636d9
+MD5_PPM_GRAY_ISLOW = 7213c10af507ad467da5578ca5ee1fca
+MD5_PPM_GRAY_ISLOW_RGB = e96ee81c30a6ed422d466338bd3de65d
+MD5_JPEG_420S_IFAST_OPT = 7af8e60be4d9c227ec63ac9b6630855e
+MD5_JPEG_3x2_FLOAT_PROG_SSE = a8c17daf77b457725ec929e215b603f8
+MD5_PPM_3x2_FLOAT_SSE = 42876ab9e5c2f76a87d08db5fbd57956
+MD5_JPEG_3x2_FLOAT_PROG_32BIT = a8c17daf77b457725ec929e215b603f8
+MD5_PPM_3x2_FLOAT_32BIT = 42876ab9e5c2f76a87d08db5fbd57956
+MD5_PPM_3x2_FLOAT_64BIT = d6fbc71153b3d8ded484dbc17c7b9cf4
+MD5_JPEG_3x2_IFAST_PROG = 1396cc2b7185cfe943d408c9d305339e
+MD5_PPM_3x2_IFAST = 3975985ef6eeb0a2cdc58daa651ccc00
+MD5_PPM_420M_ISLOW_2_1 = 4ca6be2a6f326ff9eaab63e70a8259c0
+MD5_PPM_420M_ISLOW_15_8 = 12aa9f9534c1b3d7ba047322226365eb
+MD5_PPM_420M_ISLOW_13_8 = f7e22817c7b25e1393e4ec101e9d4e96
+MD5_PPM_420M_ISLOW_11_8 = 800a16f9f4dc9b293197bfe11be10a82
+MD5_PPM_420M_ISLOW_9_8 = 06b7a92a9bc69f4dc36ec40f1937d55c
+MD5_PPM_420M_ISLOW_7_8 = 3ec444a14a4ab4eab88ffc49c48eca43
+MD5_PPM_420M_ISLOW_3_4 = 3e726b7ea872445b19437d1c1d4f0d93
+MD5_PPM_420M_ISLOW_5_8 = a8a771abdc94301d20ffac119b2caccd
+MD5_PPM_420M_ISLOW_1_2 = b419124dd5568b085787234866102866
+MD5_PPM_420M_ISLOW_3_8 = 343d19015531b7bbe746124127244fa8
+MD5_PPM_420M_ISLOW_1_4 = 35fd59d866e44659edfa3c18db2a3edb
+MD5_PPM_420M_ISLOW_1_8 = ccaed48ac0aedefda5d4abe4013f4ad7
+MD5_PPM_420_ISLOW_SKIP15_31 = 86664cd9dc956536409e44e244d20a97
+MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 = 452a21656115a163029cfba5c04fa76a
+MD5_PPM_444_ISLOW_SKIP1_6 = ef63901f71ef7a75cd78253fc0914f84
+MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 = 15b173fb5872d9575572fbcc1b05956f
+MD5_JPEG_CROP = cdb35ff4b4519392690ea040c56ea99c
+
+else
+
+TESTORIG = testorig.jpg
+MD5_JPEG_RGB_ISLOW = 768e970dd57b340ff1b83c9d3d47c77b
+MD5_PPM_RGB_ISLOW = 00a257f5393fef8821f2b88ac7421291
+MD5_BMP_RGB_ISLOW_565 = f07d2e75073e4bb10f6c6f4d36e2e3be
+MD5_BMP_RGB_ISLOW_565D = 4cfa0928ef3e6bb626d7728c924cfda4
+MD5_JPEG_422_IFAST_OPT = 2540287b79d913f91665e660303ab2c8
+MD5_PPM_422_IFAST = 35bd6b3f833bad23de82acea847129fa
+MD5_PPM_422M_IFAST = 8dbc65323d62cca7c91ba02dd1cfa81d
+MD5_BMP_422M_IFAST_565 = 3294bd4d9a1f2b3d08ea6020d0db7065
+MD5_BMP_422M_IFAST_565D = da98c9c7b6039511be4a79a878a9abc1
+MD5_JPEG_420_IFAST_Q100_PROG = 990cbe0329c882420a2094da7e5adade
+MD5_PPM_420_Q100_IFAST = 5a732542015c278ff43635e473a8a294
+MD5_PPM_420M_Q100_IFAST = ff692ee9323a3b424894862557c092f1
+MD5_JPEG_GRAY_ISLOW = 72b51f894b8f4a10b3ee3066770aa38d
+MD5_PPM_GRAY_ISLOW = 8d3596c56eace32f205deccc229aa5ed
+MD5_PPM_GRAY_ISLOW_RGB = 116424ac07b79e5e801f00508eab48ec
+MD5_BMP_GRAY_ISLOW_565 = 12f78118e56a2f48b966f792fedf23cc
+MD5_BMP_GRAY_ISLOW_565D = bdbbd616441a24354c98553df5dc82db
+MD5_JPEG_420S_IFAST_OPT = 388708217ac46273ca33086b22827ed8
+# See README.md for more details on why this next bit is necessary.
+MD5_JPEG_3x2_FLOAT_PROG_SSE = 343e3f8caf8af5986ebaf0bdc13b5c71
+MD5_PPM_3x2_FLOAT_SSE = 1a75f36e5904d6fc3a85a43da9ad89bb
+MD5_JPEG_3x2_FLOAT_PROG_32BIT = 9bca803d2042bd1eb03819e2bf92b3e5
+MD5_PPM_3x2_FLOAT_32BIT = f6bfab038438ed8f5522fbd33595dcdc
+MD5_PPM_3x2_FLOAT_64BIT = 0e917a34193ef976b679a6b069b1be26
+MD5_JPEG_3x2_IFAST_PROG = 1ee5d2c1a77f2da495f993c8c7cceca5
+MD5_PPM_3x2_IFAST = fd283664b3b49127984af0a7f118fccd
+MD5_JPEG_420_ISLOW_ARI = e986fb0a637a8d833d96e8a6d6d84ea1
+MD5_JPEG_444_ISLOW_PROGARI = 0a8f1c8f66e113c3cf635df0a475a617
+MD5_PPM_420M_IFAST_ARI = 72b59a99bcf1de24c5b27d151bde2437
+MD5_JPEG_420_ISLOW = 9a68f56bc76e466aa7e52f415d0f4a5f
+MD5_PPM_420M_ISLOW_2_1 = 9f9de8c0612f8d06869b960b05abf9c9
+MD5_PPM_420M_ISLOW_15_8 = b6875bc070720b899566cc06459b63b7
+MD5_PPM_420M_ISLOW_13_8 = bc3452573c8152f6ae552939ee19f82f
+MD5_PPM_420M_ISLOW_11_8 = d8cc73c0aaacd4556569b59437ba00a5
+MD5_PPM_420M_ISLOW_9_8 = d25e61bc7eac0002f5b393aa223747b6
+MD5_PPM_420M_ISLOW_7_8 = ddb564b7c74a09494016d6cd7502a946
+MD5_PPM_420M_ISLOW_3_4 = 8ed8e68808c3fbc4ea764fc9d2968646
+MD5_PPM_420M_ISLOW_5_8 = a3363274999da2366a024efae6d16c9b
+MD5_PPM_420M_ISLOW_1_2 = e692a315cea26b988c8e8b29a5dbcd81
+MD5_PPM_420M_ISLOW_3_8 = 79eca9175652ced755155c90e785a996
+MD5_PPM_420M_ISLOW_1_4 = 79cd778f8bf1a117690052cacdd54eca
+MD5_PPM_420M_ISLOW_1_8 = 391b3d4aca640c8567d6f8745eb2142f
+MD5_BMP_420_ISLOW_256 = 4980185e3776e89bd931736e1cddeee6
+MD5_BMP_420_ISLOW_565 = bf9d13e16c4923b92e1faa604d7922cb
+MD5_BMP_420_ISLOW_565D = 6bde71526acc44bcff76f696df8638d2
+MD5_BMP_420M_ISLOW_565 = 8dc0185245353cfa32ad97027342216f
+MD5_BMP_420M_ISLOW_565D =d1be3a3339166255e76fa50a0d70d73e
+MD5_PPM_420_ISLOW_SKIP15_31 = c4c65c1e43d7275cd50328a61e6534f0
+MD5_PPM_420_ISLOW_ARI_SKIP16_139 = 087c6b123db16ac00cb88c5b590bb74a
+MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 = 26eb36ccc7d1f0cb80cdabb0ac8b5d99
+MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4 = 886c6775af22370257122f8b16207e6d
+MD5_PPM_444_ISLOW_SKIP1_6 = 5606f86874cf26b8fcee1117a0a436a6
+MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 = db87dc7ce26bcdc7a6b56239ce2b9d6c
+MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0 = cb57b32bd6d03e35432362f7bf184b6d
+MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
+
+endif
+
+.PHONY: test
+test: tjquicktest tjbittest bittest
+
+if CROSS_COMPILING
+tjquicktest: testclean
+else
+tjquicktest: testclean all
+endif
+
+if WITH_TURBOJPEG
+if WITH_JAVA
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -bi
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -noyuvpad
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi
+	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi -noyuvpad
+endif
+	./tjunittest
+	./tjunittest -alloc
+	./tjunittest -yuv
+	./tjunittest -yuv -alloc
+	./tjunittest -yuv -noyuvpad
+endif
+	echo GREAT SUCCESS!
+
+if CROSS_COMPILING
+tjbittest: testclean
+else
+tjbittest: testclean all
+endif
+
+if WITH_TURBOJPEG
+
+MD5_PPM_GRAY_TILE = 89d3ca21213d9d864b50b4e4e7de4ca6
+MD5_PPM_420_8x8_TILE = 847fceab15c5b7b911cb986cf0f71de3
+MD5_PPM_420_16x16_TILE = ca45552a93687e078f7137cc4126a7b0
+MD5_PPM_420_32x32_TILE = d8676f1d6b68df358353bba9844f4a00
+MD5_PPM_420_64x64_TILE = 4e4c1a3d7ea4bace4f868bcbe83b7050
+MD5_PPM_420_128x128_TILE = f24c3429c52265832beab9df72a0ceae
+MD5_PPM_420M_8x8_TILE = bc25320e1f4c31ce2e610e43e9fd173c
+MD5_PPM_420M_TILE = 75ffdf14602258c5c189522af57fa605
+MD5_PPM_422_8x8_TILE = d83dacd9fc73b0a6f10c09acad64eb1e
+MD5_PPM_422_16x16_TILE = 35077fb610d72dd743b1eb0cbcfe10fb
+MD5_PPM_422_32x32_TILE = e6902ed8a449ecc0f0d6f2bf945f65f7
+MD5_PPM_422_64x64_TILE = 2b4502a8f316cedbde1da7bce3d2231e
+MD5_PPM_422_128x128_TILE = f0b5617d578f5e13c8eee215d64d4877
+MD5_PPM_422M_8x8_TILE = 828941d7f41cd6283abd6beffb7fd51d
+MD5_PPM_422M_TILE = e877ae1324c4a280b95376f7f018172f
+MD5_PPM_444_TILE = 7964e41e67cfb8d0a587c0aa4798f9c3
+
+# Test compressing from/decompressing to an arbitrary subregion of a larger
+# image buffer
+	cp $(srcdir)/testimages/testorig.ppm testout_tile.ppm
+	./tjbench testout_tile.ppm 95 -rgb -quiet -tile -benchtime 0.01 >/dev/null 2>&1
+	for i in 8 16 32 64 128; do \
+		md5/md5cmp $(MD5_PPM_GRAY_TILE) testout_tile_GRAY_Q95_$$i\x$$i.ppm; \
+	done
+	md5/md5cmp $(MD5_PPM_420_8x8_TILE) testout_tile_420_Q95_8x8.ppm
+	md5/md5cmp $(MD5_PPM_420_16x16_TILE) testout_tile_420_Q95_16x16.ppm
+	md5/md5cmp $(MD5_PPM_420_32x32_TILE) testout_tile_420_Q95_32x32.ppm
+	md5/md5cmp $(MD5_PPM_420_64x64_TILE) testout_tile_420_Q95_64x64.ppm
+	md5/md5cmp $(MD5_PPM_420_128x128_TILE) testout_tile_420_Q95_128x128.ppm
+	md5/md5cmp $(MD5_PPM_422_8x8_TILE) testout_tile_422_Q95_8x8.ppm
+	md5/md5cmp $(MD5_PPM_422_16x16_TILE) testout_tile_422_Q95_16x16.ppm
+	md5/md5cmp $(MD5_PPM_422_32x32_TILE) testout_tile_422_Q95_32x32.ppm
+	md5/md5cmp $(MD5_PPM_422_64x64_TILE) testout_tile_422_Q95_64x64.ppm
+	md5/md5cmp $(MD5_PPM_422_128x128_TILE) testout_tile_422_Q95_128x128.ppm
+	for i in 8 16 32 64 128; do \
+		md5/md5cmp $(MD5_PPM_444_TILE) testout_tile_444_Q95_$$i\x$$i.ppm; \
+	done
+	rm -f testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_*
+
+	./tjbench testout_tile.ppm 95 -rgb -fastupsample -quiet -tile -benchtime 0.01 >/dev/null 2>&1
+	md5/md5cmp $(MD5_PPM_420M_8x8_TILE) testout_tile_420_Q95_8x8.ppm
+	for i in 16 32 64 128; do \
+		md5/md5cmp $(MD5_PPM_420M_TILE) testout_tile_420_Q95_$$i\x$$i.ppm; \
+	done
+	md5/md5cmp $(MD5_PPM_422M_8x8_TILE) testout_tile_422_Q95_8x8.ppm
+	for i in 16 32 64 128; do \
+		md5/md5cmp $(MD5_PPM_422M_TILE) testout_tile_422_Q95_$$i\x$$i.ppm; \
+	done
+	rm -f testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_* testout_tile.ppm
+	echo GREAT SUCCESS!
+
+endif
+
+if CROSS_COMPILING
+bittest: testclean
+else
+bittest: testclean all
+endif
+
+# These tests are carefully crafted to provide full coverage of as many of the
+# underlying algorithms as possible (including all of the SIMD-accelerated
+# ones.)
+
+# CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
+	./cjpeg -rgb -dct int -outfile testout_rgb_islow.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_RGB_ISLOW) testout_rgb_islow.jpg
+# CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
+	./djpeg -dct int -ppm -outfile testout_rgb_islow.ppm testout_rgb_islow.jpg
+	md5/md5cmp $(MD5_PPM_RGB_ISLOW) testout_rgb_islow.ppm
+	rm -f testout_rgb_islow.ppm
+if WITH_12BIT
+	rm -f testout_rgb_islow.jpg
+else
+# CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
+	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_rgb_islow_565.bmp testout_rgb_islow.jpg
+	md5/md5cmp $(MD5_BMP_RGB_ISLOW_565) testout_rgb_islow_565.bmp
+	rm -f testout_rgb_islow_565.bmp
+# CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
+	./djpeg -dct int -rgb565 -bmp -outfile testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
+	md5/md5cmp $(MD5_BMP_RGB_ISLOW_565D) testout_rgb_islow_565D.bmp
+	rm -f testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
+endif
+
+# CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
+	./cjpeg -sample 2x1 -dct fast -opt -outfile testout_422_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_422_IFAST_OPT) testout_422_ifast_opt.jpg
+# CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
+	./djpeg -dct fast -outfile testout_422_ifast.ppm testout_422_ifast_opt.jpg
+	md5/md5cmp $(MD5_PPM_422_IFAST) testout_422_ifast.ppm
+	rm -f testout_422_ifast.ppm
+# CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
+	./djpeg -dct fast -nosmooth -outfile testout_422m_ifast.ppm testout_422_ifast_opt.jpg
+	md5/md5cmp $(MD5_PPM_422M_IFAST) testout_422m_ifast.ppm
+	rm -f testout_422m_ifast.ppm
+if WITH_12BIT
+	rm -f testout_422_ifast_opt.jpg
+else
+# CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
+	./djpeg -dct int -nosmooth -rgb565 -dither none -bmp -outfile testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg
+	md5/md5cmp $(MD5_BMP_422M_IFAST_565) testout_422m_ifast_565.bmp
+	rm -f testout_422m_ifast_565.bmp
+# CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
+	./djpeg -dct int -nosmooth -rgb565 -bmp -outfile testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
+	md5/md5cmp $(MD5_BMP_422M_IFAST_565D) testout_422m_ifast_565D.bmp
+	rm -f testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
+endif
+
+# CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
+	./cjpeg -sample 2x2 -quality 100 -dct fast -prog -outfile testout_420_q100_ifast_prog.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_420_IFAST_Q100_PROG) testout_420_q100_ifast_prog.jpg
+# CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
+	./djpeg -dct fast -outfile testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+	md5/md5cmp $(MD5_PPM_420_Q100_IFAST) testout_420_q100_ifast.ppm
+	rm -f testout_420_q100_ifast.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
+	./djpeg -dct fast -nosmooth -outfile testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+	md5/md5cmp $(MD5_PPM_420M_Q100_IFAST) testout_420m_q100_ifast.ppm
+	rm -f testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+
+# CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
+	./cjpeg -gray -dct int -outfile testout_gray_islow.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_GRAY_ISLOW) testout_gray_islow.jpg
+# CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
+	./djpeg -dct int -outfile testout_gray_islow.ppm testout_gray_islow.jpg
+	md5/md5cmp $(MD5_PPM_GRAY_ISLOW) testout_gray_islow.ppm
+	rm -f testout_gray_islow.ppm
+# CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
+	./djpeg -dct int -rgb -outfile testout_gray_islow_rgb.ppm testout_gray_islow.jpg
+	md5/md5cmp $(MD5_PPM_GRAY_ISLOW_RGB) testout_gray_islow_rgb.ppm
+	rm -f testout_gray_islow_rgb.ppm
+if WITH_12BIT
+	rm -f testout_gray_islow.jpg
+else
+# CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
+	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_gray_islow_565.bmp testout_gray_islow.jpg
+	md5/md5cmp $(MD5_BMP_GRAY_ISLOW_565) testout_gray_islow_565.bmp
+	rm -f testout_gray_islow_565.bmp
+# CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
+	./djpeg -dct int -rgb565 -bmp -outfile testout_gray_islow_565D.bmp testout_gray_islow.jpg
+	md5/md5cmp $(MD5_BMP_GRAY_ISLOW_565D) testout_gray_islow_565D.bmp
+	rm -f testout_gray_islow_565D.bmp testout_gray_islow.jpg
+endif
+
+# CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
+# ENT: 2-pass huff
+	./cjpeg -sample 2x2 -smooth 1 -dct int -opt -outfile testout_420s_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_420S_IFAST_OPT) testout_420s_ifast_opt.jpg
+	rm -f testout_420s_ifast_opt.jpg
+
+# The output of the floating point tests is not validated by default, because
+# the output differs depending on the type of floating point math used, and
+# this is only deterministic if the DCT/IDCT are implemented using SIMD
+# instructions on a particular platform.  Pass one of the following on the make
+# command line to validate the floating point tests against one of the expected
+# results:
+#
+# FLOATTEST=sse  validate against the expected results from the libjpeg-turbo
+#                SSE SIMD extensions
+# FLOATTEST=32bit  validate against the expected results from the C code
+#                  when running on a 32-bit FPU (or when SSE is being used for
+#                  floating point math, which is generally the default with
+#                  x86-64 compilers)
+# FLOATTEST=64bit  validate against the expected results from the C code
+#                  when running on a 64-bit FPU
+
+# CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
+	./cjpeg -sample 3x2 -dct float -prog -outfile testout_3x2_float_prog.jpg $(srcdir)/testimages/testorig.ppm
+	if [ "${FLOATTEST}" = "sse" ]; then \
+		md5/md5cmp $(MD5_JPEG_3x2_FLOAT_PROG_SSE) testout_3x2_float_prog.jpg; \
+	elif [ "${FLOATTEST}" = "32bit" -o "${FLOATTEST}" = "64bit" ]; then \
+		md5/md5cmp $(MD5_JPEG_3x2_FLOAT_PROG_32BIT) testout_3x2_float_prog.jpg; \
+	fi
+# CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
+	./djpeg -dct float -outfile testout_3x2_float.ppm testout_3x2_float_prog.jpg
+	if [ "${FLOATTEST}" = "sse" ]; then \
+		md5/md5cmp $(MD5_PPM_3x2_FLOAT_SSE) testout_3x2_float.ppm; \
+	elif [ "${FLOATTEST}" = "32bit" ]; then \
+		md5/md5cmp $(MD5_PPM_3x2_FLOAT_32BIT) testout_3x2_float.ppm; \
+	elif [ "${FLOATTEST}" = "64bit" ]; then \
+		md5/md5cmp $(MD5_PPM_3x2_FLOAT_64BIT) testout_3x2_float.ppm; \
+	fi
+	rm -f testout_3x2_float.ppm testout_3x2_float_prog.jpg
+
+# CC: RGB->YCC  SAMP: fullsize/int  FDCT: ifast  ENT: prog huff
+	./cjpeg -sample 3x2 -dct fast -prog -outfile testout_3x2_ifast_prog.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_3x2_IFAST_PROG) testout_3x2_ifast_prog.jpg
+# CC: YCC->RGB  SAMP: fullsize/int  IDCT: ifast  ENT: prog huff
+	./djpeg -dct fast -outfile testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
+	md5/md5cmp $(MD5_PPM_3x2_IFAST) testout_3x2_ifast.ppm
+	rm -f testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
+
+if WITH_ARITH_ENC
+# CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
+	./cjpeg -dct int -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
+	rm -f testout_420_islow_ari.jpg
+	./jpegtran -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testimgint.jpg
+	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
+	rm -f testout_420_islow_ari.jpg
+# CC: RGB->YCC  SAMP: fullsize  FDCT: islow  ENT: prog arith
+	./cjpeg -sample 1x1 -dct int -prog -arithmetic -outfile testout_444_islow_progari.jpg $(srcdir)/testimages/testorig.ppm
+	md5/md5cmp $(MD5_JPEG_444_ISLOW_PROGARI) testout_444_islow_progari.jpg
+	rm -f testout_444_islow_progari.jpg
+endif
+if WITH_ARITH_DEC
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
+	./djpeg -fast -ppm -outfile testout_420m_ifast_ari.ppm $(srcdir)/testimages/testimgari.jpg
+	md5/md5cmp $(MD5_PPM_420M_IFAST_ARI) testout_420m_ifast_ari.ppm
+	rm -f testout_420m_ifast_ari.ppm
+	./jpegtran -outfile testout_420_islow.jpg $(srcdir)/testimages/testimgari.jpg
+	md5/md5cmp $(MD5_JPEG_420_ISLOW) testout_420_islow.jpg
+	rm -f testout_420_islow.jpg
+endif
+
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
+	./djpeg -dct int -scale 2/1 -nosmooth -ppm -outfile testout_420m_islow_2_1.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_2_1) testout_420m_islow_2_1.ppm
+	rm -f testout_420m_islow_2_1.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 15x15 islow  ENT: huff
+	./djpeg -dct int -scale 15/8 -nosmooth -ppm -outfile testout_420m_islow_15_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_15_8) testout_420m_islow_15_8.ppm
+	rm -f testout_420m_islow_15_8.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 13x13 islow  ENT: huff
+	./djpeg -dct int -scale 13/8 -nosmooth -ppm -outfile testout_420m_islow_13_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_13_8) testout_420m_islow_13_8.ppm
+	rm -f testout_420m_islow_13_8.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 11x11 islow  ENT: huff
+	./djpeg -dct int -scale 11/8 -nosmooth -ppm -outfile testout_420m_islow_11_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_11_8) testout_420m_islow_11_8.ppm
+	rm -f testout_420m_islow_11_8.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 9x9 islow  ENT: huff
+	./djpeg -dct int -scale 9/8 -nosmooth -ppm -outfile testout_420m_islow_9_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_9_8) testout_420m_islow_9_8.ppm
+	rm -f testout_420m_islow_9_8.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 7x7 islow/14x14 islow  ENT: huff
+	./djpeg -dct int -scale 7/8 -nosmooth -ppm -outfile testout_420m_islow_7_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_7_8) testout_420m_islow_7_8.ppm
+	rm -f testout_420m_islow_7_8.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 6x6 islow/12x12 islow  ENT: huff
+	./djpeg -dct int -scale 3/4 -nosmooth -ppm -outfile testout_420m_islow_3_4.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_3_4) testout_420m_islow_3_4.ppm
+	rm -f testout_420m_islow_3_4.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 5x5 islow/10x10 islow  ENT: huff
+	./djpeg -dct int -scale 5/8 -nosmooth -ppm -outfile testout_420m_islow_5_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_5_8) testout_420m_islow_5_8.ppm
+	rm -f testout_420m_islow_5_8.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 4x4 islow/8x8 islow  ENT: huff
+	./djpeg -dct int -scale 1/2 -nosmooth -ppm -outfile testout_420m_islow_1_2.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_2) testout_420m_islow_1_2.ppm
+	rm -f testout_420m_islow_1_2.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 3x3 islow/6x6 islow  ENT: huff
+	./djpeg -dct int -scale 3/8 -nosmooth -ppm -outfile testout_420m_islow_3_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_3_8) testout_420m_islow_3_8.ppm
+	rm -f testout_420m_islow_3_8.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 2x2 islow/4x4 islow  ENT: huff
+	./djpeg -dct int -scale 1/4 -nosmooth -ppm -outfile testout_420m_islow_1_4.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_4) testout_420m_islow_1_4.ppm
+	rm -f testout_420m_islow_1_4.ppm
+# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 1x1 islow/2x2 islow  ENT: huff
+	./djpeg -dct int -scale 1/8 -nosmooth -ppm -outfile testout_420m_islow_1_8.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_8) testout_420m_islow_1_8.ppm
+	rm -f testout_420m_islow_1_8.ppm
+if WITH_12BIT
+else
+# CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+	./djpeg -dct int -colors 256 -bmp -outfile testout_420_islow_256.bmp $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_BMP_420_ISLOW_256) testout_420_islow_256.bmp
+	rm -f testout_420_islow_256.bmp
+# CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_420_islow_565.bmp $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_BMP_420_ISLOW_565) testout_420_islow_565.bmp
+	rm -f testout_420_islow_565.bmp
+# CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+	./djpeg -dct int -rgb565 -bmp -outfile testout_420_islow_565D.bmp $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_BMP_420_ISLOW_565D) testout_420_islow_565D.bmp
+	rm -f testout_420_islow_565D.bmp
+# CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
+	./djpeg -dct int -nosmooth -rgb565 -dither none -bmp -outfile testout_420m_islow_565.bmp $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_BMP_420M_ISLOW_565) testout_420m_islow_565.bmp
+	rm -f testout_420m_islow_565.bmp
+# CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
+	./djpeg -dct int -nosmooth -rgb565 -bmp -outfile testout_420m_islow_565D.bmp $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_BMP_420M_ISLOW_565D) testout_420m_islow_565D.bmp
+	rm -f testout_420m_islow_565D.bmp
+endif
+
+# Partial decode tests.  These tests are designed to cover all of the possible
+# code paths in jpeg_skip_scanlines().
+
+# Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
+	./djpeg -dct int -skip 15,31 -ppm -outfile testout_420_islow_skip15,31.ppm $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_PPM_420_ISLOW_SKIP15_31) testout_420_islow_skip15,31.ppm
+	rm -f testout_420_islow_skip15,31.ppm
+# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
+if WITH_ARITH_DEC
+	./djpeg -dct int -skip 16,139 -ppm -outfile testout_420_islow_ari_skip16,139.ppm $(srcdir)/testimages/testimgari.jpg
+	md5/md5cmp $(MD5_PPM_420_ISLOW_ARI_SKIP16_139) testout_420_islow_ari_skip16,139.ppm
+	rm -f testout_420_islow_ari_skip16,139.ppm
+endif
+# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
+	./cjpeg -dct int -prog -outfile testout_420_islow_prog.jpg $(srcdir)/testimages/testorig.ppm
+	./djpeg -dct int -crop 62x62+71+71 -ppm -outfile testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
+	md5/md5cmp $(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71) testout_420_islow_prog_crop62x62,71,71.ppm
+	rm -f testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
+# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
+if WITH_ARITH_DEC
+	./djpeg -dct int -crop 53x53+4+4 -ppm -outfile testout_420_islow_ari_crop53x53,4,4.ppm $(srcdir)/testimages/testimgari.jpg
+	md5/md5cmp $(MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4) testout_420_islow_ari_crop53x53,4,4.ppm
+	rm -f testout_420_islow_ari_crop53x53,4,4.ppm
+endif
+# Context rows: No   Intra-iMCU row: Yes  ENT: huff
+	./cjpeg -dct int -sample 1x1 -outfile testout_444_islow.jpg $(srcdir)/testimages/testorig.ppm
+	./djpeg -dct int -skip 1,6 -ppm -outfile testout_444_islow_skip1,6.ppm testout_444_islow.jpg
+	md5/md5cmp $(MD5_PPM_444_ISLOW_SKIP1_6) testout_444_islow_skip1,6.ppm
+	rm -f testout_444_islow_skip1,6.ppm testout_444_islow.jpg
+# Context rows: No   Intra-iMCU row: No   ENT: prog huff
+	./cjpeg -dct int -prog -sample 1x1 -outfile testout_444_islow_prog.jpg $(srcdir)/testimages/testorig.ppm
+	./djpeg -dct int -crop 98x98+13+13 -ppm -outfile testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
+	md5/md5cmp $(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13) testout_444_islow_prog_crop98x98,13,13.ppm
+	rm -f testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
+# Context rows: No   Intra-iMCU row: No   ENT: arith
+if WITH_ARITH_ENC
+	./cjpeg -dct int -arithmetic -sample 1x1 -outfile testout_444_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
+if WITH_ARITH_DEC
+	./djpeg -dct int -crop 37x37+0+0 -ppm -outfile testout_444_islow_ari_crop37x37,0,0.ppm testout_444_islow_ari.jpg
+	md5/md5cmp $(MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0) testout_444_islow_ari_crop37x37,0,0.ppm
+	rm -f testout_444_islow_ari_crop37x37,0,0.ppm
+endif
+	rm -f testout_444_islow_ari.jpg
+endif
+
+	./jpegtran -crop 120x90+20+50 -transpose -perfect -outfile testout_crop.jpg $(srcdir)/testimages/$(TESTORIG)
+	md5/md5cmp $(MD5_JPEG_CROP) testout_crop.jpg
+	rm -f testout_crop.jpg
+	echo GREAT SUCCESS!
+
+
+testclean:
+	rm -f testout*
+	rm -f *_GRAY_*.bmp
+	rm -f *_GRAY_*.png
+	rm -f *_GRAY_*.ppm
+	rm -f *_GRAY_*.jpg
+	rm -f *_GRAY.yuv
+	rm -f *_420_*.bmp
+	rm -f *_420_*.png
+	rm -f *_420_*.ppm
+	rm -f *_420_*.jpg
+	rm -f *_420.yuv
+	rm -f *_422_*.bmp
+	rm -f *_422_*.png
+	rm -f *_422_*.ppm
+	rm -f *_422_*.jpg
+	rm -f *_422.yuv
+	rm -f *_444_*.bmp
+	rm -f *_444_*.png
+	rm -f *_444_*.ppm
+	rm -f *_444_*.jpg
+	rm -f *_444.yuv
+	rm -f *_440_*.bmp
+	rm -f *_440_*.png
+	rm -f *_440_*.ppm
+	rm -f *_440_*.jpg
+	rm -f *_440.yuv
+	rm -f *_411_*.bmp
+	rm -f *_411_*.png
+	rm -f *_411_*.ppm
+	rm -f *_411_*.jpg
+	rm -f *_411.yuv
+
+
+tjtest:
+	sh ./tjbenchtest
+	sh ./tjbenchtest -alloc
+	sh ./tjbenchtest -yuv
+	sh ./tjbenchtest -yuv -alloc
+if WITH_JAVA
+	sh ./tjbenchtest.java
+	sh ./tjbenchtest.java -yuv
+endif
+
+
+pkgscripts/libjpeg-turbo.spec: pkgscripts/libjpeg-turbo.spec.tmpl
+	cat pkgscripts/libjpeg-turbo.spec.tmpl | sed s@%{__prefix}@$(prefix)@g | \
+		sed s@%{__bindir}@$(bindir)@g | sed s@%{__datadir}@$(datadir)@g | \
+		sed s@%{__docdir}@$(docdir)@g | sed s@%{__includedir}@$(includedir)@g | \
+		sed s@%{__libdir}@$(libdir)@g | sed s@%{__mandir}@$(mandir)@g \
+		> pkgscripts/libjpeg-turbo.spec
+
+rpm: all pkgscripts/libjpeg-turbo.spec
+	TMPDIR=`mktemp -d /tmp/${PACKAGE_NAME}-build.XXXXXX`; \
+	mkdir -p $$TMPDIR/RPMS; \
+	ln -fs `pwd` $$TMPDIR/BUILD; \
+	rm -f ${PKGNAME}-${VERSION}.${RPMARCH}.rpm; \
+	rpmbuild -bb --define "_blddir $$TMPDIR/buildroot"  \
+		--define "_topdir $$TMPDIR" \
+		--target ${RPMARCH} pkgscripts/libjpeg-turbo.spec; \
+	cp $$TMPDIR/RPMS/${RPMARCH}/${PKGNAME}-${VERSION}-${BUILD}.${RPMARCH}.rpm \
+		${PKGNAME}-${VERSION}.${RPMARCH}.rpm; \
+	rm -rf $$TMPDIR
+
+srpm: dist-gzip pkgscripts/libjpeg-turbo.spec
+	TMPDIR=`mktemp -d /tmp/${PACKAGE_NAME}-build.XXXXXX`; \
+	mkdir -p $$TMPDIR/RPMS; \
+	mkdir -p $$TMPDIR/SRPMS; \
+	mkdir -p $$TMPDIR/BUILD; \
+	mkdir -p $$TMPDIR/SOURCES; \
+	mkdir -p $$TMPDIR/SPECS; \
+	rm -f ${PKGNAME}-${VERSION}.src.rpm; \
+	cp ${PACKAGE_NAME}-${VERSION}.tar.gz $$TMPDIR/SOURCES; \
+	cat pkgscripts/libjpeg-turbo.spec | sed s/%{_blddir}/%{_tmppath}/g \
+		| sed s/#--\>//g \
+		> $$TMPDIR/SPECS/libjpeg-turbo.spec; \
+	rpmbuild -bs --define "_topdir $$TMPDIR" $$TMPDIR/SPECS/libjpeg-turbo.spec; \
+	cp $$TMPDIR/SRPMS/${PKGNAME}-${VERSION}-${BUILD}.src.rpm \
+		${PKGNAME}-${VERSION}.src.rpm; \
+	rm -rf $$TMPDIR
+
+pkgscripts/makedpkg: pkgscripts/makedpkg.tmpl
+	cat pkgscripts/makedpkg.tmpl | sed s@%{__prefix}@$(prefix)@g | \
+		sed s@%{__docdir}@$(docdir)@g | sed s@%{__libdir}@$(libdir)@g \
+		> pkgscripts/makedpkg
+
+deb: all pkgscripts/makedpkg
+	sh pkgscripts/makedpkg
+
+pkgscripts/uninstall: pkgscripts/uninstall.tmpl
+	cat pkgscripts/uninstall.tmpl | sed s@%{__prefix}@$(prefix)@g | \
+		sed s@%{__bindir}@$(bindir)@g | sed s@%{__datadir}@$(datadir)@g | \
+		sed s@%{__includedir}@$(includedir)@g | sed s@%{__libdir}@$(libdir)@g | \
+		sed s@%{__mandir}@$(mandir)@g > pkgscripts/uninstall
+
+pkgscripts/makemacpkg: pkgscripts/makemacpkg.tmpl
+	cat pkgscripts/makemacpkg.tmpl | sed s@%{__prefix}@$(prefix)@g | \
+		sed s@%{__bindir}@$(bindir)@g | sed s@%{__docdir}@$(docdir)@g | \
+		sed s@%{__libdir}@$(libdir)@g > pkgscripts/makemacpkg
+
+if X86_64
+
+udmg: all pkgscripts/makemacpkg pkgscripts/uninstall
+	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
+
+iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
+	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
+
+else
+
+iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
+	sh pkgscripts/makemacpkg -buildarmv6 ${BUILDDIRARMV6} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
+
+endif
+
+dmg: all pkgscripts/makemacpkg pkgscripts/uninstall
+	sh pkgscripts/makemacpkg
+
+pkgscripts/makecygwinpkg: pkgscripts/makecygwinpkg.tmpl
+	cat pkgscripts/makecygwinpkg.tmpl | sed s@%{__prefix}@$(prefix)@g | \
+		sed s@%{__docdir}@$(docdir)@g | sed s@%{__libdir}@$(libdir)@g \
+		> pkgscripts/makecygwinpkg
+
+cygwinpkg: all pkgscripts/makecygwinpkg
+	sh pkgscripts/makecygwinpkg
diff --git a/README-turbo.txt b/README-turbo.txt
deleted file mode 100644
index b81299f..0000000
--- a/README-turbo.txt
+++ /dev/null
@@ -1,475 +0,0 @@
-*******************************************************************************
-**     Background
-*******************************************************************************
-
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON) to accelerate baseline JPEG compression and decompression on x86, x86-64,
-and ARM systems.  On such systems, libjpeg-turbo is generally 2-4x as fast as
-libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can
-still outperform libjpeg by a significant amount, by virtue of its
-highly-optimized Huffman coding routines.  In many cases, the performance of
-libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
-
-libjpeg-turbo implements both the traditional libjpeg API as well as the less
-powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
-colorspace extensions that allow it to compress from/decompress to 32-bit and
-big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
-interface.
-
-libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
-derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
-VirtualGL projects made numerous enhancements to the codec in 2009, and in
-early 2010, libjpeg-turbo spun off into an independent project, with the goal
-of making high-speed JPEG compression/decompression technology available to a
-broader range of users and developers.
-
-
-*******************************************************************************
-**     License
-*******************************************************************************
-
-Most of libjpeg-turbo inherits the non-restrictive, BSD-style license used by
-libjpeg (see README.)  The TurboJPEG wrapper (both C and Java versions) and
-associated test programs bear a similar license, which is reproduced below:
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-- Redistributions of source code must retain the above copyright notice,
-  this list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-- Neither the name of the libjpeg-turbo Project nor the names of its
-  contributors may be used to endorse or promote products derived from this
-  software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-
-*******************************************************************************
-**     Using libjpeg-turbo
-*******************************************************************************
-
-libjpeg-turbo includes two APIs that can be used to compress and decompress
-JPEG images:
-
-  TurboJPEG API:  This API provides an easy-to-use interface for compressing
-  and decompressing JPEG images in memory.  It also provides some functionality
-  that would not be straightforward to achieve using the underlying libjpeg
-  API, such as generating planar YUV images and performing multiple
-  simultaneous lossless transforms on an image.  The Java interface for
-  libjpeg-turbo is written on top of the TurboJPEG API.
-
-  libjpeg API:  This is the de facto industry-standard API for compressing and
-  decompressing JPEG images.  It is more difficult to use than the TurboJPEG
-  API but also more powerful.  The libjpeg API implementation in libjpeg-turbo
-  is both API/ABI-compatible and mathematically compatible with libjpeg v6b.
-  It can also optionally be configured to be API/ABI-compatible with libjpeg v7
-  and v8 (see below.)
-
-There is no significant performance advantage to either API when both are used
-to perform similar operations.
-
-======================
-Installation Directory
-======================
-
-This document assumes that libjpeg-turbo will be installed in the default
-directory (/opt/libjpeg-turbo on Un*x and Mac systems and
-c:\libjpeg-turbo[-gcc][64] on Windows systems.  If your installation of
-libjpeg-turbo resides in a different directory, then adjust the instructions
-accordingly.
-
-=============================
-Replacing libjpeg at Run Time
-=============================
-
-Un*x
-----
-
-If a Un*x application is dynamically linked with libjpeg, then you can replace
-libjpeg with libjpeg-turbo at run time by manipulating LD_LIBRARY_PATH.
-For instance:
-
-  [Using libjpeg]
-  > time cjpeg <vgl_5674_0098.ppm >vgl_5674_0098.jpg
-  real  0m0.392s
-  user  0m0.074s
-  sys   0m0.020s
-
-  [Using libjpeg-turbo]
-  > export LD_LIBRARY_PATH=/opt/libjpeg-turbo/{lib}:$LD_LIBRARY_PATH
-  > time cjpeg <vgl_5674_0098.ppm >vgl_5674_0098.jpg
-  real  0m0.109s
-  user  0m0.029s
-  sys   0m0.010s
-
-({lib} = lib32 or lib64, depending on whether you wish to use the 32-bit or the
-64-bit version of libjpeg-turbo.)
-
-System administrators can also replace the libjpeg symlinks in /usr/lib* with
-links to the libjpeg-turbo dynamic library located in /opt/libjpeg-turbo/{lib}.
-This will effectively accelerate every application that uses the libjpeg
-dynamic library on the system.
-
-Windows
--------
-
-If a Windows application is dynamically linked with libjpeg, then you can
-replace libjpeg with libjpeg-turbo at run time by backing up the application's
-copy of jpeg62.dll, jpeg7.dll, or jpeg8.dll (assuming the application has its
-own local copy of this library) and copying the corresponding DLL from
-libjpeg-turbo into the application's install directory.  The official
-libjpeg-turbo binary packages only provide jpeg62.dll.  If the application uses
-jpeg7.dll or jpeg8.dll instead, then it will be necessary to build
-libjpeg-turbo from source (see "libjpeg v7 and v8 API/ABI Emulation" below.)
-
-The following information is specific to the official libjpeg-turbo binary
-packages for Visual C++:
-
--- jpeg62.dll requires the Visual C++ 2008 C run-time DLL (msvcr90.dll).
-msvcr90.dll ships with more recent versions of Windows, but users of older
-Windows releases can obtain it from the Visual C++ 2008 Redistributable
-Package, which is available as a free download from Microsoft's web site.
-
--- Features of the libjpeg API that require passing a C run-time structure,
-such as a file handle, from an application to the library will probably not
-work with jpeg62.dll, unless the application is also built to use the Visual
-C++ 2008 C run-time DLL.  In particular, this affects jpeg_stdio_dest() and
-jpeg_stdio_src().
-
-Mac
----
-
-Mac applications typically embed their own copies of the libjpeg dylib inside
-the (hidden) application bundle, so it is not possible to globally replace
-libjpeg on OS X systems.  Replacing the application's version of the libjpeg
-dylib would generally involve copying libjpeg.*.dylib from libjpeg-turbo into
-the appropriate place in the application bundle and using install_name_tool to
-repoint the libjpeg-turbo dylib to its new directory.  This requires an
-advanced knowledge of OS X and would not survive an upgrade or a re-install of
-the application.  Thus, it is not recommended for most users.
-
-========================================
-Using libjpeg-turbo in Your Own Programs
-========================================
-
-For the most part, libjpeg-turbo should work identically to libjpeg, so in
-most cases, an application can be built against libjpeg and then run against
-libjpeg-turbo.  On Un*x systems and Cygwin, you can build against libjpeg-turbo
-instead of libjpeg by setting
-
-  CPATH=/opt/libjpeg-turbo/include
-  and
-  LIBRARY_PATH=/opt/libjpeg-turbo/{lib}
-
-({lib} = lib32 or lib64, depending on whether you are building a 32-bit or a
-64-bit application.)
-
-If using MinGW, then set
-
-  CPATH=/c/libjpeg-turbo-gcc[64]/include
-  and
-  LIBRARY_PATH=/c/libjpeg-turbo-gcc[64]/lib
-
-Building against libjpeg-turbo is useful, for instance, if you want to build an
-application that leverages the libjpeg-turbo colorspace extensions (see below.)
-On Un*x systems, you would still need to manipulate LD_LIBRARY_PATH or create
-appropriate symlinks to use libjpeg-turbo at run time.  On such systems, you
-can pass -R /opt/libjpeg-turbo/{lib} to the linker to force the use of
-libjpeg-turbo at run time rather than libjpeg (also useful if you want to
-leverage the colorspace extensions), or you can link against the libjpeg-turbo
-static library.
-
-To force a Un*x or MinGW application to link against the static version of
-libjpeg-turbo, you can use the following linker options:
-
-  -Wl,-Bstatic -ljpeg -Wl,-Bdynamic
-
-On OS X, simply add /opt/libjpeg-turbo/lib/libjpeg.a to the linker command
-line.
-
-To build Visual C++ applications using libjpeg-turbo, add
-c:\libjpeg-turbo[64]\include to the system or user INCLUDE environment
-variable and c:\libjpeg-turbo[64]\lib to the system or user LIB environment
-variable, and then link against either jpeg.lib (to use the DLL version of
-libjpeg-turbo) or jpeg-static.lib (to use the static version of libjpeg-turbo.)
-
-=====================
-Colorspace Extensions
-=====================
-
-libjpeg-turbo includes extensions that allow JPEG images to be compressed
-directly from (and decompressed directly to) buffers that use BGR, BGRX,
-RGBX, XBGR, and XRGB pixel ordering.  This is implemented with ten new
-colorspace constants:
-
-  JCS_EXT_RGB   /* red/green/blue */
-  JCS_EXT_RGBX  /* red/green/blue/x */
-  JCS_EXT_BGR   /* blue/green/red */
-  JCS_EXT_BGRX  /* blue/green/red/x */
-  JCS_EXT_XBGR  /* x/blue/green/red */
-  JCS_EXT_XRGB  /* x/red/green/blue */
-  JCS_EXT_RGBA  /* red/green/blue/alpha */
-  JCS_EXT_BGRA  /* blue/green/red/alpha */
-  JCS_EXT_ABGR  /* alpha/blue/green/red */
-  JCS_EXT_ARGB  /* alpha/red/green/blue */
-
-Setting cinfo.in_color_space (compression) or cinfo.out_color_space
-(decompression) to one of these values will cause libjpeg-turbo to read the
-red, green, and blue values from (or write them to) the appropriate position in
-the pixel when compressing from/decompressing to an RGB buffer.
-
-Your application can check for the existence of these extensions at compile
-time with:
-
-  #ifdef JCS_EXTENSIONS
-
-At run time, attempting to use these extensions with a libjpeg implementation
-that does not support them will result in a "Bogus input colorspace" error.
-Applications can trap this error in order to test whether run-time support is
-available for the colorspace extensions.
-
-When using the RGBX, BGRX, XBGR, and XRGB colorspaces during decompression, the
-X byte is undefined, and in order to ensure the best performance, libjpeg-turbo
-can set that byte to whatever value it wishes.  If an application expects the X
-byte to be used as an alpha channel, then it should specify JCS_EXT_RGBA,
-JCS_EXT_BGRA, JCS_EXT_ABGR, or JCS_EXT_ARGB.  When these colorspace constants
-are used, the X byte is guaranteed to be 0xFF, which is interpreted as opaque.
-
-Your application can check for the existence of the alpha channel colorspace
-extensions at compile time with:
-
-  #ifdef JCS_ALPHA_EXTENSIONS
-
-jcstest.c, located in the libjpeg-turbo source tree, demonstrates how to check
-for the existence of the colorspace extensions at compile time and run time.
-
-===================================
-libjpeg v7 and v8 API/ABI Emulation
-===================================
-
-With libjpeg v7 and v8, new features were added that necessitated extending the
-compression and decompression structures.  Unfortunately, due to the exposed
-nature of those structures, extending them also necessitated breaking backward
-ABI compatibility with previous libjpeg releases.  Thus, programs that were
-built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
-based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are still not
-as widely used as v6b, enough programs (including a few Linux distros) made
-the switch that there was a demand to emulate the libjpeg v7 and v8 ABIs
-in libjpeg-turbo.  It should be noted, however, that this feature was added
-primarily so that applications that had already been compiled to use libjpeg
-v7+ could take advantage of accelerated baseline JPEG encoding/decoding
-without recompiling.  libjpeg-turbo does not claim to support all of the
-libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
-cases (see below.)
-
-By passing an argument of --with-jpeg7 or --with-jpeg8 to configure, or an
-argument of -DWITH_JPEG7=1 or -DWITH_JPEG8=1 to cmake, you can build a version
-of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that programs
-that are built against libjpeg v7 or v8 can be run with libjpeg-turbo.  The
-following section describes which libjpeg v7+ features are supported and which
-aren't.
-
-Support for libjpeg v7 and v8 Features:
----------------------------------------
-
-Fully supported:
-
--- libjpeg: IDCT scaling extensions in decompressor
-   libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
-   1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
-   and 1/2 are SIMD-accelerated.)
-
--- libjpeg: arithmetic coding
-
--- libjpeg: In-memory source and destination managers
-   See notes below.
-
--- cjpeg: Separate quality settings for luminance and chrominance
-   Note that the libpjeg v7+ API was extended to accommodate this feature only
-   for convenience purposes.  It has always been possible to implement this
-   feature with libjpeg v6b (see rdswitch.c for an example.)
-
--- cjpeg: 32-bit BMP support
-
--- cjpeg: -rgb option
-
--- jpegtran: lossless cropping
-
--- jpegtran: -perfect option
-
--- jpegtran: forcing width/height when performing lossless crop
-
--- rdjpgcom: -raw option
-
--- rdjpgcom: locale awareness
-
-
-Not supported:
-
-NOTE:  As of this writing, extensive research has been conducted into the
-usefulness of DCT scaling as a means of data reduction and SmartScale as a
-means of quality improvement.  The reader is invited to peruse the research at
-http://www.libjpeg-turbo.org/About/SmartScale and draw his/her own conclusions,
-but it is the general belief of our project that these features have not
-demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
-
--- libjpeg: DCT scaling in compressor
-   cinfo.scale_num and cinfo.scale_denom are silently ignored.
-   There is no technical reason why DCT scaling could not be supported when
-   emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
-   below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
-   8/9 would be available, which is of limited usefulness.
-
--- libjpeg: SmartScale
-   cinfo.block_size is silently ignored.
-   SmartScale is an extension to the JPEG format that allows for DCT block
-   sizes other than 8x8.  Providing support for this new format would be
-   feasible (particularly without full acceleration.)  However, until/unless
-   the format becomes either an official industry standard or, at minimum, an
-   accepted solution in the community, we are hesitant to implement it, as
-   there is no sense of whether or how it might change in the future.  It is
-   our belief that SmartScale has not demonstrated sufficient usefulness as a
-   lossless format nor as a means of quality enhancement, and thus, our primary
-   interest in providing this feature would be as a means of supporting
-   additional DCT scaling factors.
-
--- libjpeg: Fancy downsampling in compressor
-   cinfo.do_fancy_downsampling is silently ignored.
-   This requires the DCT scaling feature, which is not supported.
-
--- jpegtran: Scaling
-   This requires both the DCT scaling and SmartScale features, which are not
-   supported.
-
--- Lossless RGB JPEG files
-   This requires the SmartScale feature, which is not supported.
-
-What About libjpeg v9?
-----------------------
-
-libjpeg v9 introduced yet another field to the JPEG compression structure
-(color_transform), thus making the ABI backward incompatible with that of
-libjpeg v8.  This new field was introduced solely for the purpose of supporting
-lossless SmartScale encoding.  Further, there was actually no reason to extend
-the API in this manner, as the color transform could have just as easily been
-activated by way of a new JPEG colorspace constant, thus preserving backward
-ABI compatibility.
-
-Our research (see link above) has shown that lossless SmartScale does not
-generally accomplish anything that can't already be accomplished better with
-existing, standard lossless formats.  Thus, at this time, it is our belief that
-there is not sufficient technical justification for software to upgrade from
-libjpeg v8 to libjpeg v9, and therefore, not sufficient technical justification
-for us to emulate the libjpeg v9 ABI.
-
-=====================================
-In-Memory Source/Destination Managers
-=====================================
-
-By default, libjpeg-turbo 1.3 and later includes the jpeg_mem_src() and
-jpeg_mem_dest() functions, even when not emulating the libjpeg v8 API/ABI.
-Previously, it was necessary to build libjpeg-turbo from source with libjpeg v8
-API/ABI emulation in order to use the in-memory source/destination managers,
-but several projects requested that those functions be included when emulating
-the libjpeg v6b API/ABI as well.  This allows the use of those functions by
-programs that need them without breaking ABI compatibility for programs that
-don't, and it allows those functions to be provided in the "official"
-libjpeg-turbo binaries.
-
-Those who are concerned about maintaining strict conformance with the libjpeg
-v6b or v7 API can pass an argument of --without-mem-srcdst to configure or
-an argument of -DWITH_MEM_SRCDST=0 to CMake prior to building libjpeg-turbo.
-This will restore the pre-1.3 behavior, in which jpeg_mem_src() and
-jpeg_mem_dest() are only included when emulating the libjpeg v8 API/ABI.
-
-On Un*x systems, including the in-memory source/destination managers changes
-the dynamic library version from 62.0.0 to 62.1.0 if using libjpeg v6b API/ABI
-emulation and from 7.0.0 to 7.1.0 if using libjpeg v7 API/ABI emulation.
-
-Note that, on most Un*x systems, the dynamic linker will not look for a
-function in a library until that function is actually used.  Thus, if a program
-is built against libjpeg-turbo 1.3+ and uses jpeg_mem_src() or jpeg_mem_dest(),
-that program will not fail if run against an older version of libjpeg-turbo or
-against libjpeg v7- until the program actually tries to call jpeg_mem_src() or
-jpeg_mem_dest().  Such is not the case on Windows.  If a program is built
-against the libjpeg-turbo 1.3+ DLL and uses jpeg_mem_src() or jpeg_mem_dest(),
-then it must use the libjpeg-turbo 1.3+ DLL at run time.
-
-Both cjpeg and djpeg have been extended to allow testing the in-memory
-source/destination manager functions.  See their respective man pages for more
-details.
-
-
-*******************************************************************************
-**     Mathematical Compatibility
-*******************************************************************************
-
-For the most part, libjpeg-turbo should produce identical output to libjpeg
-v6b.  The one exception to this is when using the floating point DCT/IDCT, in
-which case the outputs of libjpeg v6b and libjpeg-turbo are not guaranteed to
-be identical (the accuracy of the floating point DCT/IDCT is constant when
-using libjpeg-turbo's SIMD extensions, but otherwise, it can depend heavily on
-the compiler and compiler settings.)
-
-While libjpeg-turbo does emulate the libjpeg v8 API/ABI, under the hood, it is
-still using the same algorithms as libjpeg v6b, so there are several specific
-cases in which libjpeg-turbo cannot be expected to produce the same output as
-libjpeg v8:
-
--- When decompressing using scaling factors of 1/2 and 1/4, because libjpeg v8
-   implements those scaling algorithms a bit differently than libjpeg v6b does,
-   and libjpeg-turbo's SIMD extensions are based on the libjpeg v6b behavior.
-
--- When using chrominance subsampling, because libjpeg v8 implements this
-   with its DCT/IDCT scaling algorithms rather than with a separate
-   downsampling/upsampling algorithm.
-
--- When using the floating point IDCT, for the reasons stated above and also
-   because the floating point IDCT algorithm was modified in libjpeg v8a to
-   improve accuracy.
-
--- When decompressing using a scaling factor > 1 and merged (AKA "non-fancy" or
-   "non-smooth") chrominance upsampling, because libjpeg v8 does not support
-   merged upsampling with scaling factors > 1.
-
-
-*******************************************************************************
-**     Performance Pitfalls
-*******************************************************************************
-
-===============
-Restart Markers
-===============
-
-The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
-in a way that makes the rest of the libjpeg infrastructure happy, so it is
-necessary to use the slow Huffman decoder when decompressing a JPEG image that
-has restart markers.  This can cause the decompression performance to drop by
-as much as 20%, but the performance will still be much greater than that of
-libjpeg.  Many consumer packages, such as PhotoShop, use restart markers when
-generating JPEG images, so images generated by those programs will experience
-this issue.
-
-===============================================
-Fast Integer Forward DCT at High Quality Levels
-===============================================
-
-The algorithm used by the SIMD-accelerated quantization function cannot produce
-correct results whenever the fast integer forward DCT is used along with a JPEG
-quality of 98-100.  Thus, libjpeg-turbo must use the non-SIMD quantization
-function in those cases.  This causes performance to drop by as much as 40%.
-It is therefore strongly advised that you use the slow integer forward DCT
-whenever encoding images with a JPEG quality of 98 or higher.
diff --git a/README.chromium b/README.chromium
index f7ea906..2846aed 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,37 +1,24 @@
 Name: libjpeg-turbo
-URL: http://sourceforge.net/projects/libjpeg-turbo/
-Version: 1.3.1
+URL: https://github.com/libjpeg-turbo/libjpeg-turbo/
+Version: 1.4.90
 License: Custom license
-License File: LICENSE.txt
+License File: LICENSE.md
 Security Critical: yes
 License Android Compatible: yes
 
 Description:
 This consists of the components:
-* A partial copy of libjpeg-turbo 1.3.1 (r1219);
-* Revision r1188 cherry-picked from upstream trunk into config.h to fix
-  compiler warning on newer versions of gcc;
-* Revision r1220 cherry-picked from upstream trunk into jchuff.c to use
-  clz/bsr instructions on ARM for bit counting rather than the lookup table
-  (reduces memory footprint and can improve performance in some cases);
-* Revisions r1108, r1109, r1333, r1375, r1386, r1389 and r1390 cherry-picked
-  from upstream trunk for Arm64 NEON SIMD support;
-* Revisions r1582, r1583, r1586, r1587, r1591, and
-  commit 91eceba0a132a3fc70388a82c75616e67725a93a (code moved to GitHub)
-  cherry-picked from upstream trunk for partial decoding optimization;
-  http://crbug.com/515694
-* Revisions r1295, r1385, r1398, and r1402 (r1386 is also required but has
-  already been cherry-picked) cherry-picked from upstream trunk to enable
-  decoding to 565 as a memory optimization;
-  http://crbug.com/516761
-* Commit 8b2c04f774d18e05e321ee67a9a38b4d7e84f168 from upstream to zero-extend
-  32-bit arguments before using them as 64-bit values: http://crbug.com/532214
-* A build file (libjpeg.gyp), and;
-* Patched header files used by Chromium.
-
-More details on cherry-picked revisions and commits can be found at:
-https://sourceforge.net/p/libjpeg-turbo/code/commit_browser
-https://github.com/libjpeg-turbo/libjpeg-turbo/commits/master
+* libjpeg-turbo 1.4.90
+* Cherry picked clang fix for Arm32 assembly from upstream master:
+  https://github.com/libjpeg-turbo/libjpeg-turbo/commit/5e576386b57663bbe9d934edf7c276eb0150cd59
+  https://github.com/libjpeg-turbo/libjpeg-turbo/commit/2e480fa2a3285d9ff83a780ab3417badeb3f2d37
+* Cherry picked uninitialized memory fix from upstream master:
+  https://github.com/libjpeg-turbo/libjpeg-turbo/commit/a572622dd654305c86585724c2a1ea34e22c2103
+* This file (README.chromium)
+* A build file (libjpeg.gyp)
+* Patched header files used by Chromium
+* Deleted unused directories: cmakescripts, doc, java, md5, release, sharedlib,
+  testimages, and win
 
 This libjpeg-turbo can replace our libjpeg-6b without any modifications in the
 Chromium code.
@@ -42,9 +29,6 @@
 arise when system libraries attempt to use our libjpeg. Also, we applied the
 following changes which are not merged to upstream:
 
-* Added the 'private_extern' flags on Mac (or the 'hidden' flags on Linux) to
-  all the global symbols in '.asm' files to prevent making them external ones.
-* Supported motion-JPEG frames that do not have DHT markers.
 * Fix libjpeg_turbo svn r64 libjpeg6b compat issue: make the fast path Huffman
   decoder fallback to slow decoding if the Huffman decoding bit sentinel > 16,
   this to match the exact behavior of jpeg_huff_decode().
diff --git a/README b/README.ijg
similarity index 91%
rename from README
rename to README.ijg
index 9100869..9c450ce 100644
--- a/README
+++ b/README.ijg
@@ -1,7 +1,7 @@
 libjpeg-turbo note:  This file has been modified by The libjpeg-turbo Project
 to include only information relevant to libjpeg-turbo, to wordsmith certain
 sections, and to remove impolitic language that existed in the libjpeg v8
-README.  It is included only for reference.  Please see README-turbo.txt for
+README.  It is included only for reference.  Please see README.md for
 information specific to libjpeg-turbo.
 
 
@@ -36,7 +36,6 @@
 Other documentation files in the distribution are:
 
 User documentation:
-  install.txt       How to configure and install the IJG software.
   usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                     rdjpgcom, and wrjpgcom.
   *.1               Unix-style man pages for programs (same info as usage.txt).
@@ -48,9 +47,9 @@
   structure.txt     Overview of the JPEG library's internal structure.
   coderules.txt     Coding style rules --- please read if you contribute code.
 
-Please read at least the files install.txt and usage.txt.  Some information
-can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
-ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.
+Please read at least usage.txt.  Some information can also be found in the JPEG
+FAQ (Frequently Asked Questions) article.  See ARCHIVE LOCATIONS below to find
+out where to obtain the FAQ article.
 
 If you want to understand how the JPEG code works, we suggest reading one or
 more of the REFERENCES, then looking at the documentation files (in roughly
@@ -62,7 +61,7 @@
 
 This package contains C software to implement JPEG image encoding, decoding,
 and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
-method for full-color and gray-scale images.  JPEG's strong suit is compressing
+method for full-color and grayscale images.  JPEG's strong suit is compressing
 photographic images or other types of images that have smooth color and
 brightness transitions between neighboring pixels.  Images with sharp lines or
 other abrupt features may not compress well with JPEG, and a higher JPEG
@@ -129,7 +128,7 @@
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 
-This software is copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
@@ -167,11 +166,11 @@
 but is also freely distributable.
 
 The IJG distribution formerly included code to read and write GIF files.
-To avoid entanglement with the Unisys LZW patent, GIF reading support has
-been removed altogether, and the GIF writer has been simplified to produce
-"uncompressed GIFs".  This technique does not use the LZW algorithm; the
-resulting GIF files are larger than usual, but are readable by all standard
-GIF decoders.
+To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
+support has been removed altogether, and the GIF writer has been simplified
+to produce "uncompressed GIFs".  This technique does not use the LZW
+algorithm; the resulting GIF files are larger than usual, but are readable
+by all standard GIF decoders.
 
 We are required to state that
     "The Graphics Interchange Format(c) is the Copyright property of
@@ -190,8 +189,8 @@
 	Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
-handy, a PostScript file containing a revised version of Wallace's article is
-available at http://www.ijg.org/files/wallace.ps.gz.  The file (actually
+handy, a PDF file containing a revised version of Wallace's article is
+available at http://www.ijg.org/files/Wallace.JPEG.pdf.  The file (actually
 a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
 omits the sample images that appeared in CACM, but it includes corrections
 and some added material.  Note: the Wallace article is copyright ACM and IEEE,
@@ -247,9 +246,7 @@
 
 The "official" archive site for this software is www.ijg.org.
 The most recent released version can always be found there in
-directory "files".  This particular version will be archived as
-http://www.ijg.org/files/jpegsrc.v8d.tar.gz, and in Windows-compatible
-"zip" archive format as http://www.ijg.org/files/jpegsr8d.zip.
+directory "files".
 
 The JPEG FAQ (Frequently Asked Questions) article is a source of some
 general information about JPEG.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ad614ca
--- /dev/null
+++ b/README.md
@@ -0,0 +1,335 @@
+Background
+==========
+
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
+NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
+x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
+generally 2-6x as fast as libjpeg, all else being equal.  On other types of
+systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
+virtue of its highly-optimized Huffman coding routines.  In many cases, the
+performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+
+libjpeg-turbo implements both the traditional libjpeg API and the less
+powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
+colorspace extensions that allow it to compress from/decompress to 32-bit and
+big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
+interface.
+
+libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
+derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
+VirtualGL projects made numerous enhancements to the codec in 2009, and in
+early 2010, libjpeg-turbo spun off into an independent project, with the goal
+of making high-speed JPEG compression/decompression technology available to a
+broader range of users and developers.
+
+
+License
+=======
+
+libjpeg-turbo is covered by three compatible BSD-style open source licenses.
+Refer to [LICENSE.md](LICENSE.md) for a roll-up of license terms.
+
+
+Using libjpeg-turbo
+===================
+
+libjpeg-turbo includes two APIs that can be used to compress and decompress
+JPEG images:
+
+- **TurboJPEG API**
+  This API provides an easy-to-use interface for compressing and decompressing
+  JPEG images in memory.  It also provides some functionality that would not be
+  straightforward to achieve using the underlying libjpeg API, such as
+  generating planar YUV images and performing multiple simultaneous lossless
+  transforms on an image.  The Java interface for libjpeg-turbo is written on
+  top of the TurboJPEG API.
+
+- **libjpeg API**
+  This is the de facto industry-standard API for compressing and decompressing
+  JPEG images.  It is more difficult to use than the TurboJPEG API but also
+  more powerful.  The libjpeg API implementation in libjpeg-turbo is both
+  API/ABI-compatible and mathematically compatible with libjpeg v6b.  It can
+  also optionally be configured to be API/ABI-compatible with libjpeg v7 and v8
+  (see below.)
+
+There is no significant performance advantage to either API when both are used
+to perform similar operations.
+
+Colorspace Extensions
+---------------------
+
+libjpeg-turbo includes extensions that allow JPEG images to be compressed
+directly from (and decompressed directly to) buffers that use BGR, BGRX,
+RGBX, XBGR, and XRGB pixel ordering.  This is implemented with ten new
+colorspace constants:
+
+    JCS_EXT_RGB   /* red/green/blue */
+    JCS_EXT_RGBX  /* red/green/blue/x */
+    JCS_EXT_BGR   /* blue/green/red */
+    JCS_EXT_BGRX  /* blue/green/red/x */
+    JCS_EXT_XBGR  /* x/blue/green/red */
+    JCS_EXT_XRGB  /* x/red/green/blue */
+    JCS_EXT_RGBA  /* red/green/blue/alpha */
+    JCS_EXT_BGRA  /* blue/green/red/alpha */
+    JCS_EXT_ABGR  /* alpha/blue/green/red */
+    JCS_EXT_ARGB  /* alpha/red/green/blue */
+
+Setting `cinfo.in_color_space` (compression) or `cinfo.out_color_space`
+(decompression) to one of these values will cause libjpeg-turbo to read the
+red, green, and blue values from (or write them to) the appropriate position in
+the pixel when compressing from/decompressing to an RGB buffer.
+
+Your application can check for the existence of these extensions at compile
+time with:
+
+    #ifdef JCS_EXTENSIONS
+
+At run time, attempting to use these extensions with a libjpeg implementation
+that does not support them will result in a "Bogus input colorspace" error.
+Applications can trap this error in order to test whether run-time support is
+available for the colorspace extensions.
+
+When using the RGBX, BGRX, XBGR, and XRGB colorspaces during decompression, the
+X byte is undefined, and in order to ensure the best performance, libjpeg-turbo
+can set that byte to whatever value it wishes.  If an application expects the X
+byte to be used as an alpha channel, then it should specify `JCS_EXT_RGBA`,
+`JCS_EXT_BGRA`, `JCS_EXT_ABGR`, or `JCS_EXT_ARGB`.  When these colorspace
+constants are used, the X byte is guaranteed to be 0xFF, which is interpreted
+as opaque.
+
+Your application can check for the existence of the alpha channel colorspace
+extensions at compile time with:
+
+    #ifdef JCS_ALPHA_EXTENSIONS
+
+[jcstest.c](jcstest.c), located in the libjpeg-turbo source tree, demonstrates
+how to check for the existence of the colorspace extensions at compile time and
+run time.
+
+libjpeg v7 and v8 API/ABI Emulation
+-----------------------------------
+
+With libjpeg v7 and v8, new features were added that necessitated extending the
+compression and decompression structures.  Unfortunately, due to the exposed
+nature of those structures, extending them also necessitated breaking backward
+ABI compatibility with previous libjpeg releases.  Thus, programs that were
+built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
+based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are not
+as widely used as v6b, enough programs (including a few Linux distros) made
+the switch that there was a demand to emulate the libjpeg v7 and v8 ABIs
+in libjpeg-turbo.  It should be noted, however, that this feature was added
+primarily so that applications that had already been compiled to use libjpeg
+v7+ could take advantage of accelerated baseline JPEG encoding/decoding
+without recompiling.  libjpeg-turbo does not claim to support all of the
+libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
+cases (see below.)
+
+By passing an argument of `--with-jpeg7` or `--with-jpeg8` to `configure`, or
+an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you can build a
+version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that
+programs that are built against libjpeg v7 or v8 can be run with libjpeg-turbo.
+The following section describes which libjpeg v7+ features are supported and
+which aren't.
+
+### Support for libjpeg v7 and v8 Features
+
+#### Fully supported
+
+- **libjpeg: IDCT scaling extensions in decompressor**
+  libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
+  1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
+  and 1/2 are SIMD-accelerated.)
+
+- **libjpeg: Arithmetic coding**
+
+- **libjpeg: In-memory source and destination managers**
+  See notes below.
+
+- **cjpeg: Separate quality settings for luminance and chrominance**
+  Note that the libjpeg v7+ API was extended to accommodate this feature only
+  for convenience purposes.  It has always been possible to implement this
+  feature with libjpeg v6b (see rdswitch.c for an example.)
+
+- **cjpeg: 32-bit BMP support**
+
+- **cjpeg: `-rgb` option**
+
+- **jpegtran: Lossless cropping**
+
+- **jpegtran: `-perfect` option**
+
+- **jpegtran: Forcing width/height when performing lossless crop**
+
+- **rdjpgcom: `-raw` option**
+
+- **rdjpgcom: Locale awareness**
+
+
+#### Not supported
+
+NOTE:  As of this writing, extensive research has been conducted into the
+usefulness of DCT scaling as a means of data reduction and SmartScale as a
+means of quality improvement.  The reader is invited to peruse the research at
+http://www.libjpeg-turbo.org/About/SmartScale and draw his/her own conclusions,
+but it is the general belief of our project that these features have not
+demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
+
+- **libjpeg: DCT scaling in compressor**
+  `cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
+  There is no technical reason why DCT scaling could not be supported when
+  emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
+  below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
+  8/9 would be available, which is of limited usefulness.
+
+- **libjpeg: SmartScale**
+  `cinfo.block_size` is silently ignored.
+  SmartScale is an extension to the JPEG format that allows for DCT block
+  sizes other than 8x8.  Providing support for this new format would be
+  feasible (particularly without full acceleration.)  However, until/unless
+  the format becomes either an official industry standard or, at minimum, an
+  accepted solution in the community, we are hesitant to implement it, as
+  there is no sense of whether or how it might change in the future.  It is
+  our belief that SmartScale has not demonstrated sufficient usefulness as a
+  lossless format nor as a means of quality enhancement, and thus our primary
+  interest in providing this feature would be as a means of supporting
+  additional DCT scaling factors.
+
+- **libjpeg: Fancy downsampling in compressor**
+  `cinfo.do_fancy_downsampling` is silently ignored.
+  This requires the DCT scaling feature, which is not supported.
+
+- **jpegtran: Scaling**
+  This requires both the DCT scaling and SmartScale features, which are not
+  supported.
+
+- **Lossless RGB JPEG files**
+  This requires the SmartScale feature, which is not supported.
+
+### What About libjpeg v9?
+
+libjpeg v9 introduced yet another field to the JPEG compression structure
+(`color_transform`), thus making the ABI backward incompatible with that of
+libjpeg v8.  This new field was introduced solely for the purpose of supporting
+lossless SmartScale encoding.  Furthermore, there was actually no reason to
+extend the API in this manner, as the color transform could have just as easily
+been activated by way of a new JPEG colorspace constant, thus preserving
+backward ABI compatibility.
+
+Our research (see link above) has shown that lossless SmartScale does not
+generally accomplish anything that can't already be accomplished better with
+existing, standard lossless formats.  Therefore, at this time it is our belief
+that there is not sufficient technical justification for software projects to
+upgrade from libjpeg v8 to libjpeg v9, and thus there is not sufficient
+technical justification for us to emulate the libjpeg v9 ABI.
+
+In-Memory Source/Destination Managers
+-------------------------------------
+
+By default, libjpeg-turbo 1.3 and later includes the `jpeg_mem_src()` and
+`jpeg_mem_dest()` functions, even when not emulating the libjpeg v8 API/ABI.
+Previously, it was necessary to build libjpeg-turbo from source with libjpeg v8
+API/ABI emulation in order to use the in-memory source/destination managers,
+but several projects requested that those functions be included when emulating
+the libjpeg v6b API/ABI as well.  This allows the use of those functions by
+programs that need them, without breaking ABI compatibility for programs that
+don't, and it allows those functions to be provided in the "official"
+libjpeg-turbo binaries.
+
+Those who are concerned about maintaining strict conformance with the libjpeg
+v6b or v7 API can pass an argument of `--without-mem-srcdst` to `configure` or
+an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to building
+libjpeg-turbo.  This will restore the pre-1.3 behavior, in which
+`jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the
+libjpeg v8 API/ABI.
+
+On Un*x systems, including the in-memory source/destination managers changes
+the dynamic library version from 62.0.0 to 62.1.0 if using libjpeg v6b API/ABI
+emulation and from 7.0.0 to 7.1.0 if using libjpeg v7 API/ABI emulation.
+
+Note that, on most Un*x systems, the dynamic linker will not look for a
+function in a library until that function is actually used.  Thus, if a program
+is built against libjpeg-turbo 1.3+ and uses `jpeg_mem_src()` or
+`jpeg_mem_dest()`, that program will not fail if run against an older version
+of libjpeg-turbo or against libjpeg v7- until the program actually tries to
+call `jpeg_mem_src()` or `jpeg_mem_dest()`.  Such is not the case on Windows.
+If a program is built against the libjpeg-turbo 1.3+ DLL and uses
+`jpeg_mem_src()` or `jpeg_mem_dest()`, then it must use the libjpeg-turbo 1.3+
+DLL at run time.
+
+Both cjpeg and djpeg have been extended to allow testing the in-memory
+source/destination manager functions.  See their respective man pages for more
+details.
+
+
+Mathematical Compatibility
+==========================
+
+For the most part, libjpeg-turbo should produce identical output to libjpeg
+v6b.  The one exception to this is when using the floating point DCT/IDCT, in
+which case the outputs of libjpeg v6b and libjpeg-turbo can differ for the
+following reasons:
+
+- The SSE/SSE2 floating point DCT implementation in libjpeg-turbo is ever so
+  slightly more accurate than the implementation in libjpeg v6b, but not by
+  any amount perceptible to human vision (generally in the range of 0.01 to
+  0.08 dB gain in PSNR.)
+
+- When not using the SIMD extensions, libjpeg-turbo uses the more accurate
+  (and slightly faster) floating point IDCT algorithm introduced in libjpeg
+  v8a as opposed to the algorithm used in libjpeg v6b.  It should be noted,
+  however, that this algorithm basically brings the accuracy of the floating
+  point IDCT in line with the accuracy of the slow integer IDCT.  The floating
+  point DCT/IDCT algorithms are mainly a legacy feature, and they do not
+  produce significantly more accuracy than the slow integer algorithms (to put
+  numbers on this, the typical difference in PSNR between the two algorithms
+  is less than 0.10 dB, whereas changing the quality level by 1 in the upper
+  range of the quality scale is typically more like a 1.0 dB difference.)
+
+- If the floating point algorithms in libjpeg-turbo are not implemented using
+  SIMD instructions on a particular platform, then the accuracy of the
+  floating point DCT/IDCT can depend on the compiler settings.
+
+While libjpeg-turbo does emulate the libjpeg v8 API/ABI, under the hood it is
+still using the same algorithms as libjpeg v6b, so there are several specific
+cases in which libjpeg-turbo cannot be expected to produce the same output as
+libjpeg v8:
+
+- When decompressing using scaling factors of 1/2 and 1/4, because libjpeg v8
+  implements those scaling algorithms differently than libjpeg v6b does, and
+  libjpeg-turbo's SIMD extensions are based on the libjpeg v6b behavior.
+
+- When using chrominance subsampling, because libjpeg v8 implements this
+  with its DCT/IDCT scaling algorithms rather than with a separate
+  downsampling/upsampling algorithm.  In our testing, the subsampled/upsampled
+  output of libjpeg v8 is less accurate than that of libjpeg v6b for this
+  reason.
+
+- When decompressing using a scaling factor > 1 and merged (AKA "non-fancy" or
+  "non-smooth") chrominance upsampling, because libjpeg v8 does not support
+  merged upsampling with scaling factors > 1.
+
+
+Performance Pitfalls
+====================
+
+Restart Markers
+---------------
+
+The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
+in a way that makes the rest of the libjpeg infrastructure happy, so it is
+necessary to use the slow Huffman decoder when decompressing a JPEG image that
+has restart markers.  This can cause the decompression performance to drop by
+as much as 20%, but the performance will still be much greater than that of
+libjpeg.  Many consumer packages, such as PhotoShop, use restart markers when
+generating JPEG images, so images generated by those programs will experience
+this issue.
+
+Fast Integer Forward DCT at High Quality Levels
+-----------------------------------------------
+
+The algorithm used by the SIMD-accelerated quantization function cannot produce
+correct results whenever the fast integer forward DCT is used along with a JPEG
+quality of 98-100.  Thus, libjpeg-turbo must use the non-SIMD quantization
+function in those cases.  This causes performance to drop by as much as 40%.
+It is therefore strongly advised that you use the slow integer forward DCT
+whenever encoding images with a JPEG quality of 98 or higher.
diff --git a/acinclude.m4 b/acinclude.m4
new file mode 100644
index 0000000..2c90762
--- /dev/null
+++ b/acinclude.m4
@@ -0,0 +1,254 @@
+# AC_PROG_NASM
+# --------------------------
+# Check that NASM exists and determine flags
+AC_DEFUN([AC_PROG_NASM],[
+
+AC_ARG_VAR(NASM, [NASM command (used to build the x86/x86-64 SIMD code)])
+if test "x$NASM" = "x"; then
+  AC_CHECK_PROGS(NASM, [nasm nasmw yasm])
+  test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
+fi
+
+AC_MSG_CHECKING([for object file format of host system])
+case "$host_os" in
+  cygwin* | mingw* | pw32* | interix*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='Win64-COFF'
+        ;;
+      *)
+        objfmt='Win32-COFF'
+        ;;
+    esac
+  ;;
+  msdosdjgpp* | go32*)
+    objfmt='COFF'
+  ;;
+  os2-emx*)			# not tested
+    objfmt='MSOMF'		# obj
+  ;;
+  linux*coff* | linux*oldld*)
+    objfmt='COFF'		# ???
+  ;;
+  linux*aout*)
+    objfmt='a.out'
+  ;;
+  linux*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='ELF64'
+        ;;
+      *)
+        objfmt='ELF'
+        ;;
+    esac
+  ;;
+  kfreebsd* | freebsd* | netbsd* | openbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+      objfmt='BSD-a.out'
+    else
+      case "$host_cpu" in
+        x86_64 | amd64)
+          objfmt='ELF64'
+          ;;
+        *)
+          objfmt='ELF'
+          ;;
+      esac
+    fi
+  ;;
+  solaris* | sunos* | sysv* | sco*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='ELF64'
+        ;;
+      *)
+        objfmt='ELF'
+        ;;
+    esac
+  ;;
+  darwin* | rhapsody* | nextstep* | openstep* | macos*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='Mach-O64'
+        ;;
+      *)
+        objfmt='Mach-O'
+        ;;
+    esac
+  ;;
+  *)
+    objfmt='ELF ?'
+  ;;
+esac
+
+AC_MSG_RESULT([$objfmt])
+if test "$objfmt" = 'ELF ?'; then
+  objfmt='ELF'
+  AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
+fi
+
+AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
+case "$objfmt" in
+  MSOMF)      NAFLAGS='-fobj -DOBJ32';;
+  Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+  Win64-COFF) NAFLAGS='-fwin64 -DWIN64 -D__x86_64__';;
+  COFF)       NAFLAGS='-fcoff -DCOFF';;
+  a.out)      NAFLAGS='-faout -DAOUT';;
+  BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
+  ELF)        NAFLAGS='-felf -DELF';;
+  ELF64)      NAFLAGS='-felf64 -DELF -D__x86_64__';;
+  RDF)        NAFLAGS='-frdf -DRDF';;
+  Mach-O)     NAFLAGS='-fmacho -DMACHO';;
+  Mach-O64)   NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';;
+esac
+AC_MSG_RESULT([$NAFLAGS])
+AC_SUBST([NAFLAGS])
+
+AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+        section .text
+        global  _main,main
+_main:
+main:   xor     eax,eax
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
+fi
+
+AC_MSG_CHECKING([whether the linker accepts assembler output])
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
+if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
+fi
+
+])
+
+# AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE
+# --------------------------
+# Test whether the assembler is suitable and supports NEON instructions
+AC_DEFUN([AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE],[
+  ac_good_gnu_arm_assembler=no
+  ac_save_CC="$CC"
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="$CCASFLAGS -x assembler-with-cpp"
+  CC="$CCAS"
+  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+    .text
+    .fpu neon
+    .arch armv7a
+    .object_arch armv4
+    .arm
+    pld [r0]
+    vmovn.u16 d0, q0]])], ac_good_gnu_arm_assembler=yes)
+
+  ac_use_gas_preprocessor=no
+  if test "x$ac_good_gnu_arm_assembler" = "xno" ; then
+    CC="gas-preprocessor.pl $CCAS"
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+      .text
+      .fpu neon
+      .arch armv7a
+      .object_arch armv4
+      .arm
+      pld [r0]
+      vmovn.u16 d0, q0]])], ac_use_gas_preprocessor=yes)
+  fi
+  CFLAGS="$ac_save_CFLAGS"
+  CC="$ac_save_CC"
+
+  if test "x$ac_use_gas_preprocessor" = "xyes" ; then
+    CCAS="gas-preprocessor.pl $CCAS"
+    AC_SUBST([CCAS])
+    ac_good_gnu_arm_assembler=yes
+  fi
+
+  if test "x$ac_good_gnu_arm_assembler" = "xyes" ; then
+    $1
+  else
+    $2
+  fi
+])
+
+# AC_CHECK_COMPATIBLE_MIPS_ASSEMBLER_IFELSE
+# --------------------------
+# Test whether the assembler is suitable and supports MIPS DSPr2 instructions
+AC_DEFUN([AC_CHECK_COMPATIBLE_MIPS_ASSEMBLER_IFELSE],[
+  have_mips_dspr2=no
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="$CCASFLAGS -mdspr2"
+
+  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+
+  int main ()
+  {
+    int c = 0, a = 0, b = 0;
+    __asm__ __volatile__ (
+        "precr.qb.ph %[c], %[a], %[b]          \n\t"
+        : [c] "=r" (c)
+        : [a] "r" (a), [b] "r" (b)
+    );
+    return c;
+  }
+  ]])], have_mips_dspr2=yes)
+  CFLAGS=$ac_save_CFLAGS
+
+  if test "x$have_mips_dspr2" = "xyes" ; then
+    $1
+  else
+    $2
+  fi
+])
+
+AC_DEFUN([AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE],[
+  ac_good_gnu_arm_assembler=no
+  ac_save_CC="$CC"
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="$CCASFLAGS -x assembler-with-cpp"
+  CC="$CCAS"
+  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+    .text
+    MYVAR .req x0
+    movi v0.16b, #100
+    mov MYVAR, #100
+    .unreq MYVAR]])], ac_good_gnu_arm_assembler=yes)
+
+  ac_use_gas_preprocessor=no
+  if test "x$ac_good_gnu_arm_assembler" = "xno" ; then
+    CC="gas-preprocessor.pl $CCAS"
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+      .text
+      MYVAR .req x0
+      movi v0.16b, #100
+      mov MYVAR, #100
+      .unreq MYVAR]])], ac_use_gas_preprocessor=yes)
+  fi
+  CFLAGS="$ac_save_CFLAGS"
+  CC="$ac_save_CC"
+
+  if test "x$ac_use_gas_preprocessor" = "xyes" ; then
+    CCAS="gas-preprocessor.pl $CCAS"
+    AC_SUBST([CCAS])
+    ac_good_gnu_arm_assembler=yes
+  fi
+
+  if test "x$ac_good_gnu_arm_assembler" = "xyes" ; then
+    $1
+  else
+    $2
+  fi
+])
diff --git a/bmp.c b/bmp.c
index fa4479d..9fcf7bb 100644
--- a/bmp.c
+++ b/bmp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011, 2015 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -75,30 +75,90 @@
 static void pixelconvert(unsigned char *srcbuf, int srcpf, int srcbottomup,
 	unsigned char *dstbuf, int dstpf, int dstbottomup, int w, int h)
 {
-	unsigned char *srcptr=srcbuf, *srcptr2;
+	unsigned char *srcrowptr=srcbuf, *srccolptr;
 	int srcps=tjPixelSize[srcpf];
 	int srcstride=srcbottomup? -w*srcps:w*srcps;
-	unsigned char *dstptr=dstbuf, *dstptr2;
+	unsigned char *dstrowptr=dstbuf, *dstcolptr;
 	int dstps=tjPixelSize[dstpf];
 	int dststride=dstbottomup? -w*dstps:w*dstps;
 	int row, col;
 
-	if(srcbottomup) srcptr=&srcbuf[w*srcps*(h-1)];
-	if(dstbottomup) dstptr=&dstbuf[w*dstps*(h-1)];
-	for(row=0; row<h; row++, srcptr+=srcstride, dstptr+=dststride)
+	if(srcbottomup) srcrowptr=&srcbuf[w*srcps*(h-1)];
+	if(dstbottomup) dstrowptr=&dstbuf[w*dstps*(h-1)];
+
+	/* NOTE: These quick & dirty CMYK<->RGB conversion routines are for testing
+	   purposes only.  Properly converting between CMYK and RGB requires a color
+	   management system. */
+
+	if(dstpf==TJPF_CMYK)
 	{
-		for(col=0, srcptr2=srcptr, dstptr2=dstptr; col<w; col++, srcptr2+=srcps,
-			dstptr2+=dstps)
+		for(row=0; row<h; row++, srcrowptr+=srcstride, dstrowptr+=dststride)
 		{
-			dstptr2[tjRedOffset[dstpf]]=srcptr2[tjRedOffset[srcpf]];
-			dstptr2[tjGreenOffset[dstpf]]=srcptr2[tjGreenOffset[srcpf]];
-			dstptr2[tjBlueOffset[dstpf]]=srcptr2[tjBlueOffset[srcpf]];
+			for(col=0, srccolptr=srcrowptr, dstcolptr=dstrowptr;
+				col<w; col++, srccolptr+=srcps)
+			{
+				double c=1.0-((double)(srccolptr[tjRedOffset[srcpf]])/255.);
+				double m=1.0-((double)(srccolptr[tjGreenOffset[srcpf]])/255.);
+				double y=1.0-((double)(srccolptr[tjBlueOffset[srcpf]])/255.);
+				double k=min(min(c,m),min(y,1.0));
+				if(k==1.0) c=m=y=0.0;
+				else
+				{
+					c=(c-k)/(1.0-k);
+					m=(m-k)/(1.0-k);
+					y=(y-k)/(1.0-k);
+				}
+				if(c>1.0) c=1.0;  if(c<0.) c=0.;
+				if(m>1.0) m=1.0;  if(m<0.) m=0.;
+				if(y>1.0) y=1.0;  if(y<0.) y=0.;
+				if(k>1.0) k=1.0;  if(k<0.) k=0.;
+				*dstcolptr++=(unsigned char)(255.0-c*255.0+0.5);
+				*dstcolptr++=(unsigned char)(255.0-m*255.0+0.5);
+				*dstcolptr++=(unsigned char)(255.0-y*255.0+0.5);
+				*dstcolptr++=(unsigned char)(255.0-k*255.0+0.5);
+			}
+		}
+	}
+	else if(srcpf==TJPF_CMYK)
+	{
+		for(row=0; row<h; row++, srcrowptr+=srcstride, dstrowptr+=dststride)
+		{
+			for(col=0, srccolptr=srcrowptr, dstcolptr=dstrowptr;
+				col<w; col++, dstcolptr+=dstps)
+			{
+				double c=(double)(*srccolptr++);
+				double m=(double)(*srccolptr++);
+				double y=(double)(*srccolptr++);
+				double k=(double)(*srccolptr++);
+				double r=c*k/255.;
+				double g=m*k/255.;
+				double b=y*k/255.;
+				if(r>255.0) r=255.0;  if(r<0.) r=0.;
+				if(g>255.0) g=255.0;  if(g<0.) g=0.;
+				if(b>255.0) b=255.0;  if(b<0.) b=0.;
+				dstcolptr[tjRedOffset[dstpf]]=(unsigned char)(r+0.5);
+				dstcolptr[tjGreenOffset[dstpf]]=(unsigned char)(g+0.5);
+				dstcolptr[tjBlueOffset[dstpf]]=(unsigned char)(b+0.5);
+			}
+		}
+	}
+	else
+	{
+		for(row=0; row<h; row++, srcrowptr+=srcstride, dstrowptr+=dststride)
+		{
+			for(col=0, srccolptr=srcrowptr, dstcolptr=dstrowptr;
+				col<w; col++, srccolptr+=srcps, dstcolptr+=dstps)
+			{
+				dstcolptr[tjRedOffset[dstpf]]=srccolptr[tjRedOffset[srcpf]];
+				dstcolptr[tjGreenOffset[dstpf]]=srccolptr[tjGreenOffset[srcpf]];
+				dstcolptr[tjBlueOffset[dstpf]]=srccolptr[tjBlueOffset[srcpf]];
+			}
 		}
 	}
 }
 
 
-int loadbmp(char *filename, unsigned char **buf, int *w, int *h, 
+int loadbmp(char *filename, unsigned char **buf, int *w, int *h,
 	int dstpf, int bottomup)
 {
 	int retval=0, dstps, srcpf, tempc;
diff --git a/cderror.h b/cderror.h
index e19c475..63de498 100644
--- a/cderror.h
+++ b/cderror.h
@@ -4,7 +4,8 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file defines the error and message codes for the cjpeg/djpeg
  * applications.  These strings are not needed as part of the JPEG library
@@ -33,7 +34,7 @@
 
 typedef enum {
 
-#define JMESSAGE(code,string)	code ,
+#define JMESSAGE(code,string)   code ,
 
 #endif /* JMAKE_ENUM_LIST */
 
@@ -62,7 +63,7 @@
 JMESSAGE(JERR_GIF_NOT, "Not a GIF file")
 JMESSAGE(JTRC_GIF, "%ux%ux%d GIF image")
 JMESSAGE(JTRC_GIF_BADVERSION,
-	 "Warning: unexpected GIF version number '%c%c%c'")
+         "Warning: unexpected GIF version number '%c%c%c'")
 JMESSAGE(JTRC_GIF_EXTENSION, "Ignoring GIF extension block of type 0x%02x")
 JMESSAGE(JTRC_GIF_NONSQUARE, "Caution: nonsquare pixels in input")
 JMESSAGE(JWRN_GIF_BADDATA, "Corrupt data in GIF file")
@@ -74,6 +75,7 @@
 #ifdef PPM_SUPPORTED
 JMESSAGE(JERR_PPM_COLORSPACE, "PPM output must be grayscale or RGB")
 JMESSAGE(JERR_PPM_NONNUMERIC, "Nonnumeric data in PPM file")
+JMESSAGE(JERR_PPM_TOOLARGE, "Integer value too large in PPM file")
 JMESSAGE(JERR_PPM_NOT, "Not a PPM/PGM file")
 JMESSAGE(JTRC_PGM, "%ux%u PGM image")
 JMESSAGE(JTRC_PGM_TEXT, "%ux%u text PGM image")
@@ -110,13 +112,13 @@
 #endif /* TARGA_SUPPORTED */
 
 JMESSAGE(JERR_BAD_CMAP_FILE,
-	 "Color map file is invalid or of unsupported format")
+         "Color map file is invalid or of unsupported format")
 JMESSAGE(JERR_TOO_MANY_COLORS,
-	 "Output file format cannot handle %d colormap entries")
+         "Output file format cannot handle %d colormap entries")
 JMESSAGE(JERR_UNGETC_FAILED, "ungetc failed")
 #ifdef TARGA_SUPPORTED
 JMESSAGE(JERR_UNKNOWN_FORMAT,
-	 "Unrecognized input file format --- perhaps you need -targa")
+         "Unrecognized input file format --- perhaps you need -targa")
 #else
 JMESSAGE(JERR_UNKNOWN_FORMAT, "Unrecognized input file format")
 #endif
diff --git a/cdjpeg.c b/cdjpeg.c
index b6250ff..441d671 100644
--- a/cdjpeg.c
+++ b/cdjpeg.c
@@ -1,60 +1,23 @@
 /*
  * cdjpeg.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains common support routines used by the IJG application
  * programs (cjpeg, djpeg, jpegtran).
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include <ctype.h>		/* to declare isupper(), tolower() */
-#ifdef NEED_SIGNAL_CATCHER
-#include <signal.h>		/* to declare signal() */
-#endif
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include <ctype.h>              /* to declare isupper(), tolower() */
 #ifdef USE_SETMODE
-#include <fcntl.h>		/* to declare setmode()'s parameter macros */
+#include <fcntl.h>              /* to declare setmode()'s parameter macros */
 /* If you have setmode() but not <io.h>, just delete this line: */
-#include <io.h>			/* to declare setmode() */
-#endif
-
-
-/*
- * Signal catcher to ensure that temporary files are removed before aborting.
- * NB: for Amiga Manx C this is actually a global routine named _abort();
- * we put "#define signal_catcher _abort" in jconfig.h.  Talk about bogus...
- */
-
-#ifdef NEED_SIGNAL_CATCHER
-
-static j_common_ptr sig_cinfo;
-
-void				/* must be global for Manx C */
-signal_catcher (int signum)
-{
-  if (sig_cinfo != NULL) {
-    if (sig_cinfo->err != NULL) /* turn off trace output */
-      sig_cinfo->err->trace_level = 0;
-    jpeg_destroy(sig_cinfo);	/* clean up memory allocation & temp files */
-  }
-  exit(EXIT_FAILURE);
-}
-
-
-GLOBAL(void)
-enable_signal_catcher (j_common_ptr cinfo)
-{
-  sig_cinfo = cinfo;
-#ifdef SIGINT			/* not all systems have SIGINT */
-  signal(SIGINT, signal_catcher);
-#endif
-#ifdef SIGTERM			/* not all systems have SIGTERM */
-  signal(SIGTERM, signal_catcher);
-#endif
-}
-
+#include <io.h>                 /* to declare setmode() */
 #endif
 
 
@@ -75,8 +38,8 @@
     prog->percent_done = percent_done;
     if (total_passes > 1) {
       fprintf(stderr, "\rPass %d/%d: %3d%% ",
-	      prog->pub.completed_passes + prog->completed_extra_passes + 1,
-	      total_passes, percent_done);
+              prog->pub.completed_passes + prog->completed_extra_passes + 1,
+              total_passes, percent_done);
     } else {
       fprintf(stderr, "\r %3d%% ", percent_done);
     }
@@ -119,24 +82,24 @@
  */
 
 GLOBAL(boolean)
-keymatch (char * arg, const char * keyword, int minchars)
+keymatch (char *arg, const char *keyword, int minchars)
 {
   register int ca, ck;
   register int nmatched = 0;
 
   while ((ca = *arg++) != '\0') {
     if ((ck = *keyword++) == '\0')
-      return FALSE;		/* arg longer than keyword, no good */
-    if (isupper(ca))		/* force arg to lcase (assume ck is already) */
+      return FALSE;             /* arg longer than keyword, no good */
+    if (isupper(ca))            /* force arg to lcase (assume ck is already) */
       ca = tolower(ca);
     if (ca != ck)
-      return FALSE;		/* no good */
-    nmatched++;			/* count matched characters */
+      return FALSE;             /* no good */
+    nmatched++;                 /* count matched characters */
   }
   /* reached end of argument; fail if it's too short for unique abbrev */
   if (nmatched < minchars)
     return FALSE;
-  return TRUE;			/* A-OK */
+  return TRUE;                  /* A-OK */
 }
 
 
@@ -150,10 +113,10 @@
 {
   FILE * input_file = stdin;
 
-#ifdef USE_SETMODE		/* need to hack file mode? */
+#ifdef USE_SETMODE              /* need to hack file mode? */
   setmode(fileno(stdin), O_BINARY);
 #endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
+#ifdef USE_FDOPEN               /* need to re-open in binary mode? */
   if ((input_file = fdopen(fileno(stdin), READ_BINARY)) == NULL) {
     fprintf(stderr, "Cannot reopen stdin\n");
     exit(EXIT_FAILURE);
@@ -168,10 +131,10 @@
 {
   FILE * output_file = stdout;
 
-#ifdef USE_SETMODE		/* need to hack file mode? */
+#ifdef USE_SETMODE              /* need to hack file mode? */
   setmode(fileno(stdout), O_BINARY);
 #endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
+#ifdef USE_FDOPEN               /* need to re-open in binary mode? */
   if ((output_file = fdopen(fileno(stdout), WRITE_BINARY)) == NULL) {
     fprintf(stderr, "Cannot reopen stdout\n");
     exit(EXIT_FAILURE);
diff --git a/cdjpeg.h b/cdjpeg.h
index ed024ac..a65310e 100644
--- a/cdjpeg.h
+++ b/cdjpeg.h
@@ -1,35 +1,35 @@
 /*
  * cdjpeg.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains common declarations for the sample applications
  * cjpeg and djpeg.  It is NOT used by the core JPEG library.
  */
 
-#define JPEG_CJPEG_DJPEG	/* define proper options in jconfig.h */
-#define JPEG_INTERNAL_OPTIONS	/* cjpeg.c,djpeg.c need to see xxx_SUPPORTED */
+#define JPEG_CJPEG_DJPEG        /* define proper options in jconfig.h */
+#define JPEG_INTERNAL_OPTIONS   /* cjpeg.c,djpeg.c need to see xxx_SUPPORTED */
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jerror.h"		/* get library error codes too */
-#include "cderror.h"		/* get application-specific error codes */
+#include "jerror.h"             /* get library error codes too */
+#include "cderror.h"            /* get application-specific error codes */
 
 
 /*
  * Object interface for cjpeg's source file decoding modules
  */
 
-typedef struct cjpeg_source_struct * cjpeg_source_ptr;
+typedef struct cjpeg_source_struct *cjpeg_source_ptr;
 
 struct cjpeg_source_struct {
-  JMETHOD(void, start_input, (j_compress_ptr cinfo,
-			      cjpeg_source_ptr sinfo));
-  JMETHOD(JDIMENSION, get_pixel_rows, (j_compress_ptr cinfo,
-				       cjpeg_source_ptr sinfo));
-  JMETHOD(void, finish_input, (j_compress_ptr cinfo,
-			       cjpeg_source_ptr sinfo));
+  void (*start_input) (j_compress_ptr cinfo, cjpeg_source_ptr sinfo);
+  JDIMENSION (*get_pixel_rows) (j_compress_ptr cinfo, cjpeg_source_ptr sinfo);
+  void (*finish_input) (j_compress_ptr cinfo, cjpeg_source_ptr sinfo);
 
   FILE *input_file;
 
@@ -42,24 +42,21 @@
  * Object interface for djpeg's output file encoding modules
  */
 
-typedef struct djpeg_dest_struct * djpeg_dest_ptr;
+typedef struct djpeg_dest_struct *djpeg_dest_ptr;
 
 struct djpeg_dest_struct {
   /* start_output is called after jpeg_start_decompress finishes.
    * The color map will be ready at this time, if one is needed.
    */
-  JMETHOD(void, start_output, (j_decompress_ptr cinfo,
-			       djpeg_dest_ptr dinfo));
+  void (*start_output) (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo);
   /* Emit the specified number of pixel rows from the buffer. */
-  JMETHOD(void, put_pixel_rows, (j_decompress_ptr cinfo,
-				 djpeg_dest_ptr dinfo,
-				 JDIMENSION rows_supplied));
+  void (*put_pixel_rows) (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                          JDIMENSION rows_supplied);
   /* Finish up at the end of the image. */
-  JMETHOD(void, finish_output, (j_decompress_ptr cinfo,
-				djpeg_dest_ptr dinfo));
+  void (*finish_output) (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo);
 
   /* Target file spec; filled in by djpeg.c after object is created. */
-  FILE * output_file;
+  FILE *output_file;
 
   /* Output pixel-row buffer.  Created by module init or start_output.
    * Width is cinfo->output_width * cinfo->output_components;
@@ -79,109 +76,70 @@
  */
 
 struct cdjpeg_progress_mgr {
-  struct jpeg_progress_mgr pub;	/* fields known to JPEG library */
-  int completed_extra_passes;	/* extra passes completed */
-  int total_extra_passes;	/* total extra */
+  struct jpeg_progress_mgr pub; /* fields known to JPEG library */
+  int completed_extra_passes;   /* extra passes completed */
+  int total_extra_passes;       /* total extra */
   /* last printed percentage stored here to avoid multiple printouts */
   int percent_done;
 };
 
-typedef struct cdjpeg_progress_mgr * cd_progress_ptr;
+typedef struct cdjpeg_progress_mgr *cd_progress_ptr;
 
 
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jinit_read_bmp		jIRdBMP
-#define jinit_write_bmp		jIWrBMP
-#define jinit_read_gif		jIRdGIF
-#define jinit_write_gif		jIWrGIF
-#define jinit_read_ppm		jIRdPPM
-#define jinit_write_ppm		jIWrPPM
-#define jinit_read_rle		jIRdRLE
-#define jinit_write_rle		jIWrRLE
-#define jinit_read_targa	jIRdTarga
-#define jinit_write_targa	jIWrTarga
-#define read_quant_tables	RdQTables
-#define read_scan_script	RdScnScript
-#define set_quality_ratings     SetQRates
-#define set_quant_slots		SetQSlots
-#define set_sample_factors	SetSFacts
-#define read_color_map		RdCMap
-#define enable_signal_catcher	EnSigCatcher
-#define start_progress_monitor	StProgMon
-#define end_progress_monitor	EnProgMon
-#define read_stdin		RdStdin
-#define write_stdout		WrStdout
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
 /* Module selection routines for I/O modules. */
 
-EXTERN(cjpeg_source_ptr) jinit_read_bmp JPP((j_compress_ptr cinfo));
-EXTERN(djpeg_dest_ptr) jinit_write_bmp JPP((j_decompress_ptr cinfo,
-					    boolean is_os2));
-EXTERN(cjpeg_source_ptr) jinit_read_gif JPP((j_compress_ptr cinfo));
-EXTERN(djpeg_dest_ptr) jinit_write_gif JPP((j_decompress_ptr cinfo));
-EXTERN(cjpeg_source_ptr) jinit_read_ppm JPP((j_compress_ptr cinfo));
-EXTERN(djpeg_dest_ptr) jinit_write_ppm JPP((j_decompress_ptr cinfo));
-EXTERN(cjpeg_source_ptr) jinit_read_rle JPP((j_compress_ptr cinfo));
-EXTERN(djpeg_dest_ptr) jinit_write_rle JPP((j_decompress_ptr cinfo));
-EXTERN(cjpeg_source_ptr) jinit_read_targa JPP((j_compress_ptr cinfo));
-EXTERN(djpeg_dest_ptr) jinit_write_targa JPP((j_decompress_ptr cinfo));
+EXTERN(cjpeg_source_ptr) jinit_read_bmp (j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_bmp (j_decompress_ptr cinfo,
+                                        boolean is_os2);
+EXTERN(cjpeg_source_ptr) jinit_read_gif (j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_gif (j_decompress_ptr cinfo);
+EXTERN(cjpeg_source_ptr) jinit_read_ppm (j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_ppm (j_decompress_ptr cinfo);
+EXTERN(cjpeg_source_ptr) jinit_read_rle (j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_rle (j_decompress_ptr cinfo);
+EXTERN(cjpeg_source_ptr) jinit_read_targa (j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_targa (j_decompress_ptr cinfo);
 
 /* cjpeg support routines (in rdswitch.c) */
 
-EXTERN(boolean) read_quant_tables JPP((j_compress_ptr cinfo, char * filename,
-				       boolean force_baseline));
-EXTERN(boolean) read_scan_script JPP((j_compress_ptr cinfo, char * filename));
-EXTERN(boolean) set_quality_ratings JPP((j_compress_ptr cinfo, char *arg,
-					 boolean force_baseline));
-EXTERN(boolean) set_quant_slots JPP((j_compress_ptr cinfo, char *arg));
-EXTERN(boolean) set_sample_factors JPP((j_compress_ptr cinfo, char *arg));
+EXTERN(boolean) read_quant_tables (j_compress_ptr cinfo, char *filename,
+                                   boolean force_baseline);
+EXTERN(boolean) read_scan_script (j_compress_ptr cinfo, char *filename);
+EXTERN(boolean) set_quality_ratings (j_compress_ptr cinfo, char *arg,
+                                     boolean force_baseline);
+EXTERN(boolean) set_quant_slots (j_compress_ptr cinfo, char *arg);
+EXTERN(boolean) set_sample_factors (j_compress_ptr cinfo, char *arg);
 
 /* djpeg support routines (in rdcolmap.c) */
 
-EXTERN(void) read_color_map JPP((j_decompress_ptr cinfo, FILE * infile));
+EXTERN(void) read_color_map (j_decompress_ptr cinfo, FILE *infile);
 
 /* common support routines (in cdjpeg.c) */
 
-EXTERN(void) enable_signal_catcher JPP((j_common_ptr cinfo));
-EXTERN(void) start_progress_monitor JPP((j_common_ptr cinfo,
-					 cd_progress_ptr progress));
-EXTERN(void) end_progress_monitor JPP((j_common_ptr cinfo));
-EXTERN(boolean) keymatch JPP((char * arg, const char * keyword, int minchars));
-EXTERN(FILE *) read_stdin JPP((void));
-EXTERN(FILE *) write_stdout JPP((void));
+EXTERN(void) enable_signal_catcher (j_common_ptr cinfo);
+EXTERN(void) start_progress_monitor (j_common_ptr cinfo,
+                                     cd_progress_ptr progress);
+EXTERN(void) end_progress_monitor (j_common_ptr cinfo);
+EXTERN(boolean) keymatch (char *arg, const char *keyword, int minchars);
+EXTERN(FILE *) read_stdin (void);
+EXTERN(FILE *) write_stdout (void);
 
 /* miscellaneous useful macros */
 
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#define WRITE_BINARY	"w"
+#ifdef DONT_USE_B_MODE          /* define mode parameters for fopen() */
+#define READ_BINARY     "r"
+#define WRITE_BINARY    "w"
 #else
-#ifdef VMS			/* VMS is very nonstandard */
-#define READ_BINARY	"rb", "ctx=stm"
-#define WRITE_BINARY	"wb", "ctx=stm"
-#else				/* standard ANSI-compliant case */
-#define READ_BINARY	"rb"
-#define WRITE_BINARY	"wb"
-#endif
+#define READ_BINARY     "rb"
+#define WRITE_BINARY    "wb"
 #endif
 
-#ifndef EXIT_FAILURE		/* define exit() codes if not provided */
+#ifndef EXIT_FAILURE            /* define exit() codes if not provided */
 #define EXIT_FAILURE  1
 #endif
 #ifndef EXIT_SUCCESS
-#ifdef VMS
-#define EXIT_SUCCESS  1		/* VMS is very nonstandard */
-#else
 #define EXIT_SUCCESS  0
 #endif
-#endif
 #ifndef EXIT_WARNING
-#ifdef VMS
-#define EXIT_WARNING  1		/* VMS is very nonstandard */
-#else
 #define EXIT_WARNING  2
 #endif
-#endif
diff --git a/change.log b/change.log
index b60ddd6..563648a 100644
--- a/change.log
+++ b/change.log
@@ -4,6 +4,23 @@
 CHANGE LOG for Independent JPEG Group's JPEG software
 
 
+Version 9b  17-Jan-2016
+-----------------------
+
+Document 'f' specifier for jpegtran -crop specification.
+Thanks to Michele Martone for the suggestion.
+
+
+Version 9  13-Jan-2013
+----------------------
+
+Add remark for jpeg_mem_dest() in jdatadst.c.
+Thanks to Elie-Gregoire Khoury for the hint.
+
+Correct argument type in format string, avoid compiler warnings.
+Thanks to Vincent Torri for the hint.
+
+
 Version 8d  15-Jan-2012
 -----------------------
 
diff --git a/cjpeg.1 b/cjpeg.1
new file mode 100644
index 0000000..d1dc304
--- /dev/null
+++ b/cjpeg.1
@@ -0,0 +1,351 @@
+.TH CJPEG 1 "17 February 2016"
+.SH NAME
+cjpeg \- compress an image file to a JPEG file
+.SH SYNOPSIS
+.B cjpeg
+[
+.I options
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B cjpeg
+compresses the named image file, or the standard input if no file is
+named, and produces a JPEG/JFIF file on the standard output.
+The currently supported input file formats are: PPM (PBMPLUS color
+format), PGM (PBMPLUS grayscale format), BMP, Targa, and RLE (Utah Raster
+Toolkit format).  (RLE is supported only if the URT library is available.)
+.SH OPTIONS
+All switch names may be abbreviated; for example,
+.B \-grayscale
+may be written
+.B \-gray
+or
+.BR \-gr .
+Most of the "basic" switches can be abbreviated to as little as one letter.
+Upper and lower case are equivalent (thus
+.B \-BMP
+is the same as
+.BR \-bmp ).
+British spellings are also accepted (e.g.,
+.BR \-greyscale ),
+though for brevity these are not mentioned below.
+.PP
+The basic switches are:
+.TP
+.BI \-quality " N[,...]"
+Scale quantization tables to adjust image quality.  Quality is 0 (worst) to
+100 (best); default is 75.  (See below for more info.)
+.TP
+.B \-grayscale
+Create monochrome JPEG file from color input.  Be sure to use this switch when
+compressing a grayscale BMP file, because
+.B cjpeg
+isn't bright enough to notice whether a BMP file uses only shades of gray.
+By saying
+.BR \-grayscale ,
+you'll get a smaller JPEG file that takes less time to process.
+.TP
+.B \-rgb
+Create RGB JPEG file.
+Using this switch suppresses the conversion from RGB
+colorspace input to the default YCbCr JPEG colorspace.
+.TP
+.B \-optimize
+Perform optimization of entropy encoding parameters.  Without this, default
+encoding parameters are used.
+.B \-optimize
+usually makes the JPEG file a little smaller, but
+.B cjpeg
+runs somewhat slower and needs much more memory.  Image quality and speed of
+decompression are unaffected by
+.BR \-optimize .
+.TP
+.B \-progressive
+Create progressive JPEG file (see below).
+.TP
+.B \-targa
+Input file is Targa format.  Targa files that contain an "identification"
+field will not be automatically recognized by
+.BR cjpeg ;
+for such files you must specify
+.B \-targa
+to make
+.B cjpeg
+treat the input as Targa format.
+For most Targa files, you won't need this switch.
+.PP
+The
+.B \-quality
+switch lets you trade off compressed file size against quality of the
+reconstructed image: the higher the quality setting, the larger the JPEG file,
+and the closer the output image will be to the original input.  Normally you
+want to use the lowest quality setting (smallest file) that decompresses into
+something visually indistinguishable from the original image.  For this
+purpose the quality setting should generally be between 50 and 95 (the default
+is 75) for photographic images.  If you see defects at
+.B \-quality
+75, then go up 5 or 10 counts at a time until you are happy with the output
+image.  (The optimal setting will vary from one image to another.)
+.PP
+.B \-quality
+100 will generate a quantization table of all 1's, minimizing loss in the
+quantization step (but there is still information loss in subsampling, as well
+as roundoff error.)  For most images, specifying a quality value above
+about 95 will increase the size of the compressed file dramatically, and while
+the quality gain from these higher quality values is measurable (using metrics
+such as PSNR or SSIM), it is rarely perceivable by human vision.
+.PP
+In the other direction, quality values below 50 will produce very small files
+of low image quality.  Settings around 5 to 10 might be useful in preparing an
+index of a large image library, for example.  Try
+.B \-quality
+2 (or so) for some amusing Cubist effects.  (Note: quality
+values below about 25 generate 2-byte quantization tables, which are
+considered optional in the JPEG standard.
+.B cjpeg
+emits a warning message when you give such a quality value, because some
+other JPEG programs may be unable to decode the resulting file.  Use
+.B \-baseline
+if you need to ensure compatibility at low quality values.)
+.PP
+The \fB\-quality\fR option has been extended in this version of \fBcjpeg\fR to
+support separate quality settings for luminance and chrominance (or, in
+general, separate settings for every quantization table slot.)  The principle
+is the same as chrominance subsampling:  since the human eye is more sensitive
+to spatial changes in brightness than spatial changes in color, the chrominance
+components can be quantized more than the luminance components without
+incurring any visible image quality loss.  However, unlike subsampling, this
+feature reduces data in the frequency domain instead of the spatial domain,
+which allows for more fine-grained control.  This option is useful in
+quality-sensitive applications, for which the artifacts generated by
+subsampling may be unacceptable.
+.PP
+The \fB\-quality\fR option accepts a comma-separated list of parameters, which
+respectively refer to the quality levels that should be assigned to the
+quantization table slots.  If there are more q-table slots than parameters,
+then the last parameter is replicated.  Thus, if only one quality parameter is
+given, this is used for both luminance and chrominance (slots 0 and 1,
+respectively), preserving the legacy behavior of cjpeg v6b and prior.
+More (or customized) quantization tables can be set with the \fB\-qtables\fR
+option and assigned to components with the \fB\-qslots\fR option (see the
+"wizard" switches below.)
+.PP
+JPEG files generated with separate luminance and chrominance quality are fully
+compliant with standard JPEG decoders.
+.PP
+.BR CAUTION:
+For this setting to be useful, be sure to pass an argument of \fB\-sample 1x1\fR
+to \fBcjpeg\fR to disable chrominance subsampling.  Otherwise, the default
+subsampling level (2x2, AKA "4:2:0") will be used.
+.PP
+The
+.B \-progressive
+switch creates a "progressive JPEG" file.  In this type of JPEG file, the data
+is stored in multiple scans of increasing quality.  If the file is being
+transmitted over a slow communications link, the decoder can use the first
+scan to display a low-quality image very quickly, and can then improve the
+display with each subsequent scan.  The final image is exactly equivalent to a
+standard JPEG file of the same quality setting, and the total file size is
+about the same --- often a little smaller.
+.PP
+Switches for advanced users:
+.TP
+.B \-arithmetic
+Use arithmetic coding.
+.B Caution:
+arithmetic coded JPEG is not yet widely implemented, so many decoders will be
+unable to view an arithmetic coded JPEG file at all.
+.TP
+.B \-dct int
+Use integer DCT method (default).
+.TP
+.B \-dct fast
+Use fast integer DCT (less accurate).
+In libjpeg-turbo, the fast method is generally about 5-15% faster than the int
+method when using the x86/x86-64 SIMD extensions (results may vary with other
+SIMD implementations, or when using libjpeg-turbo without SIMD extensions.)
+For quality levels of 90 and below, there should be little or no perceptible
+difference between the two algorithms.  For quality levels above 90, however,
+the difference between the fast and the int methods becomes more pronounced.
+With quality=97, for instance, the fast method incurs generally about a 1-3 dB
+loss (in PSNR) relative to the int method, but this can be larger for some
+images.  Do not use the fast method with quality levels above 97.  The
+algorithm often degenerates at quality=98 and above and can actually produce a
+more lossy image than if lower quality levels had been used.  Also, in
+libjpeg-turbo, the fast method is not fully accelerated for quality levels
+above 97, so it will be slower than the int method.
+.TP
+.B \-dct float
+Use floating-point DCT method.
+The float method is mainly a legacy feature.  It does not produce significantly
+more accurate results than the int method, and it is much slower.  The float
+method may also give different results on different machines due to varying
+roundoff behavior, whereas the integer methods should give the same results on
+all machines.
+.TP
+.BI \-restart " N"
+Emit a JPEG restart marker every N MCU rows, or every N MCU blocks if "B" is
+attached to the number.
+.B \-restart 0
+(the default) means no restart markers.
+.TP
+.BI \-smooth " N"
+Smooth the input image to eliminate dithering noise.  N, ranging from 1 to
+100, indicates the strength of smoothing.  0 (the default) means no smoothing.
+.TP
+.BI \-maxmemory " N"
+Set limit for amount of memory to use in processing large images.  Value is
+in thousands of bytes, or millions of bytes if "M" is attached to the
+number.  For example,
+.B \-max 4m
+selects 4000000 bytes.  If more space is needed, temporary files will be used.
+.TP
+.BI \-outfile " name"
+Send output image to the named file, not to standard output.
+.TP
+.B \-memdst
+Compress to memory instead of a file.  This feature was implemented mainly as a
+way of testing the in-memory destination manager (jpeg_mem_dest()), but it is
+also useful for benchmarking, since it reduces the I/O overhead.
+.TP
+.B \-verbose
+Enable debug printout.  More
+.BR \-v 's
+give more output.  Also, version information is printed at startup.
+.TP
+.B \-debug
+Same as
+.BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
+.PP
+The
+.B \-restart
+option inserts extra markers that allow a JPEG decoder to resynchronize after
+a transmission error.  Without restart markers, any damage to a compressed
+file will usually ruin the image from the point of the error to the end of the
+image; with restart markers, the damage is usually confined to the portion of
+the image up to the next restart marker.  Of course, the restart markers
+occupy extra space.  We recommend
+.B \-restart 1
+for images that will be transmitted across unreliable networks such as Usenet.
+.PP
+The
+.B \-smooth
+option filters the input to eliminate fine-scale noise.  This is often useful
+when converting dithered images to JPEG: a moderate smoothing factor of 10 to
+50 gets rid of dithering patterns in the input file, resulting in a smaller
+JPEG file and a better-looking image.  Too large a smoothing factor will
+visibly blur the image, however.
+.PP
+Switches for wizards:
+.TP
+.B \-baseline
+Force baseline-compatible quantization tables to be generated.  This clamps
+quantization values to 8 bits even at low quality settings.  (This switch is
+poorly named, since it does not ensure that the output is actually baseline
+JPEG.  For example, you can use
+.B \-baseline
+and
+.B \-progressive
+together.)
+.TP
+.BI \-qtables " file"
+Use the quantization tables given in the specified text file.
+.TP
+.BI \-qslots " N[,...]"
+Select which quantization table to use for each color component.
+.TP
+.BI \-sample " HxV[,...]"
+Set JPEG sampling factors for each color component.
+.TP
+.BI \-scans " file"
+Use the scan script given in the specified text file.
+.PP
+The "wizard" switches are intended for experimentation with JPEG.  If you
+don't know what you are doing, \fBdon't use them\fR.  These switches are
+documented further in the file wizard.txt.
+.SH EXAMPLES
+.LP
+This example compresses the PPM file foo.ppm with a quality factor of
+60 and saves the output as foo.jpg:
+.IP
+.B cjpeg \-quality
+.I 60 foo.ppm
+.B >
+.I foo.jpg
+.SH HINTS
+Color GIF files are not the ideal input for JPEG; JPEG is really intended for
+compressing full-color (24-bit) images.  In particular, don't try to convert
+cartoons, line drawings, and other images that have only a few distinct
+colors.  GIF works great on these, JPEG does not.  If you want to convert a
+GIF to JPEG, you should experiment with
+.BR cjpeg 's
+.B \-quality
+and
+.B \-smooth
+options to get a satisfactory conversion.
+.B \-smooth 10
+or so is often helpful.
+.PP
+Avoid running an image through a series of JPEG compression/decompression
+cycles.  Image quality loss will accumulate; after ten or so cycles the image
+may be noticeably worse than it was after one cycle.  It's best to use a
+lossless format while manipulating an image, then convert to JPEG format when
+you are ready to file the image away.
+.PP
+The
+.B \-optimize
+option to
+.B cjpeg
+is worth using when you are making a "final" version for posting or archiving.
+It's also a win when you are using low quality settings to make very small
+JPEG files; the percentage improvement is often a lot more than it is on
+larger files.  (At present,
+.B \-optimize
+mode is always selected when generating progressive JPEG files.)
+.SH ENVIRONMENT
+.TP
+.B JPEGMEM
+If this environment variable is set, its value is the default memory limit.
+The value is specified as described for the
+.B \-maxmemory
+switch.
+.B JPEGMEM
+overrides the default value specified when the program was compiled, and
+itself is overridden by an explicit
+.BR \-maxmemory .
+.SH SEE ALSO
+.BR djpeg (1),
+.BR jpegtran (1),
+.BR rdjpgcom (1),
+.BR wrjpgcom (1)
+.br
+.BR ppm (5),
+.BR pgm (5)
+.br
+Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
+.SH AUTHOR
+Independent JPEG Group
+.PP
+This file was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
+features not present in libjpeg.
+.SH ISSUES
+Support for GIF input files was removed in cjpeg v6b due to concerns over
+the Unisys LZW patent.  Although this patent expired in 2006, cjpeg still
+lacks GIF support, for these historical reasons.  (Conversion of GIF files to
+JPEG is usually a bad idea anyway, since GIF is a 256-color format.)
+.PP
+Not all variants of BMP and Targa file formats are supported.
+.PP
+The
+.B \-targa
+switch is not a bug, it's a feature.  (It would be a bug if the Targa format
+designers had not been clueless.)
diff --git a/cjpeg.c b/cjpeg.c
index 0c23fe7..713224f 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -5,16 +5,17 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2013, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2013-2014, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a command-line user interface for the JPEG compressor.
  * It should work on any system with Unix- or MS-DOS-style command lines.
  *
  * Two different command line styles are permitted, depending on the
  * compile-time switch TWO_FILE_COMMANDLINE:
- *	cjpeg [options]  inputfile outputfile
- *	cjpeg [options]  [inputfile]
+ *      cjpeg [options]  inputfile outputfile
+ *      cjpeg [options]  [inputfile]
  * In the second style, output is always to standard output, which you'd
  * normally redirect to a file or pipe to some other program.  Input is
  * either from a named file or from standard input (typically redirected).
@@ -22,28 +23,28 @@
  * don't support pipes.  Also, you MUST use the first style if your system
  * doesn't do binary I/O to stdin/stdout.
  * To simplify script writing, the "-outfile" switch is provided.  The syntax
- *	cjpeg [options]  -outfile outputfile  inputfile
+ *      cjpeg [options]  -outfile outputfile  inputfile
  * works regardless of which command line style is used.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include "jversion.h"		/* for version message */
-#include "config.h"
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"           /* for version message */
+#include "jconfigint.h"
 
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef USE_CCOMMAND             /* command-line reader for Macintosh */
 #ifdef __MWERKS__
 #include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
+#include <console.h>            /* ... and this */
 #endif
 #ifdef THINK_C
-#include <console.h>		/* Think declares it here */
+#include <console.h>            /* Think declares it here */
 #endif
 #endif
 
 
 /* Create the add-on message string table. */
 
-#define JMESSAGE(code,string)	string ,
+#define JMESSAGE(code,string)   string ,
 
 static const char * const cdjpeg_message_table[] = {
 #include "cderror.h"
@@ -77,11 +78,11 @@
  * seldom-used ID field), so we provide a switch to force Targa input mode.
  */
 
-static boolean is_targa;	/* records user -targa switch */
+static boolean is_targa;        /* records user -targa switch */
 
 
 LOCAL(cjpeg_source_ptr)
-select_file_type (j_compress_ptr cinfo, FILE * infile)
+select_file_type (j_compress_ptr cinfo, FILE *infile)
 {
   int c;
 
@@ -124,7 +125,7 @@
     break;
   }
 
-  return NULL;			/* suppress compiler warnings */
+  return NULL;                  /* suppress compiler warnings */
 }
 
 
@@ -137,9 +138,9 @@
  */
 
 
-static const char * progname;	/* program name for error messages */
-static char * outfilename;	/* for -outfile switch */
-boolean memdst;  /* for -memdst switch */
+static const char *progname;    /* program name for error messages */
+static char *outfilename;       /* for -outfile switch */
+boolean memdst;                 /* for -memdst switch */
 
 
 LOCAL(void)
@@ -154,7 +155,8 @@
 #endif
 
   fprintf(stderr, "Switches (names may be abbreviated):\n");
-  fprintf(stderr, "  -quality N[,...]   Compression quality (0..100; 5-95 is useful range)\n");
+  fprintf(stderr, "  -quality N[,...]   Compression quality (0..100; 5-95 is most useful range,\n");
+  fprintf(stderr, "                     default is 75)\n");
   fprintf(stderr, "  -grayscale     Create monochrome JPEG file\n");
   fprintf(stderr, "  -rgb           Create RGB JPEG file\n");
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -172,15 +174,15 @@
 #endif
 #ifdef DCT_ISLOW_SUPPORTED
   fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
+          (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
 #endif
 #ifdef DCT_IFAST_SUPPORTED
   fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
-	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
+          (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
   fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
+          (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
 #endif
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
 #ifdef INPUT_SMOOTHING_SUPPORTED
@@ -192,6 +194,7 @@
   fprintf(stderr, "  -memdst        Compress to memory instead of file (useful for benchmarking)\n");
 #endif
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
   fprintf(stderr, "  -baseline      Force baseline quantization tables\n");
   fprintf(stderr, "  -qtables file  Use quantization tables given in file\n");
@@ -206,7 +209,7 @@
 
 LOCAL(int)
 parse_switches (j_compress_ptr cinfo, int argc, char **argv,
-		int last_file_arg_seen, boolean for_real)
+                int last_file_arg_seen, boolean for_real)
 /* Parse optional switches.
  * Returns argv[] index of first file-name argument (== argc if none).
  * Any file names with indexes <= last_file_arg_seen are ignored;
@@ -217,18 +220,18 @@
  */
 {
   int argn;
-  char * arg;
+  char *arg;
   boolean force_baseline;
   boolean simple_progressive;
-  char * qualityarg = NULL;	/* saves -quality parm if any */
-  char * qtablefile = NULL;	/* saves -qtables filename if any */
-  char * qslotsarg = NULL;	/* saves -qslots parm if any */
-  char * samplearg = NULL;	/* saves -sample parm if any */
-  char * scansarg = NULL;	/* saves -scans parm if any */
+  char *qualityarg = NULL;      /* saves -quality parm if any */
+  char *qtablefile = NULL;      /* saves -qtables filename if any */
+  char *qslotsarg = NULL;       /* saves -qslots parm if any */
+  char *samplearg = NULL;       /* saves -sample parm if any */
+  char *scansarg = NULL;        /* saves -scans parm if any */
 
   /* Set up default JPEG parameters. */
 
-  force_baseline = FALSE;	/* by default, allow 16-bit quantizers */
+  force_baseline = FALSE;       /* by default, allow 16-bit quantizers */
   simple_progressive = FALSE;
   is_targa = FALSE;
   outfilename = NULL;
@@ -242,12 +245,12 @@
     if (*arg != '-') {
       /* Not a switch, must be a file name argument */
       if (argn <= last_file_arg_seen) {
-	outfilename = NULL;	/* -outfile applies to just one input file */
-	continue;		/* ignore this name if previously processed */
+        outfilename = NULL;     /* -outfile applies to just one input file */
+        continue;               /* ignore this name if previously processed */
       }
-      break;			/* else done parsing switches */
+      break;                    /* else done parsing switches */
     }
-    arg++;			/* advance past switch marker character */
+    arg++;                      /* advance past switch marker character */
 
     if (keymatch(arg, "arithmetic", 1)) {
       /* Use arithmetic coding. */
@@ -255,7 +258,7 @@
       cinfo->arith_code = TRUE;
 #else
       fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
@@ -265,16 +268,16 @@
 
     } else if (keymatch(arg, "dct", 2)) {
       /* Select DCT algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (keymatch(argv[argn], "int", 1)) {
-	cinfo->dct_method = JDCT_ISLOW;
+        cinfo->dct_method = JDCT_ISLOW;
       } else if (keymatch(argv[argn], "fast", 2)) {
-	cinfo->dct_method = JDCT_IFAST;
+        cinfo->dct_method = JDCT_IFAST;
       } else if (keymatch(argv[argn], "float", 2)) {
-	cinfo->dct_method = JDCT_FLOAT;
+        cinfo->dct_method = JDCT_FLOAT;
       } else
-	usage();
+        usage();
 
     } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
       /* Enable debug printouts. */
@@ -282,15 +285,20 @@
       static boolean printed_version = FALSE;
 
       if (! printed_version) {
-	fprintf(stderr, "%s version %s (build %s)\n",
-		PACKAGE_NAME, VERSION, BUILD);
-	fprintf(stderr, "%s\n\n", JCOPYRIGHT);
-	fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
-		JVERSION);
-	printed_version = TRUE;
+        fprintf(stderr, "%s version %s (build %s)\n",
+                PACKAGE_NAME, VERSION, BUILD);
+        fprintf(stderr, "%s\n\n", JCOPYRIGHT);
+        fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
+                JVERSION);
+        printed_version = TRUE;
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
     } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
       /* Force a monochrome JPEG file to be generated. */
       jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
@@ -304,12 +312,12 @@
       long lval;
       char ch = 'x';
 
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
+        usage();
       if (ch == 'm' || ch == 'M')
-	lval *= 1000L;
+        lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
     } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
@@ -318,15 +326,15 @@
       cinfo->optimize_coding = TRUE;
 #else
       fprintf(stderr, "%s: sorry, entropy optimization was not compiled in\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
     } else if (keymatch(arg, "outfile", 4)) {
       /* Set output file name. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      outfilename = argv[argn];	/* save it away for later use */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      outfilename = argv[argn]; /* save it away for later use */
 
     } else if (keymatch(arg, "progressive", 1)) {
       /* Select simple progressive mode. */
@@ -335,7 +343,7 @@
       /* We must postpone execution until num_components is known. */
 #else
       fprintf(stderr, "%s: sorry, progressive output was not compiled in\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
@@ -351,14 +359,14 @@
 
     } else if (keymatch(arg, "quality", 1)) {
       /* Quality ratings (quantization table scaling factors). */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       qualityarg = argv[argn];
 
     } else if (keymatch(arg, "qslots", 2)) {
       /* Quantization table slot numbers. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       qslotsarg = argv[argn];
       /* Must delay setting qslots until after we have processed any
        * colorspace-determining switches, since jpeg_set_colorspace sets
@@ -367,8 +375,8 @@
 
     } else if (keymatch(arg, "qtables", 2)) {
       /* Quantization tables fetched from file. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       qtablefile = argv[argn];
       /* We postpone actually reading the file in case -quality comes later. */
 
@@ -377,24 +385,24 @@
       long lval;
       char ch = 'x';
 
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
+        usage();
       if (lval < 0 || lval > 65535L)
-	usage();
+        usage();
       if (ch == 'b' || ch == 'B') {
-	cinfo->restart_interval = (unsigned int) lval;
-	cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
+        cinfo->restart_interval = (unsigned int) lval;
+        cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
       } else {
-	cinfo->restart_in_rows = (int) lval;
-	/* restart_interval will be computed during startup */
+        cinfo->restart_in_rows = (int) lval;
+        /* restart_interval will be computed during startup */
       }
 
     } else if (keymatch(arg, "sample", 2)) {
       /* Set sampling factors. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       samplearg = argv[argn];
       /* Must delay setting sample factors until after we have processed any
        * colorspace-determining switches, since jpeg_set_colorspace sets
@@ -404,13 +412,13 @@
     } else if (keymatch(arg, "scans", 4)) {
       /* Set scan script. */
 #ifdef C_MULTISCAN_FILES_SUPPORTED
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       scansarg = argv[argn];
       /* We must postpone reading the file in case -progressive appears. */
 #else
       fprintf(stderr, "%s: sorry, multi-scan output was not compiled in\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
@@ -418,12 +426,12 @@
       /* Set input smoothing factor. */
       int val;
 
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (sscanf(argv[argn], "%d", &val) != 1)
-	usage();
+        usage();
       if (val < 0 || val > 100)
-	usage();
+        usage();
       cinfo->smoothing_factor = val;
 
     } else if (keymatch(arg, "targa", 1)) {
@@ -431,7 +439,7 @@
       is_targa = TRUE;
 
     } else {
-      usage();			/* bogus switch */
+      usage();                  /* bogus switch */
     }
   }
 
@@ -441,35 +449,35 @@
 
     /* Set quantization tables for selected quality. */
     /* Some or all may be overridden if -qtables is present. */
-    if (qualityarg != NULL)	/* process -quality if it was present */
+    if (qualityarg != NULL)     /* process -quality if it was present */
       if (! set_quality_ratings(cinfo, qualityarg, force_baseline))
-	usage();
+        usage();
 
-    if (qtablefile != NULL)	/* process -qtables if it was present */
+    if (qtablefile != NULL)     /* process -qtables if it was present */
       if (! read_quant_tables(cinfo, qtablefile, force_baseline))
-	usage();
+        usage();
 
-    if (qslotsarg != NULL)	/* process -qslots if it was present */
+    if (qslotsarg != NULL)      /* process -qslots if it was present */
       if (! set_quant_slots(cinfo, qslotsarg))
-	usage();
+        usage();
 
-    if (samplearg != NULL)	/* process -sample if it was present */
+    if (samplearg != NULL)      /* process -sample if it was present */
       if (! set_sample_factors(cinfo, samplearg))
-	usage();
+        usage();
 
 #ifdef C_PROGRESSIVE_SUPPORTED
-    if (simple_progressive)	/* process -progressive; -scans can override */
+    if (simple_progressive)     /* process -progressive; -scans can override */
       jpeg_simple_progression(cinfo);
 #endif
 
 #ifdef C_MULTISCAN_FILES_SUPPORTED
-    if (scansarg != NULL)	/* process -scans if it was present */
+    if (scansarg != NULL)       /* process -scans if it was present */
       if (! read_scan_script(cinfo, scansarg))
-	usage();
+        usage();
 #endif
   }
 
-  return argn;			/* return index of next arg (file name) */
+  return argn;                  /* return index of next arg (file name) */
 }
 
 
@@ -487,8 +495,8 @@
 #endif
   int file_index;
   cjpeg_source_ptr src_mgr;
-  FILE * input_file;
-  FILE * output_file = NULL;
+  FILE *input_file;
+  FILE *output_file = NULL;
   unsigned char *outbuffer = NULL;
   unsigned long outsize = 0;
   JDIMENSION num_scanlines;
@@ -500,7 +508,7 @@
 
   progname = argv[0];
   if (progname == NULL || progname[0] == 0)
-    progname = "cjpeg";		/* in case C library doesn't provide it */
+    progname = "cjpeg";         /* in case C library doesn't provide it */
 
   /* Initialize the JPEG compression object with default error handling. */
   cinfo.err = jpeg_std_error(&jerr);
@@ -510,11 +518,6 @@
   jerr.first_addon_message = JMSG_FIRSTADDONCODE;
   jerr.last_addon_message = JMSG_LASTADDONCODE;
 
-  /* Now safe to enable signal catcher. */
-#ifdef NEED_SIGNAL_CATCHER
-  enable_signal_catcher((j_common_ptr) &cinfo);
-#endif
-
   /* Initialize JPEG parameters.
    * Much of this may be overridden later.
    * In particular, we don't yet know the input file's color space,
@@ -637,5 +640,5 @@
 
   /* All done. */
   exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
+  return 0;                     /* suppress no-return-value warnings */
 }
diff --git a/coderules.txt b/coderules.txt
new file mode 100644
index 0000000..a2f593a
--- /dev/null
+++ b/coderules.txt
@@ -0,0 +1,78 @@
+IJG JPEG LIBRARY:  CODING RULES
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1991-1996, Thomas G. Lane.
+It was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+Since numerous people will be contributing code and bug fixes, it's important
+to establish a common coding style.  The goal of using similar coding styles
+is much more important than the details of just what that style is.
+
+In general we follow the recommendations of "Recommended C Style and Coding
+Standards" revision 6.1 (Cannon et al. as modified by Spencer, Keppel and
+Brader).  This document is available in the IJG FTP archive (see
+jpeg/doc/cstyle.ms.tbl.Z, or cstyle.txt.Z for those without nroff/tbl).
+
+Block comments should be laid out thusly:
+
+/*
+ *  Block comments in this style.
+ */
+
+We indent statements in K&R style, e.g.,
+        if (test) {
+          then-part;
+        } else {
+          else-part;
+        }
+with two spaces per indentation level.  (This indentation convention is
+handled automatically by GNU Emacs and many other text editors.)
+
+Multi-word names should be written in lower case with underscores, e.g.,
+multi_word_name (not multiWordName).  Preprocessor symbols and enum constants
+are similar but upper case (MULTI_WORD_NAME).  Names should be unique within
+the first fifteen characters.
+
+Note that each function definition must begin with GLOBAL(type), LOCAL(type),
+or METHODDEF(type).  These macros expand to "static type" or just "type" as
+appropriate.  They provide a readable indication of the routine's usage and
+can readily be changed for special needs.  (For instance, special linkage
+keywords can be inserted for use in Windows DLLs.)
+
+A similar solution is used for external function declarations (see the EXTERN
+macro).
+
+
+The JPEG library is intended to be used within larger programs.  Furthermore,
+we want it to be reentrant so that it can be used by applications that process
+multiple images concurrently.  The following rules support these requirements:
+
+1. Avoid direct use of file I/O, "malloc", error report printouts, etc.;
+pass these through the common routines provided.
+
+2. Minimize global namespace pollution.  Functions should be declared static
+wherever possible.  (Note that our method-based calling conventions help this
+a lot: in many modules only the initialization function will ever need to be
+called directly, so only that function need be externally visible.)  All
+global function names should begin with "jpeg_".
+
+3. Don't use global variables; anything that must be used in another module
+should be in the common data structures.
+
+4. Don't use static variables except for read-only constant tables.  Variables
+that should be private to a module can be placed into private structures (see
+the system architecture document, structure.txt).
+
+5. Source file names should begin with "j" for files that are part of the
+library proper; source files that are not part of the library, such as cjpeg.c
+and djpeg.c, do not begin with "j".  Keep compression and decompression code in
+separate source files --- some applications may want only one half of the
+library.
+
+Note: these rules (particularly #4) are not followed religiously in the
+modules that are used in cjpeg/djpeg but are not part of the JPEG library
+proper.  Those modules are not really intended to be used in other
+applications.
diff --git a/config.h b/config.h
deleted file mode 100644
index d5a6218..0000000
--- a/config.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* config.h.  Generated from config.h.in by configure.  */
-/* config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* Build number */
-#define BUILD "20140410"
-
-/* Support arithmetic encoding */
-/* #undef C_ARITH_CODING_SUPPORTED */
-
-/* Support arithmetic decoding */
-/* #undef D_ARITH_CODING_SUPPORTED */
-
-/* Support in-memory source/destination managers */
-/* #undef MEM_SRCDST_SUPPORTED */
-
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#define HAVE_DLFCN_H 1
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#define HAVE_INTTYPES_H 1
-
-/* Define to 1 if you have the <jni.h> header file. */
-/* #undef HAVE_JNI_H */
-
-/* Define to 1 if you have the `memcpy' function. */
-#define HAVE_MEMCPY 1
-
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
-/* Define to 1 if you have the `memset' function. */
-#define HAVE_MEMSET 1
-
-/* Define if your compiler supports prototypes */
-#define HAVE_PROTOTYPES 1
-
-/* Define to 1 if you have the <stddef.h> header file. */
-#define HAVE_STDDEF_H 1
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#define HAVE_STDINT_H 1
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#define HAVE_STDLIB_H 1
-
-/* Define to 1 if you have the <strings.h> header file. */
-#define HAVE_STRINGS_H 1
-
-/* Define to 1 if you have the <string.h> header file. */
-#define HAVE_STRING_H 1
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#define HAVE_SYS_STAT_H 1
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#define HAVE_SYS_TYPES_H 1
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#if !defined(_MSC_VER)
-#define HAVE_UNISTD_H 1
-#endif
-
-/* Define to 1 if the system has the type `unsigned char'. */
-#define HAVE_UNSIGNED_CHAR 1
-
-/* Define to 1 if the system has the type `unsigned short'. */
-#define HAVE_UNSIGNED_SHORT 1
-
-/* Compiler does not support pointers to undefined structures. */
-/* #undef INCOMPLETE_TYPES_BROKEN */
-
-/* How to obtain function inlining. */
-#ifndef INLINE
-#if defined(__GNUC__)
-#define INLINE inline __attribute__((always_inline))
-#elif defined(_MSC_VER)
-#define INLINE __forceinline
-#else
-#define INLINE
-#endif
-#endif
-
-/* libjpeg API version */
-#define JPEG_LIB_VERSION 62
-
-/* libjpeg-turbo version */
-#define LIBJPEG_TURBO_VERSION 1.3.1
-
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
-#define LT_OBJDIR ".libs/"
-
-/* Define if you have BSD-like bzero and bcopy */
-/* #undef NEED_BSD_STRINGS */
-
-/* Define if you need short function names */
-/* #undef NEED_SHORT_EXTERNAL_NAMES */
-
-/* Define if you have sys/types.h */
-#define NEED_SYS_TYPES_H 1
-
-/* Name of package */
-#define PACKAGE "libjpeg-turbo"
-
-/* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT ""
-
-/* Define to the full name of this package. */
-#define PACKAGE_NAME "libjpeg-turbo"
-
-/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libjpeg-turbo 1.3.1"
-
-/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "libjpeg-turbo"
-
-/* Define to the home page for this package. */
-#define PACKAGE_URL ""
-
-/* Define to the version of this package. */
-#define PACKAGE_VERSION "1.3.1"
-
-/* Define if shift is unsigned */
-/* #undef RIGHT_SHIFT_IS_UNSIGNED */
-
-/* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
-
-/* Version number of package */
-#define VERSION "1.3.1"
-
-/* Use accelerated SIMD routines. */
-#define WITH_SIMD 1
-
-/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
-#ifndef __CHAR_UNSIGNED__
-/* # undef __CHAR_UNSIGNED__ */
-#endif
-
-/* Define to empty if `const' does not conform to ANSI C. */
-/* #undef const */
-
-/* Define to `__inline__' or `__inline' if that's what the C compiler
-   calls it, or to nothing if 'inline' is not supported under any name.  */
-#ifndef __cplusplus
-/* #undef inline */
-#endif
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-/* #undef size_t */
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..d6a3e33
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,595 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ([2.56])
+AC_INIT([libjpeg-turbo], [1.4.90])
+
+AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
+AC_PREFIX_DEFAULT(/opt/libjpeg-turbo)
+
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+# Checks for programs.
+SAVED_CFLAGS=${CFLAGS}
+SAVED_CPPFLAGS=${CPPFLAGS}
+AC_PROG_CPP
+AC_PROG_CC
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+AM_PROG_AS
+AM_PROG_CC_C_O
+AC_PROG_INSTALL
+AC_PROG_LIBTOOL
+AC_PROG_LN_S
+
+AC_ARG_WITH([build-date], AC_HELP_STRING([--with-build-date=DATE], [Use custom build string to enable reproducible builds (default: YYYYMMDD)]),
+  [BUILD="$with_build_date"],
+  [BUILD=`date +%Y%m%d`])
+
+PKG_PROG_PKG_CONFIG
+
+# When the prefix is /opt/libjpeg-turbo, we assume that an "official" binary is
+# being created, and thus we install things into specific locations.
+
+old_prefix=${prefix}
+if test "x$prefix" = "xNONE" -a "x$ac_default_prefix" != "x"; then
+  prefix=$ac_default_prefix
+fi
+DATADIR=`eval echo ${datadir}`
+DATADIR=`eval echo $DATADIR`
+if test "$DATADIR" = "/opt/libjpeg-turbo/share"; then
+  datadir='${prefix}'
+fi
+DATADIR=`eval echo ${datarootdir}`
+DATADIR=`eval echo $DATADIR`
+if test "$DATADIR" = "/opt/libjpeg-turbo/share"; then
+  datarootdir='${prefix}'
+fi
+DOCDIR=`eval echo ${docdir}`
+DOCDIR=`eval echo $DOCDIR`
+if test "$DOCDIR" = "/opt/libjpeg-turbo/doc/libjpeg-turbo"; then
+  docdir='${datadir}/doc'
+fi
+
+old_exec_prefix=${exec_prefix}
+if test "x$exec_prefix" = "xNONE"; then
+  exec_prefix=${prefix}
+fi
+
+AC_CHECK_SIZEOF(size_t)
+
+if test "x${libdir}" = 'x${exec_prefix}/lib' -o "x${libdir}" = 'x${prefix}/lib'; then
+  LIBDIR=`eval echo ${libdir}`
+  LIBDIR=`eval echo $LIBDIR`
+  if test "$LIBDIR" = "/opt/libjpeg-turbo/lib"; then
+    case $host_os in
+      darwin*)
+        ;;
+      *)
+        if test "${ac_cv_sizeof_size_t}" = "8"; then
+          libdir='${exec_prefix}/lib64'
+        elif test "${ac_cv_sizeof_size_t}" = "4"; then
+          libdir='${exec_prefix}/lib32'
+        fi
+        ;;
+    esac
+  fi
+fi
+exec_prefix=${old_exec_prefix}
+prefix=${old_prefix}
+
+# Check whether compiler supports pointers to undefined structures
+AC_MSG_CHECKING(whether compiler supports pointers to undefined structures)
+AC_TRY_COMPILE([ typedef struct undefined_structure *undef_struct_ptr; ], ,
+  AC_MSG_RESULT(yes),
+  [AC_MSG_RESULT(no)
+   AC_DEFINE([INCOMPLETE_TYPES_BROKEN], [1],
+     [Compiler does not support pointers to undefined structures.])])
+
+if test "x${GCC}" = "xyes"; then
+  if test "x${SAVED_CFLAGS}" = "x"; then
+    CFLAGS=-O3
+  fi
+  if test "x${SAVED_CPPFLAGS}" = "x"; then
+    CPPFLAGS=-Wall
+  fi
+fi
+
+AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
+if test "x${SUNCC}" = "xyes"; then
+  if test "x${SAVED_CFLAGS}" = "x"; then
+    CFLAGS=-xO5
+  fi
+fi
+
+# Checks for libraries.
+
+# Checks for header files.
+AC_HEADER_STDC
+AC_CHECK_HEADERS([stddef.h stdlib.h locale.h string.h])
+AC_CHECK_HEADER([sys/types.h],
+  AC_DEFINE([NEED_SYS_TYPES_H], 1, [Define if you need to include <sys/types.h> to get size_t.]))
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_C_CHAR_UNSIGNED
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_CHECK_TYPES([unsigned char, unsigned short])
+
+AC_MSG_CHECKING([if right shift is signed])
+AC_TRY_RUN(
+  [#include <stdio.h>
+   int is_shifting_signed (long arg) {
+     long res = arg >> 4;
+
+     if (res == -0x7F7E80CL)
+       return 1; /* right shift is signed */
+
+     /* see if unsigned-shift hack will fix it. */
+     /* we can't just test exact value since it depends on width of long... */
+     res |= (~0L) << (32-4);
+     if (res == -0x7F7E80CL)
+       return 0; /* right shift is unsigned */
+
+     printf("Right shift isn't acting as I expect it to.\n");
+     printf("I fear the JPEG software will not work at all.\n\n");
+     return 0; /* try it with unsigned anyway */
+   }
+   int main (void) {
+     return is_shifting_signed(-0x7F7E80B1L); /* exit status is the result; exit would need stdlib.h */
+   }],
+  [AC_MSG_RESULT(no)
+   AC_DEFINE([RIGHT_SHIFT_IS_UNSIGNED], 1,
+     [Define if your (broken) compiler shifts signed values as if they were unsigned.])],
+  [AC_MSG_RESULT(yes)],
+  [AC_MSG_RESULT(Assuming that right shift is signed on target machine.)])
+
+# Checks for library functions.
+AC_CHECK_FUNCS([memset memcpy], [],
+  [AC_DEFINE([NEED_BSD_STRINGS], 1,
+     [Define if you have BSD-like bzero and bcopy in <strings.h> rather than memset/memcpy in <string.h>.])])
+
+AC_MSG_CHECKING([libjpeg API version])
+AC_ARG_VAR(JPEG_LIB_VERSION, [libjpeg API version (62, 70, or 80)])
+if test "x$JPEG_LIB_VERSION" = "x"; then
+  AC_ARG_WITH([jpeg7],
+    AC_HELP_STRING([--with-jpeg7],
+      [Emulate libjpeg v7 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b.)]))
+  AC_ARG_WITH([jpeg8],
+    AC_HELP_STRING([--with-jpeg8],
+      [Emulate libjpeg v8 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b.)]))
+  if test "x${with_jpeg8}" = "xyes"; then
+    JPEG_LIB_VERSION=80
+  else
+    if test "x${with_jpeg7}" = "xyes"; then
+      JPEG_LIB_VERSION=70
+    else
+      JPEG_LIB_VERSION=62
+    fi
+  fi
+fi
+JPEG_LIB_VERSION_DECIMAL=`expr $JPEG_LIB_VERSION / 10`.`expr $JPEG_LIB_VERSION % 10`
+AC_SUBST(JPEG_LIB_VERSION_DECIMAL)
+AC_MSG_RESULT([$JPEG_LIB_VERSION_DECIMAL])
+AC_DEFINE_UNQUOTED(JPEG_LIB_VERSION, [$JPEG_LIB_VERSION],
+  [libjpeg API version])
+
+AC_ARG_VAR(SO_MAJOR_VERSION,
+  [Major version of the libjpeg-turbo shared library (default is determined by the API version)])
+AC_ARG_VAR(SO_MINOR_VERSION,
+  [Minor version of the libjpeg-turbo shared library (default is determined by the API version)])
+if test "x$SO_MAJOR_VERSION" = "x"; then
+  case "$JPEG_LIB_VERSION" in
+    62)  SO_MAJOR_VERSION=$JPEG_LIB_VERSION ;;
+    *)   SO_MAJOR_VERSION=`expr $JPEG_LIB_VERSION / 10` ;;
+  esac
+fi
+if test "x$SO_MINOR_VERSION" = "x"; then
+  case "$JPEG_LIB_VERSION" in
+    80)  SO_MINOR_VERSION=2 ;;
+    *)   SO_MINOR_VERSION=0 ;;
+  esac
+fi
+
+RPM_CONFIG_ARGS=
+
+# Memory source/destination managers
+SO_AGE=1
+MEM_SRCDST_FUNCTIONS=
+if test "x${with_jpeg8}" != "xyes"; then
+  AC_MSG_CHECKING([whether to include in-memory source/destination managers])
+  AC_ARG_WITH([mem-srcdst],
+    AC_HELP_STRING([--without-mem-srcdst],
+      [Do not include in-memory source/destination manager functions when emulating the libjpeg v6b or v7 API/ABI]))
+  if test "x$with_mem_srcdst" != "xno"; then
+    AC_MSG_RESULT(yes)
+    AC_DEFINE([MEM_SRCDST_SUPPORTED], [1],
+      [Support in-memory source/destination managers])
+    SO_AGE=2
+    MEM_SRCDST_FUNCTIONS="global:  jpeg_mem_dest;  jpeg_mem_src;";
+  else
+    AC_MSG_RESULT(no)
+    RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-mem-srcdst"
+  fi
+fi
+
+AC_MSG_CHECKING([libjpeg shared library version])
+AC_MSG_RESULT([$SO_MAJOR_VERSION.$SO_AGE.$SO_MINOR_VERSION])
+LIBTOOL_CURRENT=`expr $SO_MAJOR_VERSION + $SO_AGE`
+AC_SUBST(LIBTOOL_CURRENT)
+AC_SUBST(SO_MAJOR_VERSION)
+AC_SUBST(SO_MINOR_VERSION)
+AC_SUBST(SO_AGE)
+AC_SUBST(MEM_SRCDST_FUNCTIONS)
+
+AC_DEFINE_UNQUOTED(LIBJPEG_TURBO_VERSION, [$VERSION], [libjpeg-turbo version])
+
+VERSION_SCRIPT=yes
+AC_ARG_ENABLE([ld-version-script],
+  AS_HELP_STRING([--disable-ld-version-script],
+    [Disable linker version script for libjpeg-turbo (default is to use linker version script if the linker supports it)]),
+  [VERSION_SCRIPT=$enableval], [])
+
+AC_MSG_CHECKING([whether the linker supports version scripts])
+SAVED_LDFLAGS="$LDFLAGS"
+LDFLAGS="$LDFLAGS -Wl,--version-script,conftest.map"
+cat > conftest.map <<EOF
+VERS_1 {
+  global: *;
+};
+EOF
+AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])],
+  [VERSION_SCRIPT_FLAG=-Wl,--version-script,;
+   AC_MSG_RESULT([yes (GNU style)])],
+  [])
+if test "x$VERSION_SCRIPT_FLAG" = "x"; then
+  LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map"
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])],
+    [VERSION_SCRIPT_FLAG=-Wl,-M,;
+     AC_MSG_RESULT([yes (Sun style)])],
+    [])
+fi
+if test "x$VERSION_SCRIPT_FLAG" = "x"; then
+  VERSION_SCRIPT=no
+  AC_MSG_RESULT(no)
+fi
+LDFLAGS="$SAVED_LDFLAGS"
+
+AC_MSG_CHECKING([whether to use version script when building libjpeg-turbo])
+AC_MSG_RESULT($VERSION_SCRIPT)
+
+AM_CONDITIONAL(VERSION_SCRIPT, test "x$VERSION_SCRIPT" = "xyes")
+AC_SUBST(VERSION_SCRIPT_FLAG)
+
+# Check for non-broken inline under various spellings
+AC_MSG_CHECKING(for inline)
+ljt_cv_inline=""
+AC_TRY_COMPILE(, [} inline __attribute__((always_inline)) int foo() { return 0; }
+int bar() { return foo();], ljt_cv_inline="inline __attribute__((always_inline))",
+AC_TRY_COMPILE(, [} __inline__ int foo() { return 0; }
+int bar() { return foo();], ljt_cv_inline="__inline__",
+AC_TRY_COMPILE(, [} __inline int foo() { return 0; }
+int bar() { return foo();], ljt_cv_inline="__inline",
+AC_TRY_COMPILE(, [} inline int foo() { return 0; }
+int bar() { return foo();], ljt_cv_inline="inline"))))
+AC_MSG_RESULT($ljt_cv_inline)
+AC_DEFINE_UNQUOTED([INLINE],[$ljt_cv_inline],[How to obtain function inlining.])
+
+# Arithmetic coding support
+AC_MSG_CHECKING([whether to include arithmetic encoding support])
+AC_ARG_WITH([arith-enc],
+  AC_HELP_STRING([--without-arith-enc],
+    [Do not include arithmetic encoding support]))
+if test "x$with_12bit" = "xyes"; then
+  with_arith_enc=no
+fi
+if test "x$with_arith_enc" = "xno"; then
+  AC_MSG_RESULT(no)
+  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-arith-enc"
+else
+  AC_DEFINE([C_ARITH_CODING_SUPPORTED], [1], [Support arithmetic encoding])
+  AC_MSG_RESULT(yes)
+fi
+AM_CONDITIONAL([WITH_ARITH_ENC], [test "x$with_arith_enc" != "xno"])
+
+AC_MSG_CHECKING([whether to include arithmetic decoding support])
+AC_ARG_WITH([arith-dec],
+  AC_HELP_STRING([--without-arith-dec],
+    [Do not include arithmetic decoding support]))
+if test "x$with_12bit" = "xyes"; then
+  with_arith_dec=no
+fi
+if test "x$with_arith_dec" = "xno"; then
+  AC_MSG_RESULT(no)
+  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-arith-dec"
+else
+  AC_DEFINE([D_ARITH_CODING_SUPPORTED], [1], [Support arithmetic decoding])
+  AC_MSG_RESULT(yes)
+fi
+AM_CONDITIONAL([WITH_ARITH_DEC], [test "x$with_arith_dec" != "xno"])
+
+AM_CONDITIONAL([WITH_ARITH],
+  [test "x$with_arith_dec" != "xno" -o "x$with_arith_enc" != "xno"])
+
+# 12-bit component support
+AC_MSG_CHECKING([whether to use 12-bit samples])
+AC_ARG_WITH([12bit],
+  AC_HELP_STRING([--with-12bit], [Encode/decode JPEG images with 12-bit samples (implies --without-simd --without-turbojpeg --without-arith-dec --without-arith-enc)]))
+if test "x$with_12bit" = "xyes"; then
+  AC_DEFINE([BITS_IN_JSAMPLE], [12], [use 8 or 12])
+  AC_MSG_RESULT(yes)
+else
+  AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL([WITH_12BIT], [test "x$with_12bit" = "xyes"])
+
+# TurboJPEG support
+AC_MSG_CHECKING([whether to build TurboJPEG C wrapper])
+AC_ARG_WITH([turbojpeg],
+  AC_HELP_STRING([--without-turbojpeg],
+    [Do not include the TurboJPEG wrapper library and associated test programs]))
+if test "x$with_12bit" = "xyes"; then
+  with_turbojpeg=no
+fi
+if test "x$with_turbojpeg" = "xno"; then
+  AC_MSG_RESULT(no)
+  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-turbojpeg"
+else
+  AC_MSG_RESULT(yes)
+fi
+
+# Java support
+AC_ARG_VAR(JAVAC, [Java compiler command (default: javac)])
+if test "x$JAVAC" = "x"; then
+  JAVAC=javac
+fi
+AC_SUBST(JAVAC)
+AC_ARG_VAR(JAVACFLAGS, [Java compiler flags])
+AC_SUBST(JAVACFLAGS)
+AC_ARG_VAR(JAR, [Java archive command (default: jar)])
+if test "x$JAR" = "x"; then
+  JAR=jar
+fi
+AC_SUBST(JAR)
+AC_ARG_VAR(JAVA, [Java runtime command (default: java)])
+if test "x$JAVA" = "x"; then
+  JAVA=java
+fi
+AC_SUBST(JAVA)
+AC_ARG_VAR(JNI_CFLAGS,
+  [C compiler flags needed to include jni.h (default: -I/System/Library/Frameworks/JavaVM.framework/Headers on OS X, '-I/usr/java/include -I/usr/java/include/solaris' on Solaris, and '-I/usr/java/default/include -I/usr/java/default/include/linux' on Linux)])
+
+AC_MSG_CHECKING([whether to build TurboJPEG Java wrapper])
+AC_ARG_WITH([java],
+  AC_HELP_STRING([--with-java], [Build Java wrapper for the TurboJPEG library]))
+if test "x$with_12bit" = "xyes" || test "x$with_turbojpeg" = "xno"; then
+  with_java=no
+fi
+
+WITH_JAVA=0
+if test "x$with_java" = "xyes"; then
+  AC_MSG_RESULT(yes)
+
+  case $host_os in
+    darwin*)
+      DEFAULT_JNI_CFLAGS=-I/System/Library/Frameworks/JavaVM.framework/Headers
+      ;;
+    solaris*)
+      DEFAULT_JNI_CFLAGS='-I/usr/java/include -I/usr/java/include/solaris'
+      ;;
+    linux*)
+      DEFAULT_JNI_CFLAGS='-I/usr/java/default/include -I/usr/java/default/include/linux'
+      ;;
+  esac
+  if test "x$JNI_CFLAGS" = "x"; then
+    JNI_CFLAGS=$DEFAULT_JNI_CFLAGS
+  fi
+
+  SAVE_CPPFLAGS=${CPPFLAGS}
+  CPPFLAGS="${CPPFLAGS} ${JNI_CFLAGS}"
+  AC_CHECK_HEADERS([jni.h], [DUMMY=1],
+    [AC_MSG_ERROR([Could not find JNI header file])])
+  CPPFLAGS=${SAVE_CPPFLAGS}
+  AC_SUBST(JNI_CFLAGS)
+
+  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --with-java"
+  JAVA_RPM_CONTENTS_1='%dir %{_datadir}/classes'
+  JAVA_RPM_CONTENTS_2=%{_datadir}/classes/turbojpeg.jar
+  WITH_JAVA=1
+else
+  AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL([WITH_JAVA], [test "x$with_java" = "xyes"])
+AC_SUBST(WITH_JAVA)
+AC_SUBST(JAVA_RPM_CONTENTS_1)
+AC_SUBST(JAVA_RPM_CONTENTS_2)
+
+# Optionally force assembling through gas-preprocessor.pl (by overriding CCAS) for compatibility testing
+AC_ARG_WITH([gas-preprocessor],
+  AC_HELP_STRING([--with-gas-preprocessor],
+    [Force using gas-preprocessor.pl on ARM.]))
+if test "x${with_gas_preprocessor}" = "xyes"; then
+  case $host_os in
+    darwin*)
+      CCAS="gas-preprocessor.pl -fix-unreq $CC"
+      ;;
+    *)
+      CCAS="gas-preprocessor.pl -no-fix-unreq $CC"
+      ;;
+  esac
+  AC_SUBST([CCAS])
+fi
+
+# SIMD is optional (attempted by default; always disabled for 12-bit builds)
+AC_ARG_WITH([simd],
+  AC_HELP_STRING([--without-simd], [Do not include SIMD extensions]))
+if test "x$with_12bit" = "xyes"; then
+  with_simd=no
+fi
+if test "x${with_simd}" != "xno"; then
+  require_simd=no
+  if test "x${with_simd}" = "xyes"; then
+    require_simd=yes
+  fi
+  # Check if we're on a supported CPU.  An unsupported CPU or assembler is fatal only if --with-simd was given explicitly.
+  AC_MSG_CHECKING([if we have SIMD optimisations for cpu type])
+  case "$host_cpu" in
+    x86_64 | amd64)
+      AC_MSG_RESULT([yes (x86_64)])
+      AC_PROG_NASM
+      simd_arch=x86_64
+      ;;
+    i*86 | x86 | ia32)
+      AC_MSG_RESULT([yes (i386)])
+      AC_PROG_NASM
+      simd_arch=i386
+      ;;
+    arm*)
+      AC_MSG_RESULT([yes (arm)])
+      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
+      AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE(
+        [if test "x$ac_use_gas_preprocessor" = "xyes"; then
+           AC_MSG_RESULT([yes (with gas-preprocessor)])
+         else
+           AC_MSG_RESULT([yes])
+         fi
+         simd_arch=arm],
+        [AC_MSG_RESULT([no])
+         with_simd=no])
+      if test "x${with_simd}" = "xno"; then
+        if test "x${require_simd}" = "xyes"; then
+          AC_MSG_ERROR([SIMD support can't be enabled.])
+        else
+          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
+        fi
+      fi
+      ;;
+    aarch64*)
+      AC_MSG_RESULT([yes (arm64)])
+      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
+      AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE(
+        [if test "x$ac_use_gas_preprocessor" = "xyes"; then
+           AC_MSG_RESULT([yes (with gas-preprocessor)])
+         else
+           AC_MSG_RESULT([yes])
+         fi
+         simd_arch=aarch64],
+        [AC_MSG_RESULT([no])
+         with_simd=no])
+      if test "x${with_simd}" = "xno"; then
+        if test "x${require_simd}" = "xyes"; then
+          AC_MSG_ERROR([SIMD support can't be enabled.])
+        else
+          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
+        fi
+      fi
+      ;;
+    mips*)
+      AC_MSG_RESULT([yes (mips)])
+      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
+      AC_CHECK_COMPATIBLE_MIPS_ASSEMBLER_IFELSE(
+        [AC_MSG_RESULT([yes])
+         simd_arch=mips],
+        [AC_MSG_RESULT([no])
+         with_simd=no])
+      if test "x${with_simd}" = "xno"; then
+        if test "x${require_simd}" = "xyes"; then
+          AC_MSG_ERROR([SIMD support can't be enabled.])
+        else
+          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
+        fi
+      fi
+      ;;
+    powerpc*)
+      AC_MSG_RESULT([yes (powerpc)])
+      simd_arch=powerpc
+      ;;
+    *)
+      AC_MSG_RESULT([no ("$host_cpu")])
+      with_simd=no;
+      if test "x${require_simd}" = "xyes"; then
+        AC_MSG_ERROR([SIMD support not available for this CPU.])
+      else
+        AC_MSG_WARN([SIMD support not available for this CPU.  Performance will suffer.])
+      fi
+      ;;
+  esac
+
+  if test "x${with_simd}" != "xno"; then
+    AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
+  fi
+else
+  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-simd"
+fi
+
+AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
+AM_CONDITIONAL([WITH_SSE_FLOAT_DCT], [test "x$simd_arch" = "xx86_64" || test "x$simd_arch" = "xi386"])
+AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
+AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
+AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
+AM_CONDITIONAL([SIMD_ARM_64], [test "x$simd_arch" = "xaarch64"])
+AM_CONDITIONAL([SIMD_MIPS], [test "x$simd_arch" = "xmips"])
+AM_CONDITIONAL([SIMD_POWERPC], [test "x$simd_arch" = "xpowerpc"])
+AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" || test "x$host_cpu" = "xamd64"])
+AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
+AM_CONDITIONAL([CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
+
+AC_ARG_VAR(PKGNAME, [distribution package name (default: libjpeg-turbo)])
+if test "x$PKGNAME" = "x"; then
+  PKGNAME=$PACKAGE_NAME
+fi
+AC_SUBST(PKGNAME)
+
+case "$host_cpu" in
+  x86_64 | amd64)
+    RPMARCH=x86_64
+    DEBARCH=amd64
+    ;;
+  i*86 | x86 | ia32)
+    RPMARCH=i386
+    DEBARCH=i386
+    ;;
+  *)
+    RPMARCH=`uname -m`
+    DEBARCH=$RPMARCH
+    ;;
+esac
+
+if test "${docdir}" = ""; then
+  docdir=${datadir}/doc
+  AC_SUBST(docdir)
+fi
+
+AC_SUBST(RPMARCH)
+AC_SUBST(RPM_CONFIG_ARGS)
+AC_SUBST(DEBARCH)
+AC_SUBST(BUILD)
+AC_DEFINE_UNQUOTED([BUILD], "$BUILD", [libjpeg-turbo build number])
+
+# NOTE: autoheader automatically modifies the input file of the first
+# invocation of AC_CONFIG_HEADERS, so we put config.h first to prevent
+# jconfig.h.in from being clobbered.  config.h is used only internally, whereas
+# jconfig.h contains macros that are relevant to external programs (macros that
+# specify which features were built into the library.)
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_HEADERS([jconfig.h])
+AC_CONFIG_HEADERS([jconfigint.h])
+AC_CONFIG_FILES([pkgscripts/libjpeg-turbo.spec.tmpl:release/libjpeg-turbo.spec.in])
+AC_CONFIG_FILES([pkgscripts/makecygwinpkg.tmpl:release/makecygwinpkg.in])
+AC_CONFIG_FILES([pkgscripts/makedpkg.tmpl:release/makedpkg.in])
+AC_CONFIG_FILES([pkgscripts/makemacpkg.tmpl:release/makemacpkg.in])
+AC_CONFIG_FILES([pkgscripts/uninstall.tmpl:release/uninstall.in])
+AC_CONFIG_FILES([pkgscripts/libjpeg.pc:release/libjpeg.pc.in])
+AC_CONFIG_FILES([pkgscripts/libturbojpeg.pc:release/libturbojpeg.pc.in])
+if test "x$with_turbojpeg" != "xno"; then
+  AC_CONFIG_FILES([tjbenchtest])
+fi
+if test "x$with_java" = "xyes"; then
+  AC_CONFIG_FILES([tjbenchtest.java])
+  AC_CONFIG_FILES([tjexampletest])
+fi
+AC_CONFIG_FILES([libjpeg.map])
+AC_CONFIG_FILES([Makefile simd/Makefile])
+AC_CONFIG_FILES([java/Makefile])
+AC_CONFIG_FILES([md5/Makefile])
+AC_OUTPUT
diff --git a/djpeg.1 b/djpeg.1
new file mode 100644
index 0000000..7efde43
--- /dev/null
+++ b/djpeg.1
@@ -0,0 +1,292 @@
+.TH DJPEG 1 "18 February 2016"
+.SH NAME
+djpeg \- decompress a JPEG file to an image file
+.SH SYNOPSIS
+.B djpeg
+[
+.I options
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B djpeg
+decompresses the named JPEG file, or the standard input if no file is named,
+and produces an image file on the standard output.  PBMPLUS (PPM/PGM), BMP,
+GIF, Targa, or RLE (Utah Raster Toolkit) output format can be selected.
+(RLE is supported only if the URT library is available.)
+.SH OPTIONS
+All switch names may be abbreviated; for example,
+.B \-grayscale
+may be written
+.B \-gray
+or
+.BR \-gr .
+Most of the "basic" switches can be abbreviated to as little as one letter.
+Upper and lower case are equivalent (thus
+.B \-BMP
+is the same as
+.BR \-bmp ).
+British spellings are also accepted (e.g.,
+.BR \-greyscale ),
+though for brevity these are not mentioned below.
+.PP
+The basic switches are:
+.TP
+.BI \-colors " N"
+Reduce image to at most N colors.  This reduces the number of colors used in
+the output image, so that it can be displayed on a colormapped display or
+stored in a colormapped file format.  For example, if you have an 8-bit
+display, you'd need to reduce to 256 or fewer colors.
+.TP
+.BI \-quantize " N"
+Same as
+.BR \-colors .
+.B \-colors
+is the recommended name;
+.B \-quantize
+is provided only for backwards compatibility.
+.TP
+.B \-fast
+Select recommended processing options for fast, low quality output.  (The
+default options are chosen for highest quality output.)  Currently, this is
+equivalent to \fB\-dct fast \-nosmooth \-onepass \-dither ordered\fR.
+.TP
+.B \-grayscale
+Force grayscale output even if JPEG file is color.  Useful for viewing on
+monochrome displays; also,
+.B djpeg
+runs noticeably faster in this mode.
+.TP
+.B \-rgb
+Force RGB output even if JPEG file is grayscale.
+.TP
+.BI \-scale " M/N"
+Scale the output image by a factor M/N.  Currently the scale factor must be
+M/8, where M is an integer between 1 and 16 inclusive, or any reduced fraction
+thereof (such as 1/2, 3/4, etc.)  Scaling is handy if the image is larger than
+your screen; also,
+.B djpeg
+runs much faster when scaling down the output.
+.TP
+.B \-bmp
+Select BMP output format (Windows flavor).  8-bit colormapped format is
+emitted if
+.B \-colors
+or
+.B \-grayscale
+is specified, or if the JPEG file is grayscale; otherwise, 24-bit full-color
+format is emitted.
+.TP
+.B \-gif
+Select GIF output format.  Since GIF does not support more than 256 colors,
+.B \-colors 256
+is assumed (unless you specify a smaller number of colors).
+.TP
+.B \-os2
+Select BMP output format (OS/2 1.x flavor).  8-bit colormapped format is
+emitted if
+.B \-colors
+or
+.B \-grayscale
+is specified, or if the JPEG file is grayscale; otherwise, 24-bit full-color
+format is emitted.
+.TP
+.B \-pnm
+Select PBMPLUS (PPM/PGM) output format (this is the default format).
+PGM is emitted if the JPEG file is grayscale or if
+.B \-grayscale
+is specified; otherwise PPM is emitted.
+.TP
+.B \-rle
+Select RLE output format.  (Requires URT library.)
+.TP
+.B \-targa
+Select Targa output format.  Grayscale format is emitted if the JPEG file is
+grayscale or if
+.B \-grayscale
+is specified; otherwise, colormapped format is emitted if
+.B \-colors
+is specified; otherwise, 24-bit full-color format is emitted.
+.PP
+Switches for advanced users:
+.TP
+.B \-dct int
+Use integer DCT method (default).
+.TP
+.B \-dct fast
+Use fast integer DCT (less accurate).
+In libjpeg-turbo, the fast method is generally about 5-15% faster than the int
+method when using the x86/x86-64 SIMD extensions (results may vary with other
+SIMD implementations, or when using libjpeg-turbo without SIMD extensions.)  If
+the JPEG image was compressed using a quality level of 85 or below, then there
+should be little or no perceptible difference between the two algorithms.  When
+decompressing images that were compressed using quality levels above 85,
+however, the difference between the fast and int methods becomes more
+pronounced.  With images compressed using quality=97, for instance, the fast
+method incurs generally about a 4-6 dB loss (in PSNR) relative to the int
+method, but this can be larger for some images.  If you can avoid it, do not
+use the fast method when decompressing images that were compressed using
+quality levels above 97.  The algorithm often degenerates for such images and
+can actually produce a more lossy output image than if the JPEG image had been
+compressed using lower quality levels.
+.TP
+.B \-dct float
+Use floating-point DCT method.
+The float method is mainly a legacy feature.  It does not produce significantly
+more accurate results than the int method, and it is much slower.  The float
+method may also give different results on different machines due to varying
+roundoff behavior, whereas the integer methods should give the same results on
+all machines.
+.TP
+.B \-dither fs
+Use Floyd-Steinberg dithering in color quantization.
+.TP
+.B \-dither ordered
+Use ordered dithering in color quantization.
+.TP
+.B \-dither none
+Do not use dithering in color quantization.
+By default, Floyd-Steinberg dithering is applied when quantizing colors; this
+is slow but usually produces the best results.  Ordered dither is a compromise
+between speed and quality; no dithering is fast but usually looks awful.  Note
+that these switches have no effect unless color quantization is being done.
+Ordered dither is only available in
+.B \-onepass
+mode.
+.TP
+.BI \-map " file"
+Quantize to the colors used in the specified image file.  This is useful for
+producing multiple files with identical color maps, or for forcing a
+predefined set of colors to be used.  The
+.I file
+must be a GIF or PPM file.  This option overrides
+.B \-colors
+and
+.BR \-onepass .
+.TP
+.B \-nosmooth
+Use a faster, lower-quality upsampling routine.
+.TP
+.B \-onepass
+Use one-pass instead of two-pass color quantization.  The one-pass method is
+faster and needs less memory, but it produces a lower-quality image.
+.B \-onepass
+is ignored unless you also say
+.B \-colors
+.IR N .
+Also, the one-pass method is always used for grayscale output (the two-pass
+method is no improvement then).
+.TP
+.BI \-maxmemory " N"
+Set limit for amount of memory to use in processing large images.  Value is
+in thousands of bytes, or millions of bytes if "M" is attached to the
+number.  For example,
+.B \-max 4m
+selects 4000000 bytes.  If more space is needed, temporary files will be used.
+.TP
+.BI \-outfile " name"
+Send output image to the named file, not to standard output.
+.TP
+.B \-memsrc
+Load input file into memory before decompressing.  This feature was implemented
+mainly as a way of testing the in-memory source manager (jpeg_mem_src().)
+.TP
+.BI \-skip " Y0,Y1"
+Decompress all rows of the JPEG image except those between Y0 and Y1
+(inclusive.)  Note that if decompression scaling is being used, then Y0 and Y1
+are relative to the scaled image dimensions.
+.TP
+.BI \-crop " WxH+X+Y"
+Decompress only a rectangular subregion of the image, starting at point X,Y
+with width W and height H.  If necessary, X will be shifted left to the nearest
+iMCU boundary, and the width will be increased accordingly.  Note that if
+decompression scaling is being used, then X, Y, W, and H are relative to the
+scaled image dimensions.
+.TP
+.B \-verbose
+Enable debug printout.  More
+.BR \-v 's
+give more output.  Also, version information is printed at startup.
+.TP
+.B \-debug
+Same as
+.BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
+.SH EXAMPLES
+.LP
+This example decompresses the JPEG file foo.jpg, quantizes it to
+256 colors, and saves the output in 8-bit BMP format in foo.bmp:
+.IP
+.B djpeg \-colors 256 \-bmp
+.I foo.jpg
+.B >
+.I foo.bmp
+.SH HINTS
+To get a quick preview of an image, use the
+.B \-grayscale
+and/or
+.B \-scale
+switches.
+.B \-grayscale \-scale 1/8
+is the fastest case.
+.PP
+Several options are available that trade off image quality to gain speed.
+.B \-fast
+turns on the recommended settings.
+.PP
+.B \-dct fast
+and/or
+.B \-nosmooth
+gain speed at a small sacrifice in quality.
+When producing a color-quantized image,
+.B \-onepass \-dither ordered
+is fast but much lower quality than the default behavior.
+.B \-dither none
+may give acceptable results in two-pass mode, but is seldom tolerable in
+one-pass mode.
+.PP
+If you are fortunate enough to have very fast floating point hardware,
+\fB\-dct float\fR may be even faster than \fB\-dct fast\fR.  But on most
+machines \fB\-dct float\fR is slower than \fB\-dct int\fR; in this case it is
+not worth using, because its theoretical accuracy advantage is too small to be
+significant in practice.
+.SH ENVIRONMENT
+.TP
+.B JPEGMEM
+If this environment variable is set, its value is the default memory limit.
+The value is specified as described for the
+.B \-maxmemory
+switch.
+.B JPEGMEM
+overrides the default value specified when the program was compiled, and
+itself is overridden by an explicit
+.BR \-maxmemory .
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR jpegtran (1),
+.BR rdjpgcom (1),
+.BR wrjpgcom (1)
+.br
+.BR ppm (5),
+.BR pgm (5)
+.br
+Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
+.SH AUTHOR
+Independent JPEG Group
+.PP
+This file was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
+features not present in libjpeg.
+.SH ISSUES
+Support for compressed GIF output files was removed in djpeg v6b due to
+concerns over the Unisys LZW patent.  Although this patent expired in 2006,
+djpeg still lacks compressed GIF support, for these historical reasons.
+(Conversion of JPEG files to GIF is usually a bad idea anyway, since GIF is a
+256-color format.)  The uncompressed GIF files that djpeg generates are larger
+than they should be, but they are readable by standard GIF decoders.
diff --git a/djpeg.c b/djpeg.c
index 237ed51..54cd525 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -3,18 +3,20 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2013 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2013-2015, D. R. Commander.
+ * Copyright (C) 2010-2011, 2013-2016, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a command-line user interface for the JPEG decompressor.
  * It should work on any system with Unix- or MS-DOS-style command lines.
  *
  * Two different command line styles are permitted, depending on the
  * compile-time switch TWO_FILE_COMMANDLINE:
- *	djpeg [options]  inputfile outputfile
- *	djpeg [options]  [inputfile]
+ *      djpeg [options]  inputfile outputfile
+ *      djpeg [options]  [inputfile]
  * In the second style, output is always to standard output, which you'd
  * normally redirect to a file or pipe to some other program.  Input is
  * either from a named file or from standard input (typically redirected).
@@ -22,30 +24,31 @@
  * don't support pipes.  Also, you MUST use the first style if your system
  * doesn't do binary I/O to stdin/stdout.
  * To simplify script writing, the "-outfile" switch is provided.  The syntax
- *	djpeg [options]  -outfile outputfile  inputfile
+ *      djpeg [options]  -outfile outputfile  inputfile
  * works regardless of which command line style is used.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include "jversion.h"		/* for version message */
-#include "config.h"
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"           /* for version message */
+#include "jconfigint.h"
+#include "wrppm.h"
 
-#include <ctype.h>		/* to declare isprint() */
+#include <ctype.h>              /* to declare isprint() */
 
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef USE_CCOMMAND             /* command-line reader for Macintosh */
 #ifdef __MWERKS__
 #include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
+#include <console.h>            /* ... and this */
 #endif
 #ifdef THINK_C
-#include <console.h>		/* Think declares it here */
+#include <console.h>            /* Think declares it here */
 #endif
 #endif
 
 
 /* Create the add-on message string table. */
 
-#define JMESSAGE(code,string)	string ,
+#define JMESSAGE(code,string)   string ,
 
 static const char * const cdjpeg_message_table[] = {
 #include "cderror.h"
@@ -61,17 +64,17 @@
  */
 
 typedef enum {
-	FMT_BMP,		/* BMP format (Windows flavor) */
-	FMT_GIF,		/* GIF format */
-	FMT_OS2,		/* BMP format (OS/2 flavor) */
-	FMT_PPM,		/* PPM/PGM (PBMPLUS formats) */
-	FMT_RLE,		/* RLE format */
-	FMT_TARGA,		/* Targa format */
-	FMT_TIFF		/* TIFF format */
+        FMT_BMP,                /* BMP format (Windows flavor) */
+        FMT_GIF,                /* GIF format */
+        FMT_OS2,                /* BMP format (OS/2 flavor) */
+        FMT_PPM,                /* PPM/PGM (PBMPLUS formats) */
+        FMT_RLE,                /* RLE format */
+        FMT_TARGA,              /* Targa format */
+        FMT_TIFF                /* TIFF format */
 } IMAGE_FORMATS;
 
-#ifndef DEFAULT_FMT		/* so can override from CFLAGS in Makefile */
-#define DEFAULT_FMT	FMT_PPM
+#ifndef DEFAULT_FMT             /* so can override from CFLAGS in Makefile */
+#define DEFAULT_FMT     FMT_PPM
 #endif
 
 static IMAGE_FORMATS requested_fmt;
@@ -86,11 +89,12 @@
  */
 
 
-static const char * progname;	/* program name for error messages */
-static char * outfilename;	/* for -outfile switch */
-boolean memsrc;  /* for -memsrc switch */
-boolean strip, skip;
-JDIMENSION startY, endY;
+static const char *progname;    /* program name for error messages */
+static char *outfilename;       /* for -outfile switch */
+boolean memsrc;                 /* for -memsrc switch */
+boolean skip, crop;
+JDIMENSION skip_start, skip_end;
+JDIMENSION crop_x, crop_y, crop_width, crop_height;
 #define INPUT_BUF_SIZE  4096
 
 
@@ -116,40 +120,40 @@
 #endif
 #ifdef BMP_SUPPORTED
   fprintf(stderr, "  -bmp           Select BMP output format (Windows style)%s\n",
-	  (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
+          (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
 #endif
 #ifdef GIF_SUPPORTED
   fprintf(stderr, "  -gif           Select GIF output format%s\n",
-	  (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
+          (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
 #endif
 #ifdef BMP_SUPPORTED
   fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
-	  (DEFAULT_FMT == FMT_OS2 ? " (default)" : ""));
+          (DEFAULT_FMT == FMT_OS2 ? " (default)" : ""));
 #endif
 #ifdef PPM_SUPPORTED
   fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
-	  (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
+          (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
 #endif
 #ifdef RLE_SUPPORTED
   fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
-	  (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
+          (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
 #endif
 #ifdef TARGA_SUPPORTED
   fprintf(stderr, "  -targa         Select Targa output format%s\n",
-	  (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
+          (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
 #endif
   fprintf(stderr, "Switches for advanced users:\n");
 #ifdef DCT_ISLOW_SUPPORTED
   fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
+          (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
 #endif
 #ifdef DCT_IFAST_SUPPORTED
   fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
-	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
+          (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
   fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
+          (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
 #endif
   fprintf(stderr, "  -dither fs     Use F-S dithering (default)\n");
   fprintf(stderr, "  -dither none   Don't use dithering in quantization\n");
@@ -167,16 +171,17 @@
   fprintf(stderr, "  -memsrc        Load input file into memory before decompressing\n");
 #endif
 
-  fprintf(stderr, "  -skip Y0,Y1    Decode all rows except those between Y0 and Y1 (inclusive)\n");
-  fprintf(stderr, "  -strip Y0,Y1   Decode only rows between Y0 and Y1 (inclusive)\n");
+  fprintf(stderr, "  -skip Y0,Y1    Decompress all rows except those between Y0 and Y1 (inclusive)\n");
+  fprintf(stderr, "  -crop WxH+X+Y  Decompress only a rectangular subregion of the image\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   exit(EXIT_FAILURE);
 }
 
 
 LOCAL(int)
 parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
-		int last_file_arg_seen, boolean for_real)
+                int last_file_arg_seen, boolean for_real)
 /* Parse optional switches.
  * Returns argv[] index of first file-name argument (== argc if none).
  * Any file names with indexes <= last_file_arg_seen are ignored;
@@ -187,14 +192,14 @@
  */
 {
   int argn;
-  char * arg;
+  char *arg;
 
   /* Set up default JPEG parameters. */
-  requested_fmt = DEFAULT_FMT;	/* set default output file format */
+  requested_fmt = DEFAULT_FMT;  /* set default output file format */
   outfilename = NULL;
   memsrc = FALSE;
-  strip = FALSE;
   skip = FALSE;
+  crop = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -204,54 +209,54 @@
     if (*arg != '-') {
       /* Not a switch, must be a file name argument */
       if (argn <= last_file_arg_seen) {
-	outfilename = NULL;	/* -outfile applies to just one input file */
-	continue;		/* ignore this name if previously processed */
+        outfilename = NULL;     /* -outfile applies to just one input file */
+        continue;               /* ignore this name if previously processed */
       }
-      break;			/* else done parsing switches */
+      break;                    /* else done parsing switches */
     }
-    arg++;			/* advance past switch marker character */
+    arg++;                      /* advance past switch marker character */
 
     if (keymatch(arg, "bmp", 1)) {
       /* BMP output format. */
       requested_fmt = FMT_BMP;
 
     } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
-	       keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) {
+               keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) {
       /* Do color quantization. */
       int val;
 
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (sscanf(argv[argn], "%d", &val) != 1)
-	usage();
+        usage();
       cinfo->desired_number_of_colors = val;
       cinfo->quantize_colors = TRUE;
 
     } else if (keymatch(arg, "dct", 2)) {
       /* Select IDCT algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (keymatch(argv[argn], "int", 1)) {
-	cinfo->dct_method = JDCT_ISLOW;
+        cinfo->dct_method = JDCT_ISLOW;
       } else if (keymatch(argv[argn], "fast", 2)) {
-	cinfo->dct_method = JDCT_IFAST;
+        cinfo->dct_method = JDCT_IFAST;
       } else if (keymatch(argv[argn], "float", 2)) {
-	cinfo->dct_method = JDCT_FLOAT;
+        cinfo->dct_method = JDCT_FLOAT;
       } else
-	usage();
+        usage();
 
     } else if (keymatch(arg, "dither", 2)) {
       /* Select dithering algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (keymatch(argv[argn], "fs", 2)) {
-	cinfo->dither_mode = JDITHER_FS;
+        cinfo->dither_mode = JDITHER_FS;
       } else if (keymatch(argv[argn], "none", 2)) {
-	cinfo->dither_mode = JDITHER_NONE;
+        cinfo->dither_mode = JDITHER_NONE;
       } else if (keymatch(argv[argn], "ordered", 2)) {
-	cinfo->dither_mode = JDITHER_ORDERED;
+        cinfo->dither_mode = JDITHER_ORDERED;
       } else
-	usage();
+        usage();
 
     } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
       /* Enable debug printouts. */
@@ -259,21 +264,26 @@
       static boolean printed_version = FALSE;
 
       if (! printed_version) {
-	fprintf(stderr, "%s version %s (build %s)\n",
-		PACKAGE_NAME, VERSION, BUILD);
-	fprintf(stderr, "%s\n\n", JCOPYRIGHT);
-	fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
-		JVERSION);
-	printed_version = TRUE;
+        fprintf(stderr, "%s version %s (build %s)\n",
+                PACKAGE_NAME, VERSION, BUILD);
+        fprintf(stderr, "%s\n\n", JCOPYRIGHT);
+        fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
+                JVERSION);
+        printed_version = TRUE;
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
     } else if (keymatch(arg, "fast", 1)) {
       /* Select recommended processing options for quick-and-dirty output. */
       cinfo->two_pass_quantize = FALSE;
       cinfo->dither_mode = JDITHER_ORDERED;
       if (! cinfo->quantize_colors) /* don't override an earlier -colors */
-	cinfo->desired_number_of_colors = 216;
+        cinfo->desired_number_of_colors = 216;
       cinfo->dct_method = JDCT_FASTEST;
       cinfo->do_fancy_upsampling = FALSE;
 
@@ -295,21 +305,21 @@
 
     } else if (keymatch(arg, "map", 3)) {
       /* Quantize to a color map taken from an input file. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (for_real) {		/* too expensive to do twice! */
-#ifdef QUANT_2PASS_SUPPORTED	/* otherwise can't quantize to supplied map */
-	FILE * mapfile;
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (for_real) {           /* too expensive to do twice! */
+#ifdef QUANT_2PASS_SUPPORTED    /* otherwise can't quantize to supplied map */
+        FILE *mapfile;
 
-	if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
-	  fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
-	  exit(EXIT_FAILURE);
-	}
-	read_color_map(cinfo, mapfile);
-	fclose(mapfile);
-	cinfo->quantize_colors = TRUE;
+        if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
+          fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
+          exit(EXIT_FAILURE);
+        }
+        read_color_map(cinfo, mapfile);
+        fclose(mapfile);
+        cinfo->quantize_colors = TRUE;
 #else
-	ERREXIT(cinfo, JERR_NOT_COMPILED);
+        ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
       }
 
@@ -318,12 +328,12 @@
       long lval;
       char ch = 'x';
 
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
+        usage();
       if (ch == 'm' || ch == 'M')
-	lval *= 1000L;
+        lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
     } else if (keymatch(arg, "nosmooth", 3)) {
@@ -340,9 +350,9 @@
 
     } else if (keymatch(arg, "outfile", 4)) {
       /* Set output file name. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      outfilename = argv[argn];	/* save it away for later use */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      outfilename = argv[argn]; /* save it away for later use */
 
     } else if (keymatch(arg, "memsrc", 2)) {
       /* Use in-memory source manager */
@@ -364,36 +374,40 @@
 
     } else if (keymatch(arg, "scale", 2)) {
       /* Scale the output image by a fraction M/N. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d/%d",
-		 &cinfo->scale_num, &cinfo->scale_denom) != 2)
-	usage();
-
-    } else if (keymatch(arg, "strip", 2)) {
-      if (++argn >= argc)
+      if (++argn >= argc)       /* advance to next argument */
         usage();
-      if (sscanf(argv[argn], "%d,%d", &startY, &endY) != 2 || startY > endY)
+      if (sscanf(argv[argn], "%u/%u",
+                 &cinfo->scale_num, &cinfo->scale_denom) != 2)
         usage();
-      strip = TRUE;
 
     } else if (keymatch(arg, "skip", 2)) {
       if (++argn >= argc)
         usage();
-      if (sscanf(argv[argn], "%d,%d", &startY, &endY) != 2 || startY > endY)
+      if (sscanf(argv[argn], "%u,%u", &skip_start, &skip_end) != 2 ||
+          skip_start > skip_end)
         usage();
       skip = TRUE;
 
+    } else if (keymatch(arg, "crop", 2)) {
+      char c;
+      if (++argn >= argc)
+        usage();
+      if (sscanf(argv[argn], "%u%c%u+%u+%u", &crop_width, &c, &crop_height,
+                 &crop_x, &crop_y) != 5 ||
+          (c != 'X' && c != 'x') || crop_width < 1 || crop_height < 1)
+        usage();
+      crop = TRUE;
+
     } else if (keymatch(arg, "targa", 1)) {
       /* Targa output format. */
       requested_fmt = FMT_TARGA;
 
     } else {
-      usage();			/* bogus switch */
+      usage();                  /* bogus switch */
     }
   }
 
-  return argn;			/* return index of next arg (file name) */
+  return argn;                  /* return index of next arg (file name) */
 }
 
 
@@ -408,7 +422,7 @@
 jpeg_getc (j_decompress_ptr cinfo)
 /* Read next byte */
 {
-  struct jpeg_source_mgr * datasrc = cinfo->src;
+  struct jpeg_source_mgr *datasrc = cinfo->src;
 
   if (datasrc->bytes_in_buffer == 0) {
     if (! (*datasrc->fill_input_buffer) (cinfo))
@@ -423,20 +437,20 @@
 print_text_marker (j_decompress_ptr cinfo)
 {
   boolean traceit = (cinfo->err->trace_level >= 1);
-  INT32 length;
+  long length;
   unsigned int ch;
   unsigned int lastch = 0;
 
   length = jpeg_getc(cinfo) << 8;
   length += jpeg_getc(cinfo);
-  length -= 2;			/* discount the length word itself */
+  length -= 2;                  /* discount the length word itself */
 
   if (traceit) {
     if (cinfo->unread_marker == JPEG_COM)
       fprintf(stderr, "Comment, length %ld:\n", (long) length);
-    else			/* assume it is an APPn otherwise */
+    else                        /* assume it is an APPn otherwise */
       fprintf(stderr, "APP%d, length %ld:\n",
-	      cinfo->unread_marker - JPEG_APP0, (long) length);
+              cinfo->unread_marker - JPEG_APP0, (long) length);
   }
 
   while (--length >= 0) {
@@ -448,16 +462,16 @@
        * Newlines in CR, CR/LF, or LF form will be printed as one newline.
        */
       if (ch == '\r') {
-	fprintf(stderr, "\n");
+        fprintf(stderr, "\n");
       } else if (ch == '\n') {
-	if (lastch != '\r')
-	  fprintf(stderr, "\n");
+        if (lastch != '\r')
+          fprintf(stderr, "\n");
       } else if (ch == '\\') {
-	fprintf(stderr, "\\\\");
+        fprintf(stderr, "\\\\");
       } else if (isprint(ch)) {
-	putc(ch, stderr);
+        putc(ch, stderr);
       } else {
-	fprintf(stderr, "\\%03o", ch);
+        fprintf(stderr, "\\%03o", ch);
       }
       lastch = ch;
     }
@@ -484,8 +498,8 @@
 #endif
   int file_index;
   djpeg_dest_ptr dest_mgr = NULL;
-  FILE * input_file;
-  FILE * output_file;
+  FILE *input_file;
+  FILE *output_file;
   unsigned char *inbuffer = NULL;
   unsigned long insize = 0;
   JDIMENSION num_scanlines;
@@ -497,7 +511,7 @@
 
   progname = argv[0];
   if (progname == NULL || progname[0] == 0)
-    progname = "djpeg";		/* in case C library doesn't provide it */
+    progname = "djpeg";         /* in case C library doesn't provide it */
 
   /* Initialize the JPEG decompression object with default error handling. */
   cinfo.err = jpeg_std_error(&jerr);
@@ -516,11 +530,6 @@
   jpeg_set_marker_processor(&cinfo, JPEG_COM, print_text_marker);
   jpeg_set_marker_processor(&cinfo, JPEG_APP0+12, print_text_marker);
 
-  /* Now safe to enable signal catcher. */
-#ifdef NEED_SIGNAL_CATCHER
-  enable_signal_catcher((j_common_ptr) &cinfo);
-#endif
-
   /* Scan command line to find file names. */
   /* It is convenient to use just one switch-parsing routine, but the switch
    * values read here are ignored; we will rescan the switches after opening
@@ -536,14 +545,14 @@
   if (outfilename == NULL) {
     if (file_index != argc-2) {
       fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
+              progname);
       usage();
     }
     outfilename = argv[file_index+1];
   } else {
     if (file_index != argc-1) {
       fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
+              progname);
       usage();
     }
   }
@@ -654,54 +663,78 @@
   /* Start decompressor */
   (void) jpeg_start_decompress(&cinfo);
 
-  /* Strip decode */
-  if (strip || skip) {
+  /* Skip rows */
+  if (skip) {
     JDIMENSION tmp;
 
-    /* Check for valid endY.  We cannot check this value until after
+    /* Check for valid skip_end.  We cannot check this value until after
      * jpeg_start_decompress() is called.  Note that we have already verified
-     * that startY <= endY.
+     * that skip_start <= skip_end.
      */
-    if (endY > cinfo.output_height - 1) {
-      fprintf(stderr, "%s: strip %d-%d exceeds image height %d\n", progname,
-              startY, endY, cinfo.output_height);
+    if (skip_end > cinfo.output_height - 1) {
+      fprintf(stderr, "%s: skip region exceeds image height %d\n", progname,
+              cinfo.output_height);
       exit(EXIT_FAILURE);
     }
 
     /* Write output file header.  This is a hack to ensure that the destination
-     * manager creates an image of the proper size for the partial decode.
+     * manager creates an output image of the proper size.
      */
     tmp = cinfo.output_height;
-    cinfo.output_height = endY - startY + 1;
-    if (skip)
-      cinfo.output_height = tmp - cinfo.output_height;
+    cinfo.output_height -= (skip_end - skip_start + 1);
     (*dest_mgr->start_output) (&cinfo, dest_mgr);
     cinfo.output_height = tmp;
 
     /* Process data */
-    if (skip) {
-      while (cinfo.output_scanline < startY) {
-        num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
-                                            dest_mgr->buffer_height);
-        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
-      }
-      jpeg_skip_scanlines(&cinfo, endY - startY + 1);
-      while (cinfo.output_scanline < cinfo.output_height) {
-        num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
-                                            dest_mgr->buffer_height);
-        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
-      }
-    } else {
-      jpeg_skip_scanlines(&cinfo, startY);
-      while (cinfo.output_scanline <= endY) {
-        num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
-                                            dest_mgr->buffer_height);
-        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
-      }
-      jpeg_skip_scanlines(&cinfo, cinfo.output_height - endY + 1);
+    while (cinfo.output_scanline < skip_start) {
+      num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                          dest_mgr->buffer_height);
+      (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+    }
+    jpeg_skip_scanlines(&cinfo, skip_end - skip_start + 1);
+    while (cinfo.output_scanline < cinfo.output_height) {
+      num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                          dest_mgr->buffer_height);
+      (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
     }
 
-  /* Normal full image decode */
+  /* Decompress a subregion */
+  } else if (crop) {
+    JDIMENSION tmp;
+
+    /* Check the crop region (only possible after jpeg_start_decompress()).
+     * Subtract rather than add so huge unsigned operands cannot wrap around. */
+    if (crop_x > cinfo.output_width || crop_y > cinfo.output_height ||
+        crop_width > cinfo.output_width - crop_x ||
+        crop_height > cinfo.output_height - crop_y) {
+      fprintf(stderr, "%s: crop dimensions exceed image dimensions %u x %u\n",
+              progname, cinfo.output_width, cinfo.output_height);
+      exit(EXIT_FAILURE);
+    }
+
+    jpeg_crop_scanline(&cinfo, &crop_x, &crop_width);
+    ((ppm_dest_ptr) dest_mgr)->buffer_width = cinfo.output_width *
+                                              cinfo.out_color_components *
+                                              sizeof(JSAMPLE);
+
+    /* Write output file header.  This is a hack to ensure that the destination
+     * manager creates an output image of the proper size.
+     */
+    tmp = cinfo.output_height;
+    cinfo.output_height = crop_height;
+    (*dest_mgr->start_output) (&cinfo, dest_mgr);
+    cinfo.output_height = tmp;
+
+    /* Process data */
+    jpeg_skip_scanlines(&cinfo, crop_y);
+    while (cinfo.output_scanline < crop_y + crop_height) {
+      num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                          dest_mgr->buffer_height);
+      (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+    }
+    jpeg_skip_scanlines(&cinfo, cinfo.output_height - crop_y - crop_height);
+
+  /* Normal full-image decompress */
   } else {
     /* Write output file header */
     (*dest_mgr->start_output) (&cinfo, dest_mgr);
@@ -744,5 +777,5 @@
 
   /* All done. */
   exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
+  return 0;                     /* suppress no-return-value warnings */
 }
diff --git a/doxygen-extra.css b/doxygen-extra.css
new file mode 100644
index 0000000..5abbcc2
--- /dev/null
+++ b/doxygen-extra.css
@@ -0,0 +1,3 @@
+code {
+	color: #4665A2; 
+}
diff --git a/doxygen.config b/doxygen.config
new file mode 100644
index 0000000..1723123
--- /dev/null
+++ b/doxygen.config
@@ -0,0 +1,16 @@
+PROJECT_NAME = TurboJPEG
+PROJECT_NUMBER = 1.5
+OUTPUT_DIRECTORY = doc/
+USE_WINDOWS_ENCODING = NO
+OPTIMIZE_OUTPUT_FOR_C = YES
+WARN_NO_PARAMDOC = YES
+GENERATE_LATEX = NO
+FILE_PATTERNS = turbojpeg.h
+HIDE_UNDOC_MEMBERS = YES
+VERBATIM_HEADERS = NO
+EXTRACT_STATIC = YES
+JAVADOC_AUTOBRIEF = YES
+MAX_INITIALIZER_LINES = 0
+ALWAYS_DETAILED_SEC = YES
+HTML_TIMESTAMP = NO
+HTML_EXTRA_STYLESHEET = doxygen-extra.css
diff --git a/example.c b/example.c
index 1d6f6cc..ac27f49 100644
--- a/example.c
+++ b/example.c
@@ -6,7 +6,7 @@
  * conjunction with the documentation file libjpeg.txt.
  *
  * This code will not do anything useful as-is, but it may be helpful as a
- * skeleton for constructing routines that call the JPEG library.  
+ * skeleton for constructing routines that call the JPEG library.
  *
  * We present these routines in the same coding style used in the JPEG code
  * (ANSI function definitions, etc); but you are of course free to code your
@@ -58,9 +58,9 @@
  * RGB color and is described by:
  */
 
-extern JSAMPLE * image_buffer;	/* Points to large array of R,G,B-order data */
-extern int image_height;	/* Number of rows in image */
-extern int image_width;		/* Number of columns in image */
+extern JSAMPLE *image_buffer;   /* Points to large array of R,G,B-order data */
+extern int image_height;        /* Number of rows in image */
+extern int image_width;         /* Number of columns in image */
 
 
 /*
@@ -69,7 +69,7 @@
  */
 
 GLOBAL(void)
-write_JPEG_file (char * filename, int quality)
+write_JPEG_file (char *filename, int quality)
 {
   /* This struct contains the JPEG compression parameters and pointers to
    * working space (which is allocated as needed by the JPEG library).
@@ -88,9 +88,9 @@
    */
   struct jpeg_error_mgr jerr;
   /* More stuff */
-  FILE * outfile;		/* target file */
-  JSAMPROW row_pointer[1];	/* pointer to JSAMPLE row[s] */
-  int row_stride;		/* physical row width in image buffer */
+  FILE *outfile;                /* target file */
+  JSAMPROW row_pointer[1];      /* pointer to JSAMPLE row[s] */
+  int row_stride;               /* physical row width in image buffer */
 
   /* Step 1: allocate and initialize JPEG compression object */
 
@@ -122,10 +122,10 @@
   /* First we supply a description of the input image.
    * Four fields of the cinfo struct must be filled in:
    */
-  cinfo.image_width = image_width; 	/* image width and height, in pixels */
+  cinfo.image_width = image_width;      /* image width and height, in pixels */
   cinfo.image_height = image_height;
-  cinfo.input_components = 3;		/* # of color components per pixel */
-  cinfo.in_color_space = JCS_RGB; 	/* colorspace of input image */
+  cinfo.input_components = 3;           /* # of color components per pixel */
+  cinfo.in_color_space = JCS_RGB;       /* colorspace of input image */
   /* Now use the library's routine to set default compression parameters.
    * (You must set at least cinfo.in_color_space before calling this,
    * since the defaults depend on the source color space.)
@@ -151,7 +151,7 @@
    * To keep things simple, we pass one scanline per call; you can pass
    * more if you wish, though.
    */
-  row_stride = image_width * 3;	/* JSAMPLEs per row in image_buffer */
+  row_stride = image_width * 3; /* JSAMPLEs per row in image_buffer */
 
   while (cinfo.next_scanline < cinfo.image_height) {
     /* jpeg_write_scanlines expects an array of pointers to scanlines.
@@ -248,12 +248,12 @@
  */
 
 struct my_error_mgr {
-  struct jpeg_error_mgr pub;	/* "public" fields */
+  struct jpeg_error_mgr pub;    /* "public" fields */
 
-  jmp_buf setjmp_buffer;	/* for return to caller */
+  jmp_buf setjmp_buffer;        /* for return to caller */
 };
 
-typedef struct my_error_mgr * my_error_ptr;
+typedef struct my_error_mgr *my_error_ptr;
 
 /*
  * Here's the routine that will replace the standard error_exit method:
@@ -281,7 +281,7 @@
 
 
 GLOBAL(int)
-read_JPEG_file (char * filename)
+read_JPEG_file (char *filename)
 {
   /* This struct contains the JPEG decompression parameters and pointers to
    * working space (which is allocated as needed by the JPEG library).
@@ -293,9 +293,9 @@
    */
   struct my_error_mgr jerr;
   /* More stuff */
-  FILE * infile;		/* source file */
-  JSAMPARRAY buffer;		/* Output row buffer */
-  int row_stride;		/* physical row width in output buffer */
+  FILE *infile;                 /* source file */
+  JSAMPARRAY buffer;            /* Output row buffer */
+  int row_stride;               /* physical row width in output buffer */
 
   /* In this example we want to open the input file before doing anything else,
    * so that the setjmp() error recovery below can assume the file is open.
@@ -356,12 +356,12 @@
    * output image dimensions available, as well as the output colormap
    * if we asked for color quantization.
    * In this example, we need to make an output work buffer of the right size.
-   */ 
+   */
   /* JSAMPLEs per row in output buffer */
   row_stride = cinfo.output_width * cinfo.output_components;
   /* Make a one-row-high sample array that will go away when done with image */
   buffer = (*cinfo.mem->alloc_sarray)
-		((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
+                ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
 
   /* Step 6: while (scan lines remain to be read) */
   /*           jpeg_read_scanlines(...); */
diff --git a/jaricom.c b/jaricom.c
index f43e2ea..3bb557f 100644
--- a/jaricom.c
+++ b/jaricom.c
@@ -1,9 +1,12 @@
 /*
  * jaricom.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains probability estimation tables for common use in
  * arithmetic entropy encoding and decoding routines.
@@ -18,7 +21,7 @@
 #include "jpeglib.h"
 
 /* The following #define specifies the packing of the four components
- * into the compact INT32 representation.
+ * into the compact JLONG representation.
  * Note that this formula must match the actual arithmetic encoder
  * and decoder implementation.  The implementation has to be changed
  * if this formula is changed.
@@ -26,9 +29,9 @@
  * implementation (jbig_tab.c).
  */
 
-#define V(i,a,b,c,d) (((INT32)a << 16) | ((INT32)c << 8) | ((INT32)d << 7) | b)
+#define V(i,a,b,c,d) (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
 
-const INT32 jpeg_aritab[113+1] = {
+const JLONG jpeg_aritab[113+1] = {
 /*
  * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
  */
diff --git a/jcapimin.c b/jcapimin.c
index 20ba9e9..15674be 100644
--- a/jcapimin.c
+++ b/jcapimin.c
@@ -1,10 +1,13 @@
 /*
  * jcapimin.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the compression half
  * of the JPEG library.  These are the "minimum" API routines that may be
@@ -33,12 +36,12 @@
   int i;
 
   /* Guard against version mismatches between library and caller. */
-  cinfo->mem = NULL;		/* so jpeg_destroy knows mem mgr not called */
+  cinfo->mem = NULL;            /* so jpeg_destroy knows mem mgr not called */
   if (version != JPEG_LIB_VERSION)
     ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
-  if (structsize != SIZEOF(struct jpeg_compress_struct))
-    ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, 
-	     (int) SIZEOF(struct jpeg_compress_struct), (int) structsize);
+  if (structsize != sizeof(struct jpeg_compress_struct))
+    ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
+             (int) sizeof(struct jpeg_compress_struct), (int) structsize);
 
   /* For debugging purposes, we zero the whole master structure.
    * But the application has already set the err pointer, and may have set
@@ -47,9 +50,9 @@
    * complain here.
    */
   {
-    struct jpeg_error_mgr * err = cinfo->err;
-    void * client_data = cinfo->client_data; /* ignore Purify complaint here */
-    MEMZERO(cinfo, SIZEOF(struct jpeg_compress_struct));
+    struct jpeg_error_mgr *err = cinfo->err;
+    void *client_data = cinfo->client_data; /* ignore Purify complaint here */
+    MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
   }
@@ -85,7 +88,7 @@
 
   cinfo->script_space = NULL;
 
-  cinfo->input_gamma = 1.0;	/* in case application forgets */
+  cinfo->input_gamma = 1.0;     /* in case application forgets */
 
   /* OK, I'm ready */
   cinfo->global_state = CSTATE_START;
@@ -131,8 +134,8 @@
 jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
 {
   int i;
-  JQUANT_TBL * qtbl;
-  JHUFF_TBL * htbl;
+  JQUANT_TBL *qtbl;
+  JHUFF_TBL *htbl;
 
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     if ((qtbl = cinfo->quant_tbl_ptrs[i]) != NULL)
@@ -173,15 +176,15 @@
     (*cinfo->master->prepare_for_pass) (cinfo);
     for (iMCU_row = 0; iMCU_row < cinfo->total_iMCU_rows; iMCU_row++) {
       if (cinfo->progress != NULL) {
-	cinfo->progress->pass_counter = (long) iMCU_row;
-	cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows;
-	(*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        cinfo->progress->pass_counter = (long) iMCU_row;
+        cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows;
+        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
       }
       /* We bypass the main controller and invoke coef controller directly;
        * all work is being done from the coefficient buffer.
        */
       if (! (*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE) NULL))
-	ERREXIT(cinfo, JERR_CANT_SUSPEND);
+        ERREXIT(cinfo, JERR_CANT_SUSPEND);
     }
     (*cinfo->master->finish_pass) (cinfo);
   }
@@ -202,9 +205,9 @@
 
 GLOBAL(void)
 jpeg_write_marker (j_compress_ptr cinfo, int marker,
-		   const JOCTET *dataptr, unsigned int datalen)
+                   const JOCTET *dataptr, unsigned int datalen)
 {
-  JMETHOD(void, write_marker_byte, (j_compress_ptr info, int val));
+  void (*write_marker_byte) (j_compress_ptr info, int val);
 
   if (cinfo->next_scanline != 0 ||
       (cinfo->global_state != CSTATE_SCANNING &&
@@ -213,7 +216,7 @@
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
   (*cinfo->marker->write_marker_header) (cinfo, marker, datalen);
-  write_marker_byte = cinfo->marker->write_marker_byte;	/* copy for speed */
+  write_marker_byte = cinfo->marker->write_marker_byte; /* copy for speed */
   while (datalen--) {
     (*write_marker_byte) (cinfo, *dataptr);
     dataptr++;
@@ -248,14 +251,14 @@
  * To produce a pair of files containing abbreviated tables and abbreviated
  * image data, one would proceed as follows:
  *
- *		initialize JPEG object
- *		set JPEG parameters
- *		set destination to table file
- *		jpeg_write_tables(cinfo);
- *		set destination to image file
- *		jpeg_start_compress(cinfo, FALSE);
- *		write data...
- *		jpeg_finish_compress(cinfo);
+ *              initialize JPEG object
+ *              set JPEG parameters
+ *              set destination to table file
+ *              jpeg_write_tables(cinfo);
+ *              set destination to image file
+ *              jpeg_start_compress(cinfo, FALSE);
+ *              write data...
+ *              jpeg_finish_compress(cinfo);
  *
  * jpeg_write_tables has the side effect of marking all tables written
  * (same as jpeg_suppress_tables(..., TRUE)).  Thus a subsequent start_compress
diff --git a/jcapistd.c b/jcapistd.c
index c0320b1..5c6d0be 100644
--- a/jcapistd.c
+++ b/jcapistd.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the compression half
  * of the JPEG library.  These are the "standard" API routines that are
@@ -41,7 +42,7 @@
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
   if (write_all_tables)
-    jpeg_suppress_tables(cinfo, FALSE);	/* mark all tables to be written */
+    jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */
 
   /* (Re)initialize error mgr and destination modules */
   (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
@@ -75,7 +76,7 @@
 
 GLOBAL(JDIMENSION)
 jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
-		      JDIMENSION num_lines)
+                      JDIMENSION num_lines)
 {
   JDIMENSION row_ctr, rows_left;
 
@@ -118,7 +119,7 @@
 
 GLOBAL(JDIMENSION)
 jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
-		     JDIMENSION num_lines)
+                     JDIMENSION num_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
diff --git a/jcarith.c b/jcarith.c
index a9ca1c3..6d3b8af 100644
--- a/jcarith.c
+++ b/jcarith.c
@@ -1,9 +1,12 @@
 /*
  * jcarith.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains portable arithmetic entropy encoding routines for JPEG
  * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
@@ -23,10 +26,10 @@
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
 
-  INT32 c; /* C register, base of coding interval, layout as in sec. D.1.3 */
-  INT32 a;               /* A register, normalized size of coding interval */
-  INT32 sc;        /* counter for stacked 0xFF values which might overflow */
-  INT32 zc;          /* counter for pending 0x00 output values which might *
+  JLONG c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+  JLONG a;               /* A register, normalized size of coding interval */
+  JLONG sc;        /* counter for stacked 0xFF values which might overflow */
+  JLONG zc;          /* counter for pending 0x00 output values which might *
                           * be discarded at the end ("Pacman" termination) */
   int ct;  /* bit shift counter, determines when next byte will be written */
   int buffer;                /* buffer for most recent output byte != 0xFF */
@@ -34,18 +37,18 @@
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
   int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
 
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-  int next_restart_num;		/* next restart number to write (0-7) */
+  unsigned int restarts_to_go;  /* MCUs left in this restart interval */
+  int next_restart_num;         /* next restart number to write (0-7) */
 
   /* Pointers to statistics areas (these workspaces have image lifespan) */
-  unsigned char * dc_stats[NUM_ARITH_TBLS];
-  unsigned char * ac_stats[NUM_ARITH_TBLS];
+  unsigned char *dc_stats[NUM_ARITH_TBLS];
+  unsigned char *ac_stats[NUM_ARITH_TBLS];
 
   /* Statistics bin for coding with fixed probability 0.5 */
   unsigned char fixed_bin[4];
 } arith_entropy_encoder;
 
-typedef arith_entropy_encoder * arith_entropy_ptr;
+typedef arith_entropy_encoder *arith_entropy_ptr;
 
 /* The following two definitions specify the allocation chunk size
  * for the statistics area.
@@ -95,20 +98,20 @@
 #define CALCULATE_SPECTRAL_CONDITIONING
  */
 
-/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
- * We assume that int right shift is unsigned if INT32 right shift is,
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG.
+ * We assume that int right shift is unsigned if JLONG right shift is,
  * which should be safe.
  */
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define ISHIFT_TEMPS	int ishift_temp;
+#define ISHIFT_TEMPS    int ishift_temp;
 #define IRIGHT_SHIFT(x,shft)  \
-	((ishift_temp = (x)) < 0 ? \
-	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-	 (ishift_temp >> (shft)))
+        ((ishift_temp = (x)) < 0 ? \
+         (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+         (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
 #endif
 
 
@@ -116,7 +119,7 @@
 emit_byte (int val, j_compress_ptr cinfo)
 /* Write next output byte; we do not support suspension in this module. */
 {
-  struct jpeg_destination_mgr * dest = cinfo->dest;
+  struct jpeg_destination_mgr *dest = cinfo->dest;
 
   *dest->next_output_byte++ = (JOCTET) val;
   if (--dest->free_in_buffer == 0)
@@ -133,7 +136,7 @@
 finish_pass (j_compress_ptr cinfo)
 {
   arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
-  INT32 temp;
+  JLONG temp;
 
   /* Section D.1.8: Termination of encoding */
 
@@ -149,11 +152,11 @@
     /* One final overflow has to be handled */
     if (e->buffer >= 0) {
       if (e->zc)
-	do emit_byte(0x00, cinfo);
-	while (--e->zc);
+        do emit_byte(0x00, cinfo);
+        while (--e->zc);
       emit_byte(e->buffer + 1, cinfo);
       if (e->buffer + 1 == 0xFF)
-	emit_byte(0x00, cinfo);
+        emit_byte(0x00, cinfo);
     }
     e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
     e->sc = 0;
@@ -162,17 +165,17 @@
       ++e->zc;
     else if (e->buffer >= 0) {
       if (e->zc)
-	do emit_byte(0x00, cinfo);
-	while (--e->zc);
+        do emit_byte(0x00, cinfo);
+        while (--e->zc);
       emit_byte(e->buffer, cinfo);
     }
     if (e->sc) {
       if (e->zc)
-	do emit_byte(0x00, cinfo);
-	while (--e->zc);
+        do emit_byte(0x00, cinfo);
+        while (--e->zc);
       do {
-	emit_byte(0xFF, cinfo);
-	emit_byte(0x00, cinfo);
+        emit_byte(0xFF, cinfo);
+        emit_byte(0x00, cinfo);
       } while (--e->sc);
     }
   }
@@ -187,7 +190,7 @@
     if (e->c & 0x7F800L) {
       emit_byte((e->c >> 11) & 0xFF, cinfo);
       if (((e->c >> 11) & 0xFF) == 0xFF)
-	emit_byte(0x00, cinfo);
+        emit_byte(0x00, cinfo);
     }
   }
 }
@@ -216,20 +219,20 @@
  */
 
 LOCAL(void)
-arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) 
+arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
 {
   register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
   register unsigned char nl, nm;
-  register INT32 qe, temp;
+  register JLONG qe, temp;
   register int sv;
 
   /* Fetch values from our compact representation of Table D.2:
    * Qe values and probability estimation state machine
    */
   sv = *st;
-  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
-  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
-  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+  qe = jpeg_aritab[sv & 0x7F];  /* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;     /* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;     /* Next_Index_MPS */
 
   /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
   e->a -= qe;
@@ -243,7 +246,7 @@
       e->c += e->a;
       e->a = qe;
     }
-    *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+    *st = (sv & 0x80) ^ nl;     /* Estimate_after_LPS */
   } else {
     /* Encode the more probable symbol */
     if (e->a >= 0x8000L)
@@ -255,7 +258,7 @@
       e->c += e->a;
       e->a = qe;
     }
-    *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    *st = (sv & 0x80) ^ nm;     /* Estimate_after_MPS */
   }
 
   /* Renormalization & data output per section D.1.6 */
@@ -266,43 +269,43 @@
       /* Another byte is ready for output */
       temp = e->c >> 19;
       if (temp > 0xFF) {
-	/* Handle overflow over all stacked 0xFF bytes */
-	if (e->buffer >= 0) {
-	  if (e->zc)
-	    do emit_byte(0x00, cinfo);
-	    while (--e->zc);
-	  emit_byte(e->buffer + 1, cinfo);
-	  if (e->buffer + 1 == 0xFF)
-	    emit_byte(0x00, cinfo);
-	}
-	e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
-	e->sc = 0;
-	/* Note: The 3 spacer bits in the C register guarantee
-	 * that the new buffer byte can't be 0xFF here
-	 * (see page 160 in the P&M JPEG book). */
-	e->buffer = temp & 0xFF;  /* new output byte, might overflow later */
+        /* Handle overflow over all stacked 0xFF bytes */
+        if (e->buffer >= 0) {
+          if (e->zc)
+            do emit_byte(0x00, cinfo);
+            while (--e->zc);
+          emit_byte(e->buffer + 1, cinfo);
+          if (e->buffer + 1 == 0xFF)
+            emit_byte(0x00, cinfo);
+        }
+        e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+        e->sc = 0;
+        /* Note: The 3 spacer bits in the C register guarantee
+         * that the new buffer byte can't be 0xFF here
+         * (see page 160 in the P&M JPEG book). */
+        e->buffer = temp & 0xFF;  /* new output byte, might overflow later */
       } else if (temp == 0xFF) {
-	++e->sc;  /* stack 0xFF byte (which might overflow later) */
+        ++e->sc;  /* stack 0xFF byte (which might overflow later) */
       } else {
-	/* Output all stacked 0xFF bytes, they will not overflow any more */
-	if (e->buffer == 0)
-	  ++e->zc;
-	else if (e->buffer >= 0) {
-	  if (e->zc)
-	    do emit_byte(0x00, cinfo);
-	    while (--e->zc);
-	  emit_byte(e->buffer, cinfo);
-	}
-	if (e->sc) {
-	  if (e->zc)
-	    do emit_byte(0x00, cinfo);
-	    while (--e->zc);
-	  do {
-	    emit_byte(0xFF, cinfo);
-	    emit_byte(0x00, cinfo);
-	  } while (--e->sc);
-	}
-	e->buffer = temp & 0xFF;  /* new output byte (can still overflow) */
+        /* Output all stacked 0xFF bytes, they will not overflow any more */
+        if (e->buffer == 0)
+          ++e->zc;
+        else if (e->buffer >= 0) {
+          if (e->zc)
+            do emit_byte(0x00, cinfo);
+            while (--e->zc);
+          emit_byte(e->buffer, cinfo);
+        }
+        if (e->sc) {
+          if (e->zc)
+            do emit_byte(0x00, cinfo);
+            while (--e->zc);
+          do {
+            emit_byte(0xFF, cinfo);
+            emit_byte(0x00, cinfo);
+          } while (--e->sc);
+        }
+        e->buffer = temp & 0xFF;  /* new output byte (can still overflow) */
       }
       e->c &= 0x7FFFFL;
       e->ct += 8;
@@ -320,7 +323,7 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   finish_pass(cinfo);
 
@@ -398,45 +401,45 @@
     /* Figure F.4: Encode_DC_DIFF */
     if ((v = m - entropy->last_dc_val[ci]) == 0) {
       arith_encode(cinfo, st, 0);
-      entropy->dc_context[ci] = 0;	/* zero diff category */
+      entropy->dc_context[ci] = 0;      /* zero diff category */
     } else {
       entropy->last_dc_val[ci] = m;
       arith_encode(cinfo, st, 1);
       /* Figure F.6: Encoding nonzero value v */
       /* Figure F.7: Encoding the sign of v */
       if (v > 0) {
-	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
-	st += 2;			/* Table F.4: SP = S0 + 2 */
-	entropy->dc_context[ci] = 4;	/* small positive diff category */
+        arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */
+        st += 2;                        /* Table F.4: SP = S0 + 2 */
+        entropy->dc_context[ci] = 4;    /* small positive diff category */
       } else {
-	v = -v;
-	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
-	st += 3;			/* Table F.4: SN = S0 + 3 */
-	entropy->dc_context[ci] = 8;	/* small negative diff category */
+        v = -v;
+        arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */
+        st += 3;                        /* Table F.4: SN = S0 + 3 */
+        entropy->dc_context[ci] = 8;    /* small negative diff category */
       }
       /* Figure F.8: Encoding the magnitude category of v */
       m = 0;
       if (v -= 1) {
-	arith_encode(cinfo, st, 1);
-	m = 1;
-	v2 = v;
-	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
-	while (v2 >>= 1) {
-	  arith_encode(cinfo, st, 1);
-	  m <<= 1;
-	  st += 1;
-	}
+        arith_encode(cinfo, st, 1);
+        m = 1;
+        v2 = v;
+        st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+        while (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st += 1;
+        }
       }
       arith_encode(cinfo, st, 0);
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
       if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
-	entropy->dc_context[ci] = 0;	/* zero diff category */
+        entropy->dc_context[ci] = 0;    /* zero diff category */
       else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
-	entropy->dc_context[ci] += 8;	/* large diff category */
+        entropy->dc_context[ci] += 8;   /* large diff category */
       /* Figure F.9: Encoding the magnitude bit pattern of v */
       st += 14;
       while (m >>= 1)
-	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+        arith_encode(cinfo, st, (m & v) ? 1 : 0);
     }
   }
 
@@ -491,21 +494,21 @@
   /* Figure F.5: Encode_AC_Coefficients */
   for (k = cinfo->Ss; k <= ke; k++) {
     st = entropy->ac_stats[tbl] + 3 * (k - 1);
-    arith_encode(cinfo, st, 0);		/* EOB decision */
+    arith_encode(cinfo, st, 0);         /* EOB decision */
     for (;;) {
       if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
-	if (v >>= cinfo->Al) {
-	  arith_encode(cinfo, st + 1, 1);
-	  arith_encode(cinfo, entropy->fixed_bin, 0);
-	  break;
-	}
+        if (v >>= cinfo->Al) {
+          arith_encode(cinfo, st + 1, 1);
+          arith_encode(cinfo, entropy->fixed_bin, 0);
+          break;
+        }
       } else {
-	v = -v;
-	if (v >>= cinfo->Al) {
-	  arith_encode(cinfo, st + 1, 1);
-	  arith_encode(cinfo, entropy->fixed_bin, 1);
-	  break;
-	}
+        v = -v;
+        if (v >>= cinfo->Al) {
+          arith_encode(cinfo, st + 1, 1);
+          arith_encode(cinfo, entropy->fixed_bin, 1);
+          break;
+        }
       }
       arith_encode(cinfo, st + 1, 0); st += 3; k++;
     }
@@ -517,15 +520,15 @@
       m = 1;
       v2 = v;
       if (v2 >>= 1) {
-	arith_encode(cinfo, st, 1);
-	m <<= 1;
-	st = entropy->ac_stats[tbl] +
-	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
-	while (v2 >>= 1) {
-	  arith_encode(cinfo, st, 1);
-	  m <<= 1;
-	  st += 1;
-	}
+        arith_encode(cinfo, st, 1);
+        m <<= 1;
+        st = entropy->ac_stats[tbl] +
+             (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+        while (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st += 1;
+        }
       }
     }
     arith_encode(cinfo, st, 0);
@@ -566,7 +569,7 @@
     entropy->restarts_to_go--;
   }
 
-  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  st = entropy->fixed_bin;      /* use fixed probability estimation */
   Al = cinfo->Al;
 
   /* Encode the MCU data blocks */
@@ -635,29 +638,29 @@
   for (k = cinfo->Ss; k <= ke; k++) {
     st = entropy->ac_stats[tbl] + 3 * (k - 1);
     if (k > kex)
-      arith_encode(cinfo, st, 0);	/* EOB decision */
+      arith_encode(cinfo, st, 0);       /* EOB decision */
     for (;;) {
       if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
-	if (v >>= cinfo->Al) {
-	  if (v >> 1)			/* previously nonzero coef */
-	    arith_encode(cinfo, st + 2, (v & 1));
-	  else {			/* newly nonzero coef */
-	    arith_encode(cinfo, st + 1, 1);
-	    arith_encode(cinfo, entropy->fixed_bin, 0);
-	  }
-	  break;
-	}
+        if (v >>= cinfo->Al) {
+          if (v >> 1)                   /* previously nonzero coef */
+            arith_encode(cinfo, st + 2, (v & 1));
+          else {                        /* newly nonzero coef */
+            arith_encode(cinfo, st + 1, 1);
+            arith_encode(cinfo, entropy->fixed_bin, 0);
+          }
+          break;
+        }
       } else {
-	v = -v;
-	if (v >>= cinfo->Al) {
-	  if (v >> 1)			/* previously nonzero coef */
-	    arith_encode(cinfo, st + 2, (v & 1));
-	  else {			/* newly nonzero coef */
-	    arith_encode(cinfo, st + 1, 1);
-	    arith_encode(cinfo, entropy->fixed_bin, 1);
-	  }
-	  break;
-	}
+        v = -v;
+        if (v >>= cinfo->Al) {
+          if (v >> 1)                   /* previously nonzero coef */
+            arith_encode(cinfo, st + 2, (v & 1));
+          else {                        /* newly nonzero coef */
+            arith_encode(cinfo, st + 1, 1);
+            arith_encode(cinfo, entropy->fixed_bin, 1);
+          }
+          break;
+        }
       }
       arith_encode(cinfo, st + 1, 0); st += 3; k++;
     }
@@ -680,7 +683,7 @@
 encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl, k, ke;
@@ -713,45 +716,45 @@
     /* Figure F.4: Encode_DC_DIFF */
     if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) {
       arith_encode(cinfo, st, 0);
-      entropy->dc_context[ci] = 0;	/* zero diff category */
+      entropy->dc_context[ci] = 0;      /* zero diff category */
     } else {
       entropy->last_dc_val[ci] = (*block)[0];
       arith_encode(cinfo, st, 1);
       /* Figure F.6: Encoding nonzero value v */
       /* Figure F.7: Encoding the sign of v */
       if (v > 0) {
-	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
-	st += 2;			/* Table F.4: SP = S0 + 2 */
-	entropy->dc_context[ci] = 4;	/* small positive diff category */
+        arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */
+        st += 2;                        /* Table F.4: SP = S0 + 2 */
+        entropy->dc_context[ci] = 4;    /* small positive diff category */
       } else {
-	v = -v;
-	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
-	st += 3;			/* Table F.4: SN = S0 + 3 */
-	entropy->dc_context[ci] = 8;	/* small negative diff category */
+        v = -v;
+        arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */
+        st += 3;                        /* Table F.4: SN = S0 + 3 */
+        entropy->dc_context[ci] = 8;    /* small negative diff category */
       }
       /* Figure F.8: Encoding the magnitude category of v */
       m = 0;
       if (v -= 1) {
-	arith_encode(cinfo, st, 1);
-	m = 1;
-	v2 = v;
-	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
-	while (v2 >>= 1) {
-	  arith_encode(cinfo, st, 1);
-	  m <<= 1;
-	  st += 1;
-	}
+        arith_encode(cinfo, st, 1);
+        m = 1;
+        v2 = v;
+        st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+        while (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st += 1;
+        }
       }
       arith_encode(cinfo, st, 0);
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
       if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
-	entropy->dc_context[ci] = 0;	/* zero diff category */
+        entropy->dc_context[ci] = 0;    /* zero diff category */
       else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
-	entropy->dc_context[ci] += 8;	/* large diff category */
+        entropy->dc_context[ci] += 8;   /* large diff category */
       /* Figure F.9: Encoding the magnitude bit pattern of v */
       st += 14;
       while (m >>= 1)
-	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+        arith_encode(cinfo, st, (m & v) ? 1 : 0);
     }
 
     /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
@@ -765,43 +768,43 @@
     /* Figure F.5: Encode_AC_Coefficients */
     for (k = 1; k <= ke; k++) {
       st = entropy->ac_stats[tbl] + 3 * (k - 1);
-      arith_encode(cinfo, st, 0);	/* EOB decision */
+      arith_encode(cinfo, st, 0);       /* EOB decision */
       while ((v = (*block)[jpeg_natural_order[k]]) == 0) {
-	arith_encode(cinfo, st + 1, 0); st += 3; k++;
+        arith_encode(cinfo, st + 1, 0); st += 3; k++;
       }
       arith_encode(cinfo, st + 1, 1);
       /* Figure F.6: Encoding nonzero value v */
       /* Figure F.7: Encoding the sign of v */
       if (v > 0) {
-	arith_encode(cinfo, entropy->fixed_bin, 0);
+        arith_encode(cinfo, entropy->fixed_bin, 0);
       } else {
-	v = -v;
-	arith_encode(cinfo, entropy->fixed_bin, 1);
+        v = -v;
+        arith_encode(cinfo, entropy->fixed_bin, 1);
       }
       st += 2;
       /* Figure F.8: Encoding the magnitude category of v */
       m = 0;
       if (v -= 1) {
-	arith_encode(cinfo, st, 1);
-	m = 1;
-	v2 = v;
-	if (v2 >>= 1) {
-	  arith_encode(cinfo, st, 1);
-	  m <<= 1;
-	  st = entropy->ac_stats[tbl] +
-	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
-	  while (v2 >>= 1) {
-	    arith_encode(cinfo, st, 1);
-	    m <<= 1;
-	    st += 1;
-	  }
-	}
+        arith_encode(cinfo, st, 1);
+        m = 1;
+        v2 = v;
+        if (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st = entropy->ac_stats[tbl] +
+               (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+          while (v2 >>= 1) {
+            arith_encode(cinfo, st, 1);
+            m <<= 1;
+            st += 1;
+          }
+        }
       }
       arith_encode(cinfo, st, 0);
       /* Figure F.9: Encoding the magnitude bit pattern of v */
       st += 14;
       while (m >>= 1)
-	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+        arith_encode(cinfo, st, (m & v) ? 1 : 0);
     }
     /* Encode EOB decision only if k <= DCTSIZE2 - 1 */
     if (k <= DCTSIZE2 - 1) {
@@ -823,7 +826,7 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   if (gather_statistics)
     /* Make sure to avoid that in the master control logic!
@@ -838,14 +841,14 @@
   if (cinfo->progressive_mode) {
     if (cinfo->Ah == 0) {
       if (cinfo->Ss == 0)
-	entropy->pub.encode_mcu = encode_mcu_DC_first;
+        entropy->pub.encode_mcu = encode_mcu_DC_first;
       else
-	entropy->pub.encode_mcu = encode_mcu_AC_first;
+        entropy->pub.encode_mcu = encode_mcu_AC_first;
     } else {
       if (cinfo->Ss == 0)
-	entropy->pub.encode_mcu = encode_mcu_DC_refine;
+        entropy->pub.encode_mcu = encode_mcu_DC_refine;
       else
-	entropy->pub.encode_mcu = encode_mcu_AC_refine;
+        entropy->pub.encode_mcu = encode_mcu_AC_refine;
     }
   } else
     entropy->pub.encode_mcu = encode_mcu;
@@ -857,10 +860,10 @@
     if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
       tbl = compptr->dc_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
-	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->dc_stats[tbl] == NULL)
-	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
       MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
@@ -870,15 +873,15 @@
     if (cinfo->progressive_mode == 0 || cinfo->Se) {
       tbl = compptr->ac_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
-	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->ac_stats[tbl] == NULL)
-	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
       MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
 #ifdef CALCULATE_SPECTRAL_CONDITIONING
       if (cinfo->progressive_mode)
-	/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
-	cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+        /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
+        cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
 #endif
     }
   }
@@ -909,7 +912,7 @@
 
   entropy = (arith_entropy_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(arith_entropy_encoder));
+                                sizeof(arith_entropy_encoder));
   cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
   entropy->pub.start_pass = start_pass;
   entropy->pub.finish_pass = finish_pass;
diff --git a/jccoefct.c b/jccoefct.c
index 1963ddb..a08d6e3 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -1,9 +1,12 @@
 /*
  * jccoefct.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the coefficient buffer controller for compression.
  * This controller is the top level of the JPEG compressor proper.
@@ -34,19 +37,16 @@
 typedef struct {
   struct jpeg_c_coef_controller pub; /* public fields */
 
-  JDIMENSION iMCU_row_num;	/* iMCU row # within image */
-  JDIMENSION mcu_ctr;		/* counts MCUs processed in current row */
-  int MCU_vert_offset;		/* counts MCU rows within iMCU row */
-  int MCU_rows_per_iMCU_row;	/* number of such rows needed */
+  JDIMENSION iMCU_row_num;      /* iMCU row # within image */
+  JDIMENSION mcu_ctr;           /* counts MCUs processed in current row */
+  int MCU_vert_offset;          /* counts MCU rows within iMCU row */
+  int MCU_rows_per_iMCU_row;    /* number of such rows needed */
 
   /* For single-pass compression, it's sufficient to buffer just one MCU
    * (although this may prove a bit slow in practice).  We allocate a
    * workspace of C_MAX_BLOCKS_IN_MCU coefficient blocks, and reuse it for each
-   * MCU constructed and sent.  (On 80x86, the workspace is FAR even though
-   * it's not really very big; this is to keep the module interfaces unchanged
-   * when a large coefficient buffer is necessary.)
-   * In multi-pass modes, this array points to the current MCU's blocks
-   * within the virtual arrays.
+   * MCU constructed and sent.  In multi-pass modes, this array points to the
+   * current MCU's blocks within the virtual arrays.
    */
   JBLOCKROW MCU_buffer[C_MAX_BLOCKS_IN_MCU];
 
@@ -54,17 +54,17 @@
   jvirt_barray_ptr whole_image[MAX_COMPONENTS];
 } my_coef_controller;
 
-typedef my_coef_controller * my_coef_ptr;
+typedef my_coef_controller *my_coef_ptr;
 
 
 /* Forward declarations */
 METHODDEF(boolean) compress_data
-    JPP((j_compress_ptr cinfo, JSAMPIMAGE input_buf));
+        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 #ifdef FULL_COEF_BUFFER_SUPPORTED
 METHODDEF(boolean) compress_first_pass
-    JPP((j_compress_ptr cinfo, JSAMPIMAGE input_buf));
+        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 METHODDEF(boolean) compress_output
-    JPP((j_compress_ptr cinfo, JSAMPIMAGE input_buf));
+        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 #endif
 
 
@@ -143,7 +143,7 @@
 compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
   my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-  JDIMENSION MCU_col_num;	/* index of current MCU within row */
+  JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   int blkn, bi, ci, yindex, yoffset, blockcnt;
@@ -154,7 +154,7 @@
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
     for (MCU_col_num = coef->mcu_ctr; MCU_col_num <= last_MCU_col;
-	 MCU_col_num++) {
+         MCU_col_num++) {
       /* Determine where data comes from in input_buf and do the DCT thing.
        * Each call on forward_DCT processes a horizontal row of DCT blocks
        * as wide as an MCU; we rely on having allocated the MCU_buffer[] blocks
@@ -166,46 +166,46 @@
        */
       blkn = 0;
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-						: compptr->last_col_width;
-	xpos = MCU_col_num * compptr->MCU_sample_width;
-	ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  if (coef->iMCU_row_num < last_iMCU_row ||
-	      yoffset+yindex < compptr->last_row_height) {
-	    (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-					 input_buf[compptr->component_index],
-					 coef->MCU_buffer[blkn],
-					 ypos, xpos, (JDIMENSION) blockcnt);
-	    if (blockcnt < compptr->MCU_width) {
-	      /* Create some dummy blocks at the right edge of the image. */
-	      jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt],
-			(compptr->MCU_width - blockcnt) * SIZEOF(JBLOCK));
-	      for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
-		coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
-	      }
-	    }
-	  } else {
-	    /* Create a row of dummy blocks at the bottom of the image. */
-	    jzero_far((void FAR *) coef->MCU_buffer[blkn],
-		      compptr->MCU_width * SIZEOF(JBLOCK));
-	    for (bi = 0; bi < compptr->MCU_width; bi++) {
-	      coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
-	    }
-	  }
-	  blkn += compptr->MCU_width;
-	  ypos += DCTSIZE;
-	}
+        compptr = cinfo->cur_comp_info[ci];
+        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
+                                                : compptr->last_col_width;
+        xpos = MCU_col_num * compptr->MCU_sample_width;
+        ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
+        for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+          if (coef->iMCU_row_num < last_iMCU_row ||
+              yoffset+yindex < compptr->last_row_height) {
+            (*cinfo->fdct->forward_DCT) (cinfo, compptr,
+                                         input_buf[compptr->component_index],
+                                         coef->MCU_buffer[blkn],
+                                         ypos, xpos, (JDIMENSION) blockcnt);
+            if (blockcnt < compptr->MCU_width) {
+              /* Create some dummy blocks at the right edge of the image. */
+              jzero_far((void *) coef->MCU_buffer[blkn + blockcnt],
+                        (compptr->MCU_width - blockcnt) * sizeof(JBLOCK));
+              for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
+                coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
+              }
+            }
+          } else {
+            /* Create a row of dummy blocks at the bottom of the image. */
+            jzero_far((void *) coef->MCU_buffer[blkn],
+                      compptr->MCU_width * sizeof(JBLOCK));
+            for (bi = 0; bi < compptr->MCU_width; bi++) {
+              coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
+            }
+          }
+          blkn += compptr->MCU_width;
+          ypos += DCTSIZE;
+        }
       }
       /* Try to write the MCU.  In event of a suspension failure, we will
        * re-DCT the MCU on restart (a bit inefficient, could be fixed...)
        */
       if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->mcu_ctr = MCU_col_num;
-	return FALSE;
+        /* Suspension forced; update state counters and exit */
+        coef->MCU_vert_offset = yoffset;
+        coef->mcu_ctr = MCU_col_num;
+        return FALSE;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -280,17 +280,17 @@
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
       (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-				   input_buf[ci], thisblockrow,
-				   (JDIMENSION) (block_row * DCTSIZE),
-				   (JDIMENSION) 0, blocks_across);
+                                   input_buf[ci], thisblockrow,
+                                   (JDIMENSION) (block_row * DCTSIZE),
+                                   (JDIMENSION) 0, blocks_across);
       if (ndummy > 0) {
-	/* Create dummy blocks at the right edge of the image. */
-	thisblockrow += blocks_across; /* => first dummy block */
-	jzero_far((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK));
-	lastDC = thisblockrow[-1][0];
-	for (bi = 0; bi < ndummy; bi++) {
-	  thisblockrow[bi][0] = lastDC;
-	}
+        /* Create dummy blocks at the right edge of the image. */
+        thisblockrow += blocks_across; /* => first dummy block */
+        jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK));
+        lastDC = thisblockrow[-1][0];
+        for (bi = 0; bi < ndummy; bi++) {
+          thisblockrow[bi][0] = lastDC;
+        }
       }
     }
     /* If at end of image, create dummy block rows as needed.
@@ -299,22 +299,22 @@
      * This squeezes a few more bytes out of the resulting file...
      */
     if (coef->iMCU_row_num == last_iMCU_row) {
-      blocks_across += ndummy;	/* include lower right corner */
+      blocks_across += ndummy;  /* include lower right corner */
       MCUs_across = blocks_across / h_samp_factor;
       for (block_row = block_rows; block_row < compptr->v_samp_factor;
-	   block_row++) {
-	thisblockrow = buffer[block_row];
-	lastblockrow = buffer[block_row-1];
-	jzero_far((void FAR *) thisblockrow,
-		  (size_t) (blocks_across * SIZEOF(JBLOCK)));
-	for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
-	  lastDC = lastblockrow[h_samp_factor-1][0];
-	  for (bi = 0; bi < h_samp_factor; bi++) {
-	    thisblockrow[bi][0] = lastDC;
-	  }
-	  thisblockrow += h_samp_factor; /* advance to next MCU in row */
-	  lastblockrow += h_samp_factor;
-	}
+           block_row++) {
+        thisblockrow = buffer[block_row];
+        lastblockrow = buffer[block_row-1];
+        jzero_far((void *) thisblockrow,
+                  (size_t) (blocks_across * sizeof(JBLOCK)));
+        for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
+          lastDC = lastblockrow[h_samp_factor-1][0];
+          for (bi = 0; bi < h_samp_factor; bi++) {
+            thisblockrow[bi][0] = lastDC;
+          }
+          thisblockrow += h_samp_factor; /* advance to next MCU in row */
+          lastblockrow += h_samp_factor;
+        }
       }
     }
   }
@@ -341,7 +341,7 @@
 compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
   my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-  JDIMENSION MCU_col_num;	/* index of current MCU within row */
+  JDIMENSION MCU_col_num;       /* index of current MCU within row */
   int blkn, ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
   JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
@@ -364,25 +364,25 @@
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
     for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row;
-	 MCU_col_num++) {
+         MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
-      blkn = 0;			/* index of current DCT block within MCU */
+      blkn = 0;                 /* index of current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	start_col = MCU_col_num * compptr->MCU_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
-	    coef->MCU_buffer[blkn++] = buffer_ptr++;
-	  }
-	}
+        compptr = cinfo->cur_comp_info[ci];
+        start_col = MCU_col_num * compptr->MCU_width;
+        for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+          buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+          for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
+            coef->MCU_buffer[blkn++] = buffer_ptr++;
+          }
+        }
       }
       /* Try to write the MCU. */
       if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->mcu_ctr = MCU_col_num;
-	return FALSE;
+        /* Suspension forced; update state counters and exit */
+        coef->MCU_vert_offset = yoffset;
+        coef->mcu_ctr = MCU_col_num;
+        return FALSE;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -408,7 +408,7 @@
 
   coef = (my_coef_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_coef_controller));
+                                sizeof(my_coef_controller));
   cinfo->coef = (struct jpeg_c_coef_controller *) coef;
   coef->pub.start_pass = start_pass_coef;
 
@@ -421,14 +421,14 @@
     jpeg_component_info *compptr;
 
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
+         ci++, compptr++) {
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-	 (JDIMENSION) jround_up((long) compptr->width_in_blocks,
-				(long) compptr->h_samp_factor),
-	 (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-				(long) compptr->v_samp_factor),
-	 (JDIMENSION) compptr->v_samp_factor);
+        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+         (JDIMENSION) jround_up((long) compptr->width_in_blocks,
+                                (long) compptr->h_samp_factor),
+         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
+                                (long) compptr->v_samp_factor),
+         (JDIMENSION) compptr->v_samp_factor);
     }
 #else
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -440,7 +440,7 @@
 
     buffer = (JBLOCKROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
+                                  C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
     for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
     }
diff --git a/jccolext.c b/jccolext.c
index dda3beb..479b320 100644
--- a/jccolext.c
+++ b/jccolext.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2012, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009-2012, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains input colorspace conversion routines.
  */
@@ -34,7 +35,7 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register JLONG * ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2;
   register JDIMENSION col;
@@ -58,16 +59,16 @@
        */
       /* Y */
       outptr0[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+                 >> SCALEBITS);
       /* Cb */
       outptr1[col] = (JSAMPLE)
-		((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
+                 >> SCALEBITS);
       /* Cr */
       outptr2[col] = (JSAMPLE)
-		((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
+                 >> SCALEBITS);
     }
   }
 }
@@ -91,7 +92,7 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register JLONG * ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr;
   register JDIMENSION col;
@@ -108,8 +109,8 @@
       inptr += RGB_PIXELSIZE;
       /* Y */
       outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+                 >> SCALEBITS);
     }
   }
 }
diff --git a/jccolor.c b/jccolor.c
index 94b4184..a93498a 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -5,8 +5,10 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2012, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009-2012, 2015 D. R. Commander.
+ * Copyright (C) 2014, MIPS Technologies, Inc., California
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains input colorspace conversion routines.
  */
@@ -15,7 +17,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jsimd.h"
-#include "config.h"
+#include "jconfigint.h"
 
 
 /* Private subobject */
@@ -24,10 +26,10 @@
   struct jpeg_color_converter pub; /* public fields */
 
   /* Private state for RGB->YCC conversion */
-  INT32 * rgb_ycc_tab;		/* => table for RGB to YCbCr conversion */
+  JLONG *rgb_ycc_tab;           /* => table for RGB to YCbCr conversion */
 } my_color_converter;
 
-typedef my_color_converter * my_cconvert_ptr;
+typedef my_color_converter *my_cconvert_ptr;
 
 
 /**************** RGB -> YCbCr conversion: most common case **************/
@@ -36,9 +38,9 @@
  * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
  * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
  * The conversion equations to be implemented are therefore
- *	Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
- *	Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + CENTERJSAMPLE
- *	Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + CENTERJSAMPLE
+ *      Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *      Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + CENTERJSAMPLE
+ *      Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + CENTERJSAMPLE
  * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
  * Note: older versions of the IJG code used a zero offset of MAXJSAMPLE/2,
  * rather than CENTERJSAMPLE, for Cb and Cr.  This gave equal positive and
@@ -60,10 +62,10 @@
  * in the tables to save adding them separately in the inner loop.
  */
 
-#define SCALEBITS	16	/* speediest right-shift on some machines */
-#define CBCR_OFFSET	((INT32) CENTERJSAMPLE << SCALEBITS)
-#define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
-#define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
+#define SCALEBITS       16      /* speediest right-shift on some machines */
+#define CBCR_OFFSET     ((JLONG) CENTERJSAMPLE << SCALEBITS)
+#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
+#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
 
 /* We allocate one big table and divide it up into eight parts, instead of
  * doing eight alloc_small requests.  This lets us use a single table base
@@ -71,16 +73,16 @@
  * machines (more than can hold all eight addresses, anyway).
  */
 
-#define R_Y_OFF		0			/* offset to R => Y section */
-#define G_Y_OFF		(1*(MAXJSAMPLE+1))	/* offset to G => Y section */
-#define B_Y_OFF		(2*(MAXJSAMPLE+1))	/* etc. */
-#define R_CB_OFF	(3*(MAXJSAMPLE+1))
-#define G_CB_OFF	(4*(MAXJSAMPLE+1))
-#define B_CB_OFF	(5*(MAXJSAMPLE+1))
-#define R_CR_OFF	B_CB_OFF		/* B=>Cb, R=>Cr are the same */
-#define G_CR_OFF	(6*(MAXJSAMPLE+1))
-#define B_CR_OFF	(7*(MAXJSAMPLE+1))
-#define TABLE_SIZE	(8*(MAXJSAMPLE+1))
+#define R_Y_OFF         0                       /* offset to R => Y section */
+#define G_Y_OFF         (1*(MAXJSAMPLE+1))      /* offset to G => Y section */
+#define B_Y_OFF         (2*(MAXJSAMPLE+1))      /* etc. */
+#define R_CB_OFF        (3*(MAXJSAMPLE+1))
+#define G_CB_OFF        (4*(MAXJSAMPLE+1))
+#define B_CB_OFF        (5*(MAXJSAMPLE+1))
+#define R_CR_OFF        B_CB_OFF                /* B=>Cb, R=>Cr are the same */
+#define G_CR_OFF        (6*(MAXJSAMPLE+1))
+#define B_CR_OFF        (7*(MAXJSAMPLE+1))
+#define TABLE_SIZE      (8*(MAXJSAMPLE+1))
 
 
 /* Include inline routines for colorspace extensions */
@@ -196,13 +198,13 @@
 rgb_ycc_start (j_compress_ptr cinfo)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  INT32 * rgb_ycc_tab;
-  INT32 i;
+  JLONG *rgb_ycc_tab;
+  JLONG i;
 
   /* Allocate and fill in the conversion tables. */
-  cconvert->rgb_ycc_tab = rgb_ycc_tab = (INT32 *)
+  cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(TABLE_SIZE * SIZEOF(INT32)));
+                                (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
     rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i;
@@ -230,8 +232,8 @@
 
 METHODDEF(void)
 rgb_ycc_convert (j_compress_ptr cinfo,
-		 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-		 JDIMENSION output_row, int num_rows)
+                 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                 JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
     case JCS_EXT_RGB:
@@ -279,8 +281,8 @@
 
 METHODDEF(void)
 rgb_gray_convert (j_compress_ptr cinfo,
-		  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-		  JDIMENSION output_row, int num_rows)
+                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                  JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
     case JCS_EXT_RGB:
@@ -325,8 +327,8 @@
 
 METHODDEF(void)
 rgb_rgb_convert (j_compress_ptr cinfo,
-		  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-		  JDIMENSION output_row, int num_rows)
+                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                  JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
     case JCS_EXT_RGB:
@@ -375,12 +377,12 @@
 
 METHODDEF(void)
 cmyk_ycck_convert (j_compress_ptr cinfo,
-		   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-		   JDIMENSION output_row, int num_rows)
+                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                   JDIMENSION output_row, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2, outptr3;
   register JDIMENSION col;
@@ -398,7 +400,7 @@
       g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
       b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
       /* K passes through as-is */
-      outptr3[col] = inptr[3];	/* don't need GETJSAMPLE here */
+      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
       inptr += 4;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -407,16 +409,16 @@
        */
       /* Y */
       outptr0[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+                 >> SCALEBITS);
       /* Cb */
       outptr1[col] = (JSAMPLE)
-		((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
+                 >> SCALEBITS);
       /* Cr */
       outptr2[col] = (JSAMPLE)
-		((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
+                 >> SCALEBITS);
     }
   }
 }
@@ -430,8 +432,8 @@
 
 METHODDEF(void)
 grayscale_convert (j_compress_ptr cinfo,
-		   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-		   JDIMENSION output_row, int num_rows)
+                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                   JDIMENSION output_row, int num_rows)
 {
   register JSAMPROW inptr;
   register JSAMPROW outptr;
@@ -444,7 +446,7 @@
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr[col] = inptr[0];	/* don't need GETJSAMPLE() here */
+      outptr[col] = inptr[0];   /* don't need GETJSAMPLE() here */
       inptr += instride;
     }
   }
@@ -459,28 +461,58 @@
 
 METHODDEF(void)
 null_convert (j_compress_ptr cinfo,
-	      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-	      JDIMENSION output_row, int num_rows)
+              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+              JDIMENSION output_row, int num_rows)
 {
   register JSAMPROW inptr;
-  register JSAMPROW outptr;
+  register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
   register JDIMENSION col;
   register int ci;
   int nc = cinfo->num_components;
   JDIMENSION num_cols = cinfo->image_width;
 
-  while (--num_rows >= 0) {
-    /* It seems fastest to make a separate pass for each component. */
-    for (ci = 0; ci < nc; ci++) {
-      inptr = *input_buf;
-      outptr = output_buf[ci][output_row];
+  if (nc == 3) {
+    while (--num_rows >= 0) {
+      inptr = *input_buf++;
+      outptr0 = output_buf[0][output_row];
+      outptr1 = output_buf[1][output_row];
+      outptr2 = output_buf[2][output_row];
+      output_row++;
       for (col = 0; col < num_cols; col++) {
-	outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
-	inptr += nc;
+        outptr0[col] = *inptr++;
+        outptr1[col] = *inptr++;
+        outptr2[col] = *inptr++;
       }
     }
-    input_buf++;
-    output_row++;
+  } else if (nc == 4) {
+    while (--num_rows >= 0) {
+      inptr = *input_buf++;
+      outptr0 = output_buf[0][output_row];
+      outptr1 = output_buf[1][output_row];
+      outptr2 = output_buf[2][output_row];
+      outptr3 = output_buf[3][output_row];
+      output_row++;
+      for (col = 0; col < num_cols; col++) {
+        outptr0[col] = *inptr++;
+        outptr1[col] = *inptr++;
+        outptr2[col] = *inptr++;
+        outptr3[col] = *inptr++;
+      }
+    }
+  } else {
+    while (--num_rows >= 0) {
+      /* It seems fastest to make a separate pass for each component. */
+      for (ci = 0; ci < nc; ci++) {
+        inptr = *input_buf;
+        outptr = output_buf[ci][output_row];
+        for (col = 0; col < num_cols; col++) {
+          outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+          inptr += nc;
+        }
+      }
+      input_buf++;
+      output_row++;
+    }
   }
 }
 
@@ -507,7 +539,7 @@
 
   cconvert = (my_cconvert_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_color_converter));
+                                sizeof(my_color_converter));
   cinfo->cconvert = (struct jpeg_color_converter *) cconvert;
   /* set start_pass to null method until we find out differently */
   cconvert->pub.start_pass = null_method;
@@ -545,7 +577,7 @@
       ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     break;
 
-  default:			/* JCS_UNKNOWN can be anything */
+  default:                      /* JCS_UNKNOWN can be anything */
     if (cinfo->input_components < 1)
       ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     break;
@@ -587,19 +619,24 @@
     if (rgb_red[cinfo->in_color_space] == 0 &&
         rgb_green[cinfo->in_color_space] == 1 &&
         rgb_blue[cinfo->in_color_space] == 2 &&
-        rgb_pixelsize[cinfo->in_color_space] == 3)
-      cconvert->pub.color_convert = null_convert;
-    else if (cinfo->in_color_space == JCS_RGB ||
-             cinfo->in_color_space == JCS_EXT_RGB ||
-             cinfo->in_color_space == JCS_EXT_RGBX ||
-             cinfo->in_color_space == JCS_EXT_BGR ||
-             cinfo->in_color_space == JCS_EXT_BGRX ||
-             cinfo->in_color_space == JCS_EXT_XBGR ||
-             cinfo->in_color_space == JCS_EXT_XRGB ||
-             cinfo->in_color_space == JCS_EXT_RGBA ||
-             cinfo->in_color_space == JCS_EXT_BGRA ||
-             cinfo->in_color_space == JCS_EXT_ABGR ||
-             cinfo->in_color_space == JCS_EXT_ARGB)
+        rgb_pixelsize[cinfo->in_color_space] == 3) {
+#if defined(__mips__)
+      if (jsimd_c_can_null_convert())
+        cconvert->pub.color_convert = jsimd_c_null_convert;
+      else
+#endif
+        cconvert->pub.color_convert = null_convert;
+    } else if (cinfo->in_color_space == JCS_RGB ||
+               cinfo->in_color_space == JCS_EXT_RGB ||
+               cinfo->in_color_space == JCS_EXT_RGBX ||
+               cinfo->in_color_space == JCS_EXT_BGR ||
+               cinfo->in_color_space == JCS_EXT_BGRX ||
+               cinfo->in_color_space == JCS_EXT_XBGR ||
+               cinfo->in_color_space == JCS_EXT_XRGB ||
+               cinfo->in_color_space == JCS_EXT_RGBA ||
+               cinfo->in_color_space == JCS_EXT_BGRA ||
+               cinfo->in_color_space == JCS_EXT_ABGR ||
+               cinfo->in_color_space == JCS_EXT_ARGB)
       cconvert->pub.color_convert = rgb_rgb_convert;
     else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
@@ -625,18 +662,28 @@
         cconvert->pub.start_pass = rgb_ycc_start;
         cconvert->pub.color_convert = rgb_ycc_convert;
       }
-    } else if (cinfo->in_color_space == JCS_YCbCr)
-      cconvert->pub.color_convert = null_convert;
-    else
+    } else if (cinfo->in_color_space == JCS_YCbCr) {
+#if defined(__mips__)
+      if (jsimd_c_can_null_convert())
+        cconvert->pub.color_convert = jsimd_c_null_convert;
+      else
+#endif
+        cconvert->pub.color_convert = null_convert;
+    } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
   case JCS_CMYK:
     if (cinfo->num_components != 4)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_CMYK)
-      cconvert->pub.color_convert = null_convert;
-    else
+    if (cinfo->in_color_space == JCS_CMYK) {
+#if defined(__mips__)
+      if (jsimd_c_can_null_convert())
+        cconvert->pub.color_convert = jsimd_c_null_convert;
+      else
+#endif
+        cconvert->pub.color_convert = null_convert;
+    } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
@@ -646,17 +693,27 @@
     if (cinfo->in_color_space == JCS_CMYK) {
       cconvert->pub.start_pass = rgb_ycc_start;
       cconvert->pub.color_convert = cmyk_ycck_convert;
-    } else if (cinfo->in_color_space == JCS_YCCK)
-      cconvert->pub.color_convert = null_convert;
-    else
+    } else if (cinfo->in_color_space == JCS_YCCK) {
+#if defined(__mips__)
+      if (jsimd_c_can_null_convert())
+        cconvert->pub.color_convert = jsimd_c_null_convert;
+      else
+#endif
+        cconvert->pub.color_convert = null_convert;
+    } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
-  default:			/* allow null conversion of JCS_UNKNOWN */
+  default:                      /* allow null conversion of JCS_UNKNOWN */
     if (cinfo->jpeg_color_space != cinfo->in_color_space ||
-	cinfo->num_components != cinfo->input_components)
+        cinfo->num_components != cinfo->input_components)
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
-    cconvert->pub.color_convert = null_convert;
+#if defined(__mips__)
+    if (jsimd_c_can_null_convert())
+      cconvert->pub.color_convert = jsimd_c_null_convert;
+    else
+#endif
+      cconvert->pub.color_convert = null_convert;
     break;
   }
 }
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 3234a01..aef8517 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -6,8 +6,9 @@
  * libjpeg-turbo Modifications:
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011 D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2011, 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the forward-DCT management logic.
  * This code selects a particular DCT implementation to be used,
@@ -18,33 +19,32 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 #include "jsimddct.h"
 
 
 /* Private subobject for this module */
 
-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
+typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
 
-typedef JMETHOD(void, convsamp_method_ptr,
-                (JSAMPARRAY sample_data, JDIMENSION start_col,
-                 DCTELEM * workspace));
-typedef JMETHOD(void, float_convsamp_method_ptr,
-                (JSAMPARRAY sample_data, JDIMENSION start_col,
-                 FAST_FLOAT *workspace));
+typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
+                                     JDIMENSION start_col,
+                                     DCTELEM *workspace);
+typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
+                                           JDIMENSION start_col,
+                                           FAST_FLOAT *workspace);
 
-typedef JMETHOD(void, quantize_method_ptr,
-                (JCOEFPTR coef_block, DCTELEM * divisors,
-                 DCTELEM * workspace));
-typedef JMETHOD(void, float_quantize_method_ptr,
-                (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                 FAST_FLOAT * workspace));
+typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
+                                     DCTELEM *workspace);
+typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
+                                           FAST_FLOAT *divisors,
+                                           FAST_FLOAT *workspace);
 
 METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
 
 typedef struct {
-  struct jpeg_forward_dct pub;	/* public fields */
+  struct jpeg_forward_dct pub;  /* public fields */
 
   /* Pointer to the DCT routine actually in use */
   forward_DCT_method_ptr dct;
@@ -55,27 +55,30 @@
    * entries, because of scaling (especially for an unnormalized DCT).
    * Each table is given in normal array order.
    */
-  DCTELEM * divisors[NUM_QUANT_TBLS];
+  DCTELEM *divisors[NUM_QUANT_TBLS];
 
   /* work area for FDCT subroutine */
-  DCTELEM * workspace;
+  DCTELEM *workspace;
 
 #ifdef DCT_FLOAT_SUPPORTED
   /* Same as above for the floating-point case. */
   float_DCT_method_ptr float_dct;
   float_convsamp_method_ptr float_convsamp;
   float_quantize_method_ptr float_quantize;
-  FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
-  FAST_FLOAT * float_workspace;
+  FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
+  FAST_FLOAT *float_workspace;
 #endif
 } my_fdct_controller;
 
-typedef my_fdct_controller * my_fdct_ptr;
+typedef my_fdct_controller *my_fdct_ptr;
 
 
+#if BITS_IN_JSAMPLE == 8
+
 /*
  * Find the highest bit in an integer through binary search.
  */
+
 LOCAL(int)
 flss (UINT16 val)
 {
@@ -106,6 +109,7 @@
   return bit;
 }
 
+
 /*
  * Compute values to do a division using reciprocal.
  *
@@ -147,7 +151,7 @@
  *
  * In order to allow SIMD implementations we also tweak the values to
  * allow the same calculation to be made at all times:
- * 
+ *
  *   dctbl[0] = f rounded to nearest integer
  *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
  *   dctbl[2] = 1 << ((word size) * 2 - r)
@@ -164,13 +168,27 @@
  * of in a consecutive manner, yet again in order to allow SIMD
  * routines.
  */
+
 LOCAL(int)
-compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
 {
   UDCTELEM2 fq, fr;
   UDCTELEM c;
   int b, r;
 
+  if (divisor == 1) {
+    /* divisor == 1 means unquantized, so these reciprocal/correction/shift
+     * values will cause the C quantization algorithm to act like the
+     * identity function.  Since only the C quantization algorithm is used in
+     * these cases, the scale value is irrelevant.
+     */
+    dtbl[DCTSIZE2 * 0] = (DCTELEM) 1;                       /* reciprocal */
+    dtbl[DCTSIZE2 * 1] = (DCTELEM) 0;                       /* correction */
+    dtbl[DCTSIZE2 * 2] = (DCTELEM) 1;                       /* scale */
+    dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8);  /* shift */
+    return 0;
+  }
+
   b = flss(divisor) - 1;
   r  = sizeof(DCTELEM) * 8 + b;
 
@@ -191,13 +209,20 @@
 
   dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
   dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+#ifdef WITH_SIMD
   dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+#else
+  dtbl[DCTSIZE2 * 2] = 1;
+#endif
   dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
 
   if(r <= 16) return 0;
   else return 1;
 }
 
+#endif
+
+
 /*
  * Initialize for a processing pass.
  * Verify that all referenced Q-tables are present, and set up
@@ -213,15 +238,15 @@
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
   int ci, qtblno, i;
   jpeg_component_info *compptr;
-  JQUANT_TBL * qtbl;
-  DCTELEM * dtbl;
+  JQUANT_TBL *qtbl;
+  DCTELEM *dtbl;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     qtblno = compptr->quant_tbl_no;
     /* Make sure specified quantization table is present */
     if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
-	cinfo->quant_tbl_ptrs[qtblno] == NULL)
+        cinfo->quant_tbl_ptrs[qtblno] == NULL)
       ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
     qtbl = cinfo->quant_tbl_ptrs[qtblno];
     /* Compute divisors for this quant table */
@@ -233,91 +258,102 @@
        * coefficients multiplied by 8 (to counteract scaling).
        */
       if (fdct->divisors[qtblno] == NULL) {
-	fdct->divisors[qtblno] = (DCTELEM *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
+        fdct->divisors[qtblno] = (DCTELEM *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                      (DCTSIZE2 * 4) * sizeof(DCTELEM));
       }
       dtbl = fdct->divisors[qtblno];
       for (i = 0; i < DCTSIZE2; i++) {
-	if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
-	  && fdct->quantize == jsimd_quantize)
-	  fdct->quantize = quantize;
+#if BITS_IN_JSAMPLE == 8
+        if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
+            fdct->quantize == jsimd_quantize)
+          fdct->quantize = quantize;
+#else
+        dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+#endif
       }
       break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
     case JDCT_IFAST:
       {
-	/* For AA&N IDCT method, divisors are equal to quantization
-	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
-	 *   scalefactor[0] = 1
-	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-	 * We apply a further scale factor of 8.
-	 */
+        /* For AA&N IDCT method, divisors are equal to quantization
+         * coefficients scaled by scalefactor[row]*scalefactor[col], where
+         *   scalefactor[0] = 1
+         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+         * We apply a further scale factor of 8.
+         */
 #define CONST_BITS 14
-	static const INT16 aanscales[DCTSIZE2] = {
-	  /* precomputed values scaled up by 14 bits */
-	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-	  22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
-	  21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
-	  19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
-	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-	  12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
-	   8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
-	   4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
-	};
-	SHIFT_TEMPS
+        static const INT16 aanscales[DCTSIZE2] = {
+          /* precomputed values scaled up by 14 bits */
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
+          21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
+          19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
+           8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
+           4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
+        };
+        SHIFT_TEMPS
 
-	if (fdct->divisors[qtblno] == NULL) {
-	  fdct->divisors[qtblno] = (DCTELEM *)
-	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
-	}
-	dtbl = fdct->divisors[qtblno];
-	for (i = 0; i < DCTSIZE2; i++) {
-	  if(!compute_reciprocal(
-	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
-				  (INT32) aanscales[i]),
-		    CONST_BITS-3), &dtbl[i])
-	    && fdct->quantize == jsimd_quantize)
-	    fdct->quantize = quantize;
-	}
+        if (fdct->divisors[qtblno] == NULL) {
+          fdct->divisors[qtblno] = (DCTELEM *)
+            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                        (DCTSIZE2 * 4) * sizeof(DCTELEM));
+        }
+        dtbl = fdct->divisors[qtblno];
+        for (i = 0; i < DCTSIZE2; i++) {
+#if BITS_IN_JSAMPLE == 8
+          if (!compute_reciprocal(
+                DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
+                                      (JLONG) aanscales[i]),
+                        CONST_BITS-3), &dtbl[i]) &&
+              fdct->quantize == jsimd_quantize)
+            fdct->quantize = quantize;
+#else
+           dtbl[i] = (DCTELEM)
+             DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
+                                   (JLONG) aanscales[i]),
+                     CONST_BITS-3);
+#endif
+        }
       }
       break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
     case JDCT_FLOAT:
       {
-	/* For float AA&N IDCT method, divisors are equal to quantization
-	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
-	 *   scalefactor[0] = 1
-	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-	 * We apply a further scale factor of 8.
-	 * What's actually stored is 1/divisor so that the inner loop can
-	 * use a multiplication rather than a division.
-	 */
-	FAST_FLOAT * fdtbl;
-	int row, col;
-	static const double aanscalefactor[DCTSIZE] = {
-	  1.0, 1.387039845, 1.306562965, 1.175875602,
-	  1.0, 0.785694958, 0.541196100, 0.275899379
-	};
+        /* For float AA&N IDCT method, divisors are equal to quantization
+         * coefficients scaled by scalefactor[row]*scalefactor[col], where
+         *   scalefactor[0] = 1
+         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+         * We apply a further scale factor of 8.
+         * What's actually stored is 1/divisor so that the inner loop can
+         * use a multiplication rather than a division.
+         */
+        FAST_FLOAT *fdtbl;
+        int row, col;
+        static const double aanscalefactor[DCTSIZE] = {
+          1.0, 1.387039845, 1.306562965, 1.175875602,
+          1.0, 0.785694958, 0.541196100, 0.275899379
+        };
 
-	if (fdct->float_divisors[qtblno] == NULL) {
-	  fdct->float_divisors[qtblno] = (FAST_FLOAT *)
-	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-					DCTSIZE2 * SIZEOF(FAST_FLOAT));
-	}
-	fdtbl = fdct->float_divisors[qtblno];
-	i = 0;
-	for (row = 0; row < DCTSIZE; row++) {
-	  for (col = 0; col < DCTSIZE; col++) {
-	    fdtbl[i] = (FAST_FLOAT)
-	      (1.0 / (((double) qtbl->quantval[i] *
-		       aanscalefactor[row] * aanscalefactor[col] * 8.0)));
-	    i++;
-	  }
-	}
+        if (fdct->float_divisors[qtblno] == NULL) {
+          fdct->float_divisors[qtblno] = (FAST_FLOAT *)
+            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                        DCTSIZE2 * sizeof(FAST_FLOAT));
+        }
+        fdtbl = fdct->float_divisors[qtblno];
+        i = 0;
+        for (row = 0; row < DCTSIZE; row++) {
+          for (col = 0; col < DCTSIZE; col++) {
+            fdtbl[i] = (FAST_FLOAT)
+              (1.0 / (((double) qtbl->quantval[i] *
+                       aanscalefactor[row] * aanscalefactor[col] * 8.0)));
+            i++;
+          }
+        }
       }
       break;
 #endif
@@ -334,7 +370,7 @@
  */
 
 METHODDEF(void)
-convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
+convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
 {
   register DCTELEM *workspaceptr;
   register JSAMPROW elemptr;
@@ -344,7 +380,7 @@
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
 
-#if DCTSIZE == 8		/* unroll the inner loop */
+#if DCTSIZE == 8                /* unroll the inner loop */
     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
@@ -369,14 +405,18 @@
  */
 
 METHODDEF(void)
-quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
+quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
   int i;
   DCTELEM temp;
-  UDCTELEM recip, corr, shift;
-  UDCTELEM2 product;
   JCOEFPTR output_ptr = coef_block;
 
+#if BITS_IN_JSAMPLE == 8
+
+  UDCTELEM recip, corr;
+  int shift;
+  UDCTELEM2 product;
+
   for (i = 0; i < DCTSIZE2; i++) {
     temp = workspace[i];
     recip = divisors[i + DCTSIZE2 * 0];
@@ -387,16 +427,54 @@
       temp = -temp;
       product = (UDCTELEM2)(temp + corr) * recip;
       product >>= shift + sizeof(DCTELEM)*8;
-      temp = product;
+      temp = (DCTELEM)product;
       temp = -temp;
     } else {
       product = (UDCTELEM2)(temp + corr) * recip;
       product >>= shift + sizeof(DCTELEM)*8;
-      temp = product;
+      temp = (DCTELEM)product;
     }
-
     output_ptr[i] = (JCOEF) temp;
   }
+
+#else
+
+  register DCTELEM qval;
+
+  for (i = 0; i < DCTSIZE2; i++) {
+    qval = divisors[i];
+    temp = workspace[i];
+    /* Divide the coefficient value by qval, ensuring proper rounding.
+     * Since C does not specify the direction of rounding for negative
+     * quotients, we have to force the dividend positive for portability.
+     *
+     * In most files, at least half of the output values will be zero
+     * (at default quantization settings, more like three-quarters...)
+     * so we should ensure that this case is fast.  On many machines,
+     * a comparison is enough cheaper than a divide to make a special test
+     * a win.  Since both inputs will be nonnegative, we need only test
+     * for a < b to discover whether a/b is 0.
+     * If your machine's division is fast enough, define FAST_DIVIDE.
+     */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a,b)  a /= b
+#else
+#define DIVIDE_BY(a,b)  if (a >= b) a /= b; else a = 0
+#endif
+    if (temp < 0) {
+      temp = -temp;
+      temp += qval>>1;  /* for rounding */
+      DIVIDE_BY(temp, qval);
+      temp = -temp;
+    } else {
+      temp += qval>>1;  /* for rounding */
+      DIVIDE_BY(temp, qval);
+    }
+    output_ptr[i] = (JCOEF) temp;
+  }
+
+#endif
+
 }
 
 
@@ -409,16 +487,16 @@
  */
 
 METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
-	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-	     JDIMENSION start_row, JDIMENSION start_col,
-	     JDIMENSION num_blocks)
+forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
+             JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+             JDIMENSION start_row, JDIMENSION start_col,
+             JDIMENSION num_blocks)
 /* This version is used for integer DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM * workspace;
+  DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
+  DCTELEM *workspace;
   JDIMENSION bi;
 
   /* Make sure the compiler doesn't look up these every pass */
@@ -427,7 +505,7 @@
   quantize_method_ptr do_quantize = fdct->quantize;
   workspace = fdct->workspace;
 
-  sample_data += start_row;	/* fold in the vertical offset once */
+  sample_data += start_row;     /* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
@@ -446,7 +524,7 @@
 
 
 METHODDEF(void)
-convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
+convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace)
 {
   register FAST_FLOAT *workspaceptr;
   register JSAMPROW elemptr;
@@ -455,7 +533,7 @@
   workspaceptr = workspace;
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
+#if DCTSIZE == 8                /* unroll the inner loop */
     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
@@ -477,7 +555,7 @@
 
 
 METHODDEF(void)
-quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
+quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace)
 {
   register FAST_FLOAT temp;
   register int i;
@@ -499,16 +577,16 @@
 
 
 METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
-		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-		   JDIMENSION start_row, JDIMENSION start_col,
-		   JDIMENSION num_blocks)
+forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+                   JDIMENSION start_row, JDIMENSION start_col,
+                   JDIMENSION num_blocks)
 /* This version is used for floating-point DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT * workspace;
+  FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
+  FAST_FLOAT *workspace;
   JDIMENSION bi;
 
 
@@ -518,7 +596,7 @@
   float_quantize_method_ptr do_quantize = fdct->float_quantize;
   workspace = fdct->float_workspace;
 
-  sample_data += start_row;	/* fold in the vertical offset once */
+  sample_data += start_row;     /* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
@@ -547,7 +625,7 @@
 
   fdct = (my_fdct_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_fdct_controller));
+                                sizeof(my_fdct_controller));
   cinfo->fdct = (struct jpeg_forward_dct *) fdct;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
@@ -626,12 +704,12 @@
   if (cinfo->dct_method == JDCT_FLOAT)
     fdct->float_workspace = (FAST_FLOAT *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(FAST_FLOAT) * DCTSIZE2);
+                                  sizeof(FAST_FLOAT) * DCTSIZE2);
   else
 #endif
     fdct->workspace = (DCTELEM *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(DCTELEM) * DCTSIZE2);
+                                  sizeof(DCTELEM) * DCTSIZE2);
 
   /* Mark divisor tables unallocated */
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
diff --git a/jchuff.c b/jchuff.c
index 68e4e0e..58acd70 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009-2011, 2014-2016 D. R. Commander.
+ * Copyright (C) 2015 Matthieu Darbois.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy encoding routines.
  *
@@ -19,7 +21,8 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jchuff.h"		/* Declarations shared with jcphuff.c */
+#include "jsimd.h"
+#include "jconfigint.h"
 #include <limits.h>
 
 /*
@@ -37,7 +40,7 @@
  */
 
 /* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && defined __arm__
+#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
 #if !defined __thumb__ || defined __thumb2__
 #define USE_CLZ_INTRINSIC
 #endif
@@ -47,8 +50,7 @@
 #define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
 #define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
 #else
-static unsigned char jpeg_nbits_table[65536];
-static int jpeg_nbits_table_init = 0;
+#include "jpeg_nbits_table.h"
 #define JPEG_NBITS(x) (jpeg_nbits_table[x])
 #define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
 #endif
@@ -65,8 +67,8 @@
  */
 
 typedef struct {
-  size_t put_buffer;		/* current bit-accumulation buffer */
-  int put_bits;			/* # of bits now in it */
+  size_t put_buffer;            /* current bit-accumulation buffer */
+  int put_bits;                 /* # of bits now in it */
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 } savable_state;
 
@@ -80,12 +82,12 @@
 #else
 #if MAX_COMPS_IN_SCAN == 4
 #define ASSIGN_STATE(dest,src)  \
-	((dest).put_buffer = (src).put_buffer, \
-	 (dest).put_bits = (src).put_bits, \
-	 (dest).last_dc_val[0] = (src).last_dc_val[0], \
-	 (dest).last_dc_val[1] = (src).last_dc_val[1], \
-	 (dest).last_dc_val[2] = (src).last_dc_val[2], \
-	 (dest).last_dc_val[3] = (src).last_dc_val[3])
+        ((dest).put_buffer = (src).put_buffer, \
+         (dest).put_bits = (src).put_bits, \
+         (dest).last_dc_val[0] = (src).last_dc_val[0], \
+         (dest).last_dc_val[1] = (src).last_dc_val[1], \
+         (dest).last_dc_val[2] = (src).last_dc_val[2], \
+         (dest).last_dc_val[3] = (src).last_dc_val[3])
 #endif
 #endif
 
@@ -93,44 +95,45 @@
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
 
-  savable_state saved;		/* Bit buffer & DC state at start of MCU */
+  savable_state saved;          /* Bit buffer & DC state at start of MCU */
 
   /* These fields are NOT loaded into local working state. */
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-  int next_restart_num;		/* next restart number to write (0-7) */
+  unsigned int restarts_to_go;  /* MCUs left in this restart interval */
+  int next_restart_num;         /* next restart number to write (0-7) */
 
   /* Pointers to derived tables (these workspaces have image lifespan) */
-  c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
-  c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
+  c_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS];
+  c_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS];
 
-#ifdef ENTROPY_OPT_SUPPORTED	/* Statistics tables for optimization */
-  long * dc_count_ptrs[NUM_HUFF_TBLS];
-  long * ac_count_ptrs[NUM_HUFF_TBLS];
+#ifdef ENTROPY_OPT_SUPPORTED    /* Statistics tables for optimization */
+  long *dc_count_ptrs[NUM_HUFF_TBLS];
+  long *ac_count_ptrs[NUM_HUFF_TBLS];
 #endif
+
+  int simd;                     /* nonzero => use encode_one_block_simd() */
 } huff_entropy_encoder;
 
-typedef huff_entropy_encoder * huff_entropy_ptr;
+typedef huff_entropy_encoder *huff_entropy_ptr;
 
 /* Working state while writing an MCU.
  * This struct contains all the fields that are needed by subroutines.
  */
 
 typedef struct {
-  JOCTET * next_output_byte;	/* => next byte to write in buffer */
-  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
-  savable_state cur;		/* Current bit buffer & DC state */
-  j_compress_ptr cinfo;		/* dump_buffer needs access to this */
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
 } working_state;
 
 
 /* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo,
-					JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo));
+METHODDEF(boolean) encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_huff (j_compress_ptr cinfo);
 #ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo,
-					  JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo));
+METHODDEF(boolean) encode_mcu_gather (j_compress_ptr cinfo,
+                                      JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
 #endif
 
 
@@ -145,7 +148,7 @@
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int ci, dctbl, actbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   if (gather_statistics) {
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -159,6 +162,8 @@
     entropy->pub.finish_pass = finish_pass_huff;
   }
 
+  entropy->simd = jsimd_can_huff_encode_one_block();
+
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     dctbl = compptr->dc_tbl_no;
@@ -168,29 +173,29 @@
       /* Check for invalid table indexes */
       /* (make_c_derived_tbl does this in the other path) */
       if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS)
-	ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
+        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
       if (actbl < 0 || actbl >= NUM_HUFF_TBLS)
-	ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);
+        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);
       /* Allocate and zero the statistics tables */
       /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
       if (entropy->dc_count_ptrs[dctbl] == NULL)
-	entropy->dc_count_ptrs[dctbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long));
+        entropy->dc_count_ptrs[dctbl] = (long *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                      257 * sizeof(long));
+      MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
       if (entropy->ac_count_ptrs[actbl] == NULL)
-	entropy->ac_count_ptrs[actbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long));
+        entropy->ac_count_ptrs[actbl] = (long *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                      257 * sizeof(long));
+      MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
 #endif
     } else {
       /* Compute derived values for Huffman tables */
       /* We may do this more than once for a table, but it's not expensive */
       jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
-			      & entropy->dc_derived_tbls[dctbl]);
+                              & entropy->dc_derived_tbls[dctbl]);
       jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
-			      & entropy->ac_derived_tbls[actbl]);
+                              & entropy->ac_derived_tbls[actbl]);
     }
     /* Initialize DC predictions to 0 */
     entropy->saved.last_dc_val[ci] = 0;
@@ -215,7 +220,7 @@
 
 GLOBAL(void)
 jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
-			 c_derived_tbl ** pdtbl)
+                         c_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   c_derived_tbl *dtbl;
@@ -240,22 +245,22 @@
   if (*pdtbl == NULL)
     *pdtbl = (c_derived_tbl *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(c_derived_tbl));
+                                  sizeof(c_derived_tbl));
   dtbl = *pdtbl;
-  
+
   /* Figure C.1: make table of Huffman code length for each symbol */
 
   p = 0;
   for (l = 1; l <= 16; l++) {
     i = (int) htbl->bits[l];
-    if (i < 0 || p + i > 256)	/* protect against table overrun */
+    if (i < 0 || p + i > 256)   /* protect against table overrun */
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     while (i--)
       huffsize[p++] = (char) l;
   }
   huffsize[p] = 0;
   lastp = p;
-  
+
   /* Figure C.2: generate the codes themselves */
   /* We also validate that the counts represent a legal Huffman code tree. */
 
@@ -270,12 +275,12 @@
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((INT32) code) >= (((INT32) 1) << si))
+    if (((JLONG) code) >= (((JLONG) 1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
   }
-  
+
   /* Figure C.3: generate encoding tables */
   /* These are code and size indexed by symbol value */
 
@@ -283,7 +288,7 @@
    * this lets us detect duplicate VAL entries here, and later
    * allows emit_bits to detect any attempt to emit such symbols.
    */
-  MEMZERO(dtbl->ehufsi, SIZEOF(dtbl->ehufsi));
+  MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
 
   /* This is also a convenient place to check for out-of-range
    * and duplicated VAL entries.  We allow 0..255 for AC symbols
@@ -299,17 +304,6 @@
     dtbl->ehufco[i] = huffcode[p];
     dtbl->ehufsi[i] = huffsize[p];
   }
-
-#ifndef USE_CLZ_INTRINSIC
-  if(!jpeg_nbits_table_init) {
-    for(i = 0; i < 65536; i++) {
-      int nbits = 0, temp = i;
-      while (temp) {temp >>= 1;  nbits++;}
-      jpeg_nbits_table[i] = nbits;
-    }
-    jpeg_nbits_table_init = 1;
-  }
-#endif
 }
 
 
@@ -317,17 +311,17 @@
 
 /* Emit a byte, taking 'action' if must suspend. */
 #define emit_byte(state,val,action)  \
-	{ *(state)->next_output_byte++ = (JOCTET) (val);  \
-	  if (--(state)->free_in_buffer == 0)  \
-	    if (! dump_buffer(state))  \
-	      { action; } }
+        { *(state)->next_output_byte++ = (JOCTET) (val);  \
+          if (--(state)->free_in_buffer == 0)  \
+            if (! dump_buffer(state))  \
+              { action; } }
 
 
 LOCAL(boolean)
-dump_buffer (working_state * state)
+dump_buffer (working_state *state)
 /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
 {
-  struct jpeg_destination_mgr * dest = state->cinfo->dest;
+  struct jpeg_destination_mgr *dest = state->cinfo->dest;
 
   if (! (*dest->empty_output_buffer) (state->cinfo))
     return FALSE;
@@ -389,7 +383,11 @@
   } \
 }
 
-#if __WORDSIZE==64 || defined(_WIN64)
+#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
+#error Cannot determine word size
+#endif
+
+#if SIZEOF_SIZE_T==8 || defined(_WIN64)
 
 #define EMIT_BITS(code, size) { \
   CHECKBUF47() \
@@ -397,7 +395,7 @@
 }
 
 #define EMIT_CODE(code, size) { \
-  temp2 &= (((INT32) 1)<<nbits) - 1; \
+  temp2 &= (((JLONG) 1)<<nbits) - 1; \
   CHECKBUF31() \
   PUT_BITS(code, size) \
   PUT_BITS(temp2, nbits) \
@@ -411,7 +409,7 @@
 }
 
 #define EMIT_CODE(code, size) { \
-  temp2 &= (((INT32) 1)<<nbits) - 1; \
+  temp2 &= (((JLONG) 1)<<nbits) - 1; \
   PUT_BITS(code, size) \
   CHECKBUF15() \
   PUT_BITS(temp2, nbits) \
@@ -421,7 +419,16 @@
 #endif
 
 
-#define BUFSIZE (DCTSIZE2 * 2)
+/* Although it is exceedingly rare, it is possible for a Huffman-encoded
+ * coefficient block to be larger than the 128-byte unencoded block.  For each
+ * of the 64 coefficients, PUT_BITS is invoked twice, and each invocation can
+ * theoretically store 16 bits (for a maximum of 2048 bits or 256 bytes per
+ * encoded block.)  If, for instance, one artificially sets the AC
+ * coefficients to alternating values of 32767 and -32768 (using the JPEG
+ * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
+ * larger than 200 bytes.
+ */
+#define BUFSIZE (DCTSIZE2 * 4)
 
 #define LOAD_BUFFER() { \
   if (state->free_in_buffer < BUFSIZE) { \
@@ -454,7 +461,7 @@
 
 
 LOCAL(boolean)
-flush_bits (working_state * state)
+flush_bits (working_state *state)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
   size_t put_buffer;  int put_bits;
@@ -468,7 +475,7 @@
   PUT_BITS(0x7F, 7)
   while (put_bits >= 8) EMIT_BYTE()
 
-  state->cur.put_buffer = 0;	/* and reset bit-buffer to empty */
+  state->cur.put_buffer = 0;    /* and reset bit-buffer to empty */
   state->cur.put_bits = 0;
   STORE_BUFFER()
 
@@ -479,8 +486,25 @@
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
-encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
-		  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
+                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
+{
+  JOCTET _buffer[BUFSIZE], *buffer;
+  size_t bytes, bytestocopy;  int localbuf = 0;
+  /* NOTE(review): these locals are presumably used by the *_BUFFER() macros */
+  LOAD_BUFFER()
+  /* SIMD variant: delegates to jsimd_huff_encode_one_block() in one call. */
+  buffer = jsimd_huff_encode_one_block(state, buffer, block, last_dc_val,
+                                       dctbl, actbl);
+
+  STORE_BUFFER()
+
+  return TRUE;
+}
+
+LOCAL(boolean)
+encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
+                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   int temp, temp2, temp3;
   int nbits;
@@ -495,7 +519,7 @@
   LOAD_BUFFER()
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
-  
+
   temp = temp2 = block[0] - last_dc_val;
 
  /* This is a well-known technique for obtaining the absolute value without a
@@ -517,20 +541,18 @@
   /* Emit the Huffman-coded symbol for the number of bits */
   code = dctbl->ehufco[nbits];
   size = dctbl->ehufsi[nbits];
-  PUT_BITS(code, size)
-  CHECKBUF15()
+  EMIT_BITS(code, size)
 
   /* Mask off any extra bits in code */
-  temp2 &= (((INT32) 1)<<nbits) - 1;
+  temp2 &= (((JLONG) 1)<<nbits) - 1;
 
   /* Emit that number of bits of the value, if positive, */
   /* or the complement of its magnitude, if negative. */
-  PUT_BITS(temp2, nbits)
-  CHECKBUF15()
+  EMIT_BITS(temp2, nbits)
 
   /* Encode the AC coefficients per section F.1.2.2 */
-  
-  r = 0;			/* r = run length of zeros */
+
+  r = 0;                        /* r = run length of zeros */
 
 /* Manually unroll the k loop to eliminate the counter variable.  This
  * improves performance greatly on systems with a limited number of
@@ -594,7 +616,7 @@
  */
 
 LOCAL(boolean)
-emit_restart (working_state * state, int restart_num)
+emit_restart (working_state *state, int restart_num)
 {
   int ci;
 
@@ -624,7 +646,7 @@
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   working_state state;
   int blkn, ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   /* Load up working state */
   state.next_output_byte = cinfo->dest->next_output_byte;
@@ -636,20 +658,34 @@
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
       if (! emit_restart(&state, entropy->next_restart_num))
-	return FALSE;
+        return FALSE;
   }
 
   /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    ci = cinfo->MCU_membership[blkn];
-    compptr = cinfo->cur_comp_info[ci];
-    if (! encode_one_block(&state,
-			   MCU_data[blkn][0], state.cur.last_dc_val[ci],
-			   entropy->dc_derived_tbls[compptr->dc_tbl_no],
-			   entropy->ac_derived_tbls[compptr->ac_tbl_no]))
-      return FALSE;
-    /* Update last_dc_val */
-    state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
+  if (entropy->simd) {
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      ci = cinfo->MCU_membership[blkn];
+      compptr = cinfo->cur_comp_info[ci];
+      if (! encode_one_block_simd(&state,
+                                  MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                                  entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                                  entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+        return FALSE;
+      /* Update last_dc_val */
+      state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
+    }
+  } else {
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      ci = cinfo->MCU_membership[blkn];
+      compptr = cinfo->cur_comp_info[ci];
+      if (! encode_one_block(&state,
+                             MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                             entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                             entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+        return FALSE;
+      /* Update last_dc_val */
+      state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
+    }
   }
 
   /* Completed MCU, so update state */
@@ -716,18 +752,18 @@
 
 LOCAL(void)
 htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
-		 long dc_counts[], long ac_counts[])
+                 long dc_counts[], long ac_counts[])
 {
   register int temp;
   register int nbits;
   register int k, r;
-  
+
   /* Encode the DC coefficient difference per section F.1.2.1 */
-  
+
   temp = block[0] - last_dc_val;
   if (temp < 0)
     temp = -temp;
-  
+
   /* Find the number of bits needed for the magnitude of the coefficient */
   nbits = 0;
   while (temp) {
@@ -742,36 +778,36 @@
 
   /* Count the Huffman symbol for the number of bits */
   dc_counts[nbits]++;
-  
+
   /* Encode the AC coefficients per section F.1.2.2 */
-  
-  r = 0;			/* r = run length of zeros */
-  
+
+  r = 0;                        /* r = run length of zeros */
+
   for (k = 1; k < DCTSIZE2; k++) {
     if ((temp = block[jpeg_natural_order[k]]) == 0) {
       r++;
     } else {
       /* if run length > 15, must emit special run-length-16 codes (0xF0) */
       while (r > 15) {
-	ac_counts[0xF0]++;
-	r -= 16;
+        ac_counts[0xF0]++;
+        r -= 16;
       }
-      
+
       /* Find the number of bits needed for the magnitude of the coefficient */
       if (temp < 0)
-	temp = -temp;
-      
+        temp = -temp;
+
       /* Find the number of bits needed for the magnitude of the coefficient */
-      nbits = 1;		/* there must be at least one 1 bit */
+      nbits = 1;                /* there must be at least one 1 bit */
       while ((temp >>= 1))
-	nbits++;
+        nbits++;
       /* Check for out-of-range coefficient values */
       if (nbits > MAX_COEF_BITS)
-	ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-      
+        ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
       /* Count Huffman symbol for run length / number of bits */
       ac_counts[(r << 4) + nbits]++;
-      
+
       r = 0;
     }
   }
@@ -792,14 +828,14 @@
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int blkn, ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   /* Take care of restart intervals if needed */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0) {
       /* Re-initialize DC predictions to 0 */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++)
-	entropy->saved.last_dc_val[ci] = 0;
+        entropy->saved.last_dc_val[ci] = 0;
       /* Update restart state */
       entropy->restarts_to_go = cinfo->restart_interval;
     }
@@ -810,8 +846,8 @@
     ci = cinfo->MCU_membership[blkn];
     compptr = cinfo->cur_comp_info[ci];
     htest_one_block(cinfo, MCU_data[blkn][0], entropy->saved.last_dc_val[ci],
-		    entropy->dc_count_ptrs[compptr->dc_tbl_no],
-		    entropy->ac_count_ptrs[compptr->ac_tbl_no]);
+                    entropy->dc_count_ptrs[compptr->dc_tbl_no],
+                    entropy->ac_count_ptrs[compptr->ac_tbl_no]);
     entropy->saved.last_dc_val[ci] = MCU_data[blkn][0][0];
   }
 
@@ -848,24 +884,24 @@
  */
 
 GLOBAL(void)
-jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])
+jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 {
-#define MAX_CLEN 32		/* assumed maximum initial code length */
-  UINT8 bits[MAX_CLEN+1];	/* bits[k] = # of symbols with code length k */
-  int codesize[257];		/* codesize[k] = code length of symbol k */
-  int others[257];		/* next symbol in current branch of tree */
+#define MAX_CLEN 32             /* assumed maximum initial code length */
+  UINT8 bits[MAX_CLEN+1];       /* bits[k] = # of symbols with code length k */
+  int codesize[257];            /* codesize[k] = code length of symbol k */
+  int others[257];              /* next symbol in current branch of tree */
   int c1, c2;
   int p, i, j;
   long v;
 
   /* This algorithm is explained in section K.2 of the JPEG standard */
 
-  MEMZERO(bits, SIZEOF(bits));
-  MEMZERO(codesize, SIZEOF(codesize));
+  MEMZERO(bits, sizeof(bits));
+  MEMZERO(codesize, sizeof(codesize));
   for (i = 0; i < 257; i++)
-    others[i] = -1;		/* init links to empty */
-  
-  freq[256] = 1;		/* make sure 256 has a nonzero count */
+    others[i] = -1;             /* init links to empty */
+
+  freq[256] = 1;                /* make sure 256 has a nonzero count */
   /* Including the pseudo-symbol 256 in the Huffman procedure guarantees
    * that no real symbol is given code-value of all ones, because 256
    * will be placed last in the largest codeword category.
@@ -880,8 +916,8 @@
     v = 1000000000L;
     for (i = 0; i <= 256; i++) {
       if (freq[i] && freq[i] <= v) {
-	v = freq[i];
-	c1 = i;
+        v = freq[i];
+        c1 = i;
       }
     }
 
@@ -891,15 +927,15 @@
     v = 1000000000L;
     for (i = 0; i <= 256; i++) {
       if (freq[i] && freq[i] <= v && i != c1) {
-	v = freq[i];
-	c2 = i;
+        v = freq[i];
+        c2 = i;
       }
     }
 
     /* Done if we've merged everything into one frequency */
     if (c2 < 0)
       break;
-    
+
     /* Else merge the two counts/trees */
     freq[c1] += freq[c2];
     freq[c2] = 0;
@@ -910,9 +946,9 @@
       c1 = others[c1];
       codesize[c1]++;
     }
-    
-    others[c1] = c2;		/* chain c2 onto c1's tree branch */
-    
+
+    others[c1] = c2;            /* chain c2 onto c1's tree branch */
+
     /* Increment the codesize of everything in c2's tree branch */
     codesize[c2]++;
     while (others[c2] >= 0) {
@@ -927,7 +963,7 @@
       /* The JPEG standard seems to think that this can't happen, */
       /* but I'm paranoid... */
       if (codesize[i] > MAX_CLEN)
-	ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW);
+        ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW);
 
       bits[codesize[i]]++;
     }
@@ -943,28 +979,28 @@
    * shortest nonzero BITS entry is converted into a prefix for two code words
    * one bit longer.
    */
-  
+
   for (i = MAX_CLEN; i > 16; i--) {
     while (bits[i] > 0) {
-      j = i - 2;		/* find length of new prefix to be used */
+      j = i - 2;                /* find length of new prefix to be used */
       while (bits[j] == 0)
-	j--;
-      
-      bits[i] -= 2;		/* remove two symbols */
-      bits[i-1]++;		/* one goes in this length */
-      bits[j+1] += 2;		/* two new symbols in this length */
-      bits[j]--;		/* symbol of this length is now a prefix */
+        j--;
+
+      bits[i] -= 2;             /* remove two symbols */
+      bits[i-1]++;              /* one goes in this length */
+      bits[j+1] += 2;           /* two new symbols in this length */
+      bits[j]--;                /* symbol of this length is now a prefix */
     }
   }
 
   /* Remove the count for the pseudo-symbol 256 from the largest codelength */
-  while (bits[i] == 0)		/* find largest codelength still in use */
+  while (bits[i] == 0)          /* find largest codelength still in use */
     i--;
   bits[i]--;
-  
+
   /* Return final symbol counts (only for lengths 0..16) */
-  MEMCOPY(htbl->bits, bits, SIZEOF(htbl->bits));
-  
+  MEMCOPY(htbl->bits, bits, sizeof(htbl->bits));
+
   /* Return a list of the symbols sorted by code length */
   /* It's not real clear to me why we don't need to consider the codelength
    * changes made above, but the JPEG spec seems to think this works.
@@ -973,8 +1009,8 @@
   for (i = 1; i <= MAX_CLEN; i++) {
     for (j = 0; j <= 255; j++) {
       if (codesize[j] == i) {
-	htbl->huffval[p] = (UINT8) j;
-	p++;
+        htbl->huffval[p] = (UINT8) j;
+        p++;
       }
     }
   }
@@ -993,7 +1029,7 @@
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int ci, dctbl, actbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JHUFF_TBL **htblptr;
   boolean did_dc[NUM_HUFF_TBLS];
   boolean did_ac[NUM_HUFF_TBLS];
@@ -1001,8 +1037,8 @@
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
-  MEMZERO(did_dc, SIZEOF(did_dc));
-  MEMZERO(did_ac, SIZEOF(did_ac));
+  MEMZERO(did_dc, sizeof(did_dc));
+  MEMZERO(did_ac, sizeof(did_ac));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
@@ -1011,14 +1047,14 @@
     if (! did_dc[dctbl]) {
       htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
       if (*htblptr == NULL)
-	*htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
       did_dc[dctbl] = TRUE;
     }
     if (! did_ac[actbl]) {
       htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
       if (*htblptr == NULL)
-	*htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
       did_ac[actbl] = TRUE;
     }
@@ -1041,7 +1077,7 @@
 
   entropy = (huff_entropy_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(huff_entropy_encoder));
+                                sizeof(huff_entropy_encoder));
   cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
   entropy->pub.start_pass = start_pass_huff;
 
diff --git a/jchuff.h b/jchuff.h
index a9599fc..4236089 100644
--- a/jchuff.h
+++ b/jchuff.h
@@ -1,9 +1,12 @@
 /*
  * jchuff.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains declarations for Huffman entropy encoding routines
  * that are shared between the sequential encoder (jchuff.c) and the
@@ -25,23 +28,16 @@
 /* Derived data constructed for each Huffman table */
 
 typedef struct {
-  unsigned int ehufco[256];	/* code for each symbol */
-  char ehufsi[256];		/* length of code for each symbol */
+  unsigned int ehufco[256];     /* code for each symbol */
+  char ehufsi[256];             /* length of code for each symbol */
   /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
 } c_derived_tbl;
 
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_c_derived_tbl	jMkCDerived
-#define jpeg_gen_optimal_table	jGenOptTbl
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
 /* Expand a Huffman table definition into the derived format */
 EXTERN(void) jpeg_make_c_derived_tbl
-	JPP((j_compress_ptr cinfo, boolean isDC, int tblno,
-	     c_derived_tbl ** pdtbl));
+        (j_compress_ptr cinfo, boolean isDC, int tblno,
+         c_derived_tbl ** pdtbl);
 
 /* Generate an optimal table definition given the specified counts */
 EXTERN(void) jpeg_gen_optimal_table
-	JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]));
+        (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]);
diff --git a/jcinit.c b/jcinit.c
index de0ade2..463bd8c 100644
--- a/jcinit.c
+++ b/jcinit.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains initialization logic for the JPEG compressor.
  * This routine is in charge of selecting the modules to be executed and
@@ -60,7 +61,7 @@
 
   /* Need a full-image coefficient buffer in any multi-pass mode. */
   jinit_c_coef_controller(cinfo,
-		(boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding));
+                (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding));
   jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   jinit_marker_writer(cinfo);
diff --git a/jcmainct.c b/jcmainct.c
index 5b7ff21..d01f463 100644
--- a/jcmainct.c
+++ b/jcmainct.c
@@ -1,9 +1,12 @@
 /*
  * jcmainct.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the main buffer controller for compression.
  * The main buffer lies between the pre-processor and the JPEG
@@ -15,50 +18,30 @@
 #include "jpeglib.h"
 
 
-/* Note: currently, there is no operating mode in which a full-image buffer
- * is needed at this step.  If there were, that mode could not be used with
- * "raw data" input, since this module is bypassed in that case.  However,
- * we've left the code here for possible use in special applications.
- */
-#undef FULL_MAIN_BUFFER_SUPPORTED
-
-
 /* Private buffer controller object */
 
 typedef struct {
   struct jpeg_c_main_controller pub; /* public fields */
 
-  JDIMENSION cur_iMCU_row;	/* number of current iMCU row */
-  JDIMENSION rowgroup_ctr;	/* counts row groups received in iMCU row */
-  boolean suspended;		/* remember if we suspended output */
-  J_BUF_MODE pass_mode;		/* current operating mode */
+  JDIMENSION cur_iMCU_row;      /* number of current iMCU row */
+  JDIMENSION rowgroup_ctr;      /* counts row groups received in iMCU row */
+  boolean suspended;            /* remember if we suspended output */
+  J_BUF_MODE pass_mode;         /* current operating mode */
 
   /* If using just a strip buffer, this points to the entire set of buffers
    * (we allocate one for each component).  In the full-image case, this
    * points to the currently accessible strips of the virtual arrays.
    */
   JSAMPARRAY buffer[MAX_COMPONENTS];
-
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-  /* If using full-image storage, this array holds pointers to virtual-array
-   * control blocks for each component.  Unused if not full-image storage.
-   */
-  jvirt_sarray_ptr whole_image[MAX_COMPONENTS];
-#endif
 } my_main_controller;
 
-typedef my_main_controller * my_main_ptr;
+typedef my_main_controller *my_main_ptr;
 
 
 /* Forward declarations */
 METHODDEF(void) process_data_simple_main
-	JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf,
-	     JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail));
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-METHODDEF(void) process_data_buffer_main
-	JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf,
-	     JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail));
-#endif
+        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
+         JDIMENSION in_rows_avail);
 
 
 /*
@@ -74,32 +57,14 @@
   if (cinfo->raw_data_in)
     return;
 
-  main_ptr->cur_iMCU_row = 0;	/* initialize counters */
+  if (pass_mode != JBUF_PASS_THRU)
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+
+  main_ptr->cur_iMCU_row = 0;   /* initialize counters */
   main_ptr->rowgroup_ctr = 0;
   main_ptr->suspended = FALSE;
-  main_ptr->pass_mode = pass_mode;	/* save mode for use by process_data */
-
-  switch (pass_mode) {
-  case JBUF_PASS_THRU:
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-    if (main_ptr->whole_image[0] != NULL)
-      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-#endif
-    main_ptr->pub.process_data = process_data_simple_main;
-    break;
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-  case JBUF_SAVE_SOURCE:
-  case JBUF_CRANK_DEST:
-  case JBUF_SAVE_AND_PASS:
-    if (main_ptr->whole_image[0] == NULL)
-      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    main_ptr->pub.process_data = process_data_buffer_main;
-    break;
-#endif
-  default:
-    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    break;
-  }
+  main_ptr->pass_mode = pass_mode;      /* save mode for use by process_data */
+  main_ptr->pub.process_data = process_data_simple_main;
 }
 
 
@@ -111,8 +76,8 @@
 
 METHODDEF(void)
 process_data_simple_main (j_compress_ptr cinfo,
-			  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-			  JDIMENSION in_rows_avail)
+                          JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
+                          JDIMENSION in_rows_avail)
 {
   my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
 
@@ -120,9 +85,9 @@
     /* Read input data if we haven't filled the main buffer yet */
     if (main_ptr->rowgroup_ctr < DCTSIZE)
       (*cinfo->prep->pre_process_data) (cinfo,
-					input_buf, in_row_ctr, in_rows_avail,
-					main_ptr->buffer, &main_ptr->rowgroup_ctr,
-					(JDIMENSION) DCTSIZE);
+                                        input_buf, in_row_ctr, in_rows_avail,
+                                        main_ptr->buffer, &main_ptr->rowgroup_ctr,
+                                        (JDIMENSION) DCTSIZE);
 
     /* If we don't have a full iMCU row buffered, return to application for
      * more data.  Note that preprocessor will always pad to fill the iMCU row
@@ -140,8 +105,8 @@
        * think we were done.
        */
       if (! main_ptr->suspended) {
-	(*in_row_ctr)--;
-	main_ptr->suspended = TRUE;
+        (*in_row_ctr)--;
+        main_ptr->suspended = TRUE;
       }
       return;
     }
@@ -158,85 +123,6 @@
 }
 
 
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-
-/*
- * Process some data.
- * This routine handles all of the modes that use a full-size buffer.
- */
-
-METHODDEF(void)
-process_data_buffer_main (j_compress_ptr cinfo,
-			  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-			  JDIMENSION in_rows_avail)
-{
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
-  int ci;
-  jpeg_component_info *compptr;
-  boolean writing = (main_ptr->pass_mode != JBUF_CRANK_DEST);
-
-  while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
-    /* Realign the virtual buffers if at the start of an iMCU row. */
-    if (main_ptr->rowgroup_ctr == 0) {
-      for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	   ci++, compptr++) {
-	main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray)
-	  ((j_common_ptr) cinfo, main_ptr->whole_image[ci],
-	   main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
-	   (JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing);
-      }
-      /* In a read pass, pretend we just read some source data. */
-      if (! writing) {
-	*in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE;
-	main_ptr->rowgroup_ctr = DCTSIZE;
-      }
-    }
-
-    /* If a write pass, read input data until the current iMCU row is full. */
-    /* Note: preprocessor will pad if necessary to fill the last iMCU row. */
-    if (writing) {
-      (*cinfo->prep->pre_process_data) (cinfo,
-					input_buf, in_row_ctr, in_rows_avail,
-					main_ptr->buffer, &main_ptr->rowgroup_ctr,
-					(JDIMENSION) DCTSIZE);
-      /* Return to application if we need more data to fill the iMCU row. */
-      if (main_ptr->rowgroup_ctr < DCTSIZE)
-	return;
-    }
-
-    /* Emit data, unless this is a sink-only pass. */
-    if (main_ptr->pass_mode != JBUF_SAVE_SOURCE) {
-      if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
-	/* If compressor did not consume the whole row, then we must need to
-	 * suspend processing and return to the application.  In this situation
-	 * we pretend we didn't yet consume the last input row; otherwise, if
-	 * it happened to be the last row of the image, the application would
-	 * think we were done.
-	 */
-	if (! main_ptr->suspended) {
-	  (*in_row_ctr)--;
-	  main_ptr->suspended = TRUE;
-	}
-	return;
-      }
-      /* We did finish the row.  Undo our little suspension hack if a previous
-       * call suspended; then mark the main buffer empty.
-       */
-      if (main_ptr->suspended) {
-	(*in_row_ctr)++;
-	main_ptr->suspended = FALSE;
-      }
-    }
-
-    /* If get here, we are done with this iMCU row.  Mark buffer empty. */
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->cur_iMCU_row++;
-  }
-}
-
-#endif /* FULL_MAIN_BUFFER_SUPPORTED */
-
-
 /*
  * Initialize main buffer controller.
  */
@@ -250,7 +136,7 @@
 
   main_ptr = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_main_controller));
+                                sizeof(my_main_controller));
   cinfo->main = (struct jpeg_c_main_controller *) main_ptr;
   main_ptr->pub.start_pass = start_pass_main;
 
@@ -262,32 +148,15 @@
    * may be of a different size.
    */
   if (need_full_buffer) {
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-    /* Allocate a full-image virtual array for each component */
-    /* Note we pad the bottom to a multiple of the iMCU height */
-    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
-      main_ptr->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-	 compptr->width_in_blocks * DCTSIZE,
-	 (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-				(long) compptr->v_samp_factor) * DCTSIZE,
-	 (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
-    }
-#else
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-#endif
   } else {
-#ifdef FULL_MAIN_BUFFER_SUPPORTED
-    main_ptr->whole_image[0] = NULL; /* flag for no virtual arrays */
-#endif
     /* Allocate a strip buffer for each component */
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
+         ci++, compptr++) {
       main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 compptr->width_in_blocks * DCTSIZE,
-	 (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+         compptr->width_in_blocks * DCTSIZE,
+         (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
     }
   }
 }
diff --git a/jcmarker.c b/jcmarker.c
index 4fbece4..463f665 100644
--- a/jcmarker.c
+++ b/jcmarker.c
@@ -6,7 +6,8 @@
  * Modified 2003-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write JPEG datastream markers.
  */
@@ -17,7 +18,7 @@
 #include "jpegcomp.h"
 
 
-typedef enum {			/* JPEG marker codes */
+typedef enum {                  /* JPEG marker codes */
   M_SOF0  = 0xc0,
   M_SOF1  = 0xc1,
   M_SOF2  = 0xc2,
@@ -93,7 +94,7 @@
   unsigned int last_restart_interval; /* last DRI value emitted; 0 after SOI */
 } my_marker_writer;
 
-typedef my_marker_writer * my_marker_ptr;
+typedef my_marker_writer *my_marker_ptr;
 
 
 /*
@@ -112,7 +113,7 @@
 emit_byte (j_compress_ptr cinfo, int val)
 /* Emit a byte */
 {
-  struct jpeg_destination_mgr * dest = cinfo->dest;
+  struct jpeg_destination_mgr *dest = cinfo->dest;
 
   *(dest->next_output_byte)++ = (JOCTET) val;
   if (--dest->free_in_buffer == 0) {
@@ -149,7 +150,7 @@
 /* Emit a DQT marker */
 /* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
 {
-  JQUANT_TBL * qtbl = cinfo->quant_tbl_ptrs[index];
+  JQUANT_TBL *qtbl = cinfo->quant_tbl_ptrs[index];
   int prec;
   int i;
 
@@ -173,7 +174,7 @@
       /* The table entries must be emitted in zigzag order. */
       unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
       if (prec)
-	emit_byte(cinfo, (int) (qval >> 8));
+        emit_byte(cinfo, (int) (qval >> 8));
       emit_byte(cinfo, (int) (qval & 0xFF));
     }
 
@@ -188,35 +189,35 @@
 emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
 /* Emit a DHT marker */
 {
-  JHUFF_TBL * htbl;
+  JHUFF_TBL *htbl;
   int length, i;
-  
+
   if (is_ac) {
     htbl = cinfo->ac_huff_tbl_ptrs[index];
-    index += 0x10;		/* output index has AC bit set */
+    index += 0x10;              /* output index has AC bit set */
   } else {
     htbl = cinfo->dc_huff_tbl_ptrs[index];
   }
 
   if (htbl == NULL)
     ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index);
-  
+
   if (! htbl->sent_table) {
     emit_marker(cinfo, M_DHT);
-    
+
     length = 0;
     for (i = 1; i <= 16; i++)
       length += htbl->bits[i];
-    
+
     emit_2bytes(cinfo, length + 2 + 1 + 16);
     emit_byte(cinfo, index);
-    
+
     for (i = 1; i <= 16; i++)
       emit_byte(cinfo, htbl->bits[i]);
-    
+
     for (i = 0; i < length; i++)
       emit_byte(cinfo, htbl->huffval[i]);
-    
+
     htbl->sent_table = TRUE;
   }
 }
@@ -258,12 +259,12 @@
 
     for (i = 0; i < NUM_ARITH_TBLS; i++) {
       if (dc_in_use[i]) {
-	emit_byte(cinfo, i);
-	emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
+        emit_byte(cinfo, i);
+        emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
       }
       if (ac_in_use[i]) {
-	emit_byte(cinfo, i + 0x10);
-	emit_byte(cinfo, cinfo->arith_ac_K[i]);
+        emit_byte(cinfo, i + 0x10);
+        emit_byte(cinfo, cinfo->arith_ac_K[i]);
       }
     }
   }
@@ -276,8 +277,8 @@
 /* Emit a DRI marker */
 {
   emit_marker(cinfo, M_DRI);
-  
-  emit_2bytes(cinfo, 4);	/* fixed length */
+
+  emit_2bytes(cinfo, 4);        /* fixed length */
 
   emit_2bytes(cinfo, (int) cinfo->restart_interval);
 }
@@ -289,9 +290,9 @@
 {
   int ci;
   jpeg_component_info *compptr;
-  
+
   emit_marker(cinfo, code);
-  
+
   emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
 
   /* Make sure image isn't bigger than SOF field can handle */
@@ -320,13 +321,13 @@
 {
   int i, td, ta;
   jpeg_component_info *compptr;
-  
+
   emit_marker(cinfo, M_SOS);
-  
+
   emit_2bytes(cinfo, 2 * cinfo->comps_in_scan + 2 + 1 + 3); /* length */
-  
+
   emit_byte(cinfo, cinfo->comps_in_scan);
-  
+
   for (i = 0; i < cinfo->comps_in_scan; i++) {
     compptr = cinfo->cur_comp_info[i];
     emit_byte(cinfo, compptr->component_id);
@@ -354,22 +355,22 @@
 /* Emit a JFIF-compliant APP0 marker */
 {
   /*
-   * Length of APP0 block	(2 bytes)
-   * Block ID			(4 bytes - ASCII "JFIF")
-   * Zero byte			(1 byte to terminate the ID string)
-   * Version Major, Minor	(2 bytes - major first)
-   * Units			(1 byte - 0x00 = none, 0x01 = inch, 0x02 = cm)
-   * Xdpu			(2 bytes - dots per unit horizontal)
-   * Ydpu			(2 bytes - dots per unit vertical)
-   * Thumbnail X size		(1 byte)
-   * Thumbnail Y size		(1 byte)
+   * Length of APP0 block       (2 bytes)
+   * Block ID                   (4 bytes - ASCII "JFIF")
+   * Zero byte                  (1 byte to terminate the ID string)
+   * Version Major, Minor       (2 bytes - major first)
+   * Units                      (1 byte - 0x00 = none, 0x01 = inch, 0x02 = cm)
+   * Xdpu                       (2 bytes - dots per unit horizontal)
+   * Ydpu                       (2 bytes - dots per unit vertical)
+   * Thumbnail X size           (1 byte)
+   * Thumbnail Y size           (1 byte)
    */
-  
+
   emit_marker(cinfo, M_APP0);
-  
+
   emit_2bytes(cinfo, 2 + 4 + 1 + 2 + 1 + 2 + 2 + 1 + 1); /* length */
 
-  emit_byte(cinfo, 0x4A);	/* Identifier: ASCII "JFIF" */
+  emit_byte(cinfo, 0x4A);       /* Identifier: ASCII "JFIF" */
   emit_byte(cinfo, 0x46);
   emit_byte(cinfo, 0x49);
   emit_byte(cinfo, 0x46);
@@ -379,7 +380,7 @@
   emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
   emit_2bytes(cinfo, (int) cinfo->X_density);
   emit_2bytes(cinfo, (int) cinfo->Y_density);
-  emit_byte(cinfo, 0);		/* No thumbnail image */
+  emit_byte(cinfo, 0);          /* No thumbnail image */
   emit_byte(cinfo, 0);
 }
 
@@ -389,12 +390,12 @@
 /* Emit an Adobe APP14 marker */
 {
   /*
-   * Length of APP14 block	(2 bytes)
-   * Block ID			(5 bytes - ASCII "Adobe")
-   * Version Number		(2 bytes - currently 100)
-   * Flags0			(2 bytes - currently 0)
-   * Flags1			(2 bytes - currently 0)
-   * Color transform		(1 byte)
+   * Length of APP14 block      (2 bytes)
+   * Block ID                   (5 bytes - ASCII "Adobe")
+   * Version Number             (2 bytes - currently 100)
+   * Flags0                     (2 bytes - currently 0)
+   * Flags1                     (2 bytes - currently 0)
+   * Color transform            (1 byte)
    *
    * Although Adobe TN 5116 mentions Version = 101, all the Adobe files
    * now in circulation seem to use Version = 100, so that's what we write.
@@ -403,28 +404,28 @@
    * YCbCr, 2 if it's YCCK, 0 otherwise.  Adobe's definition has to do with
    * whether the encoder performed a transformation, which is pretty useless.
    */
-  
+
   emit_marker(cinfo, M_APP14);
-  
+
   emit_2bytes(cinfo, 2 + 5 + 2 + 2 + 2 + 1); /* length */
 
-  emit_byte(cinfo, 0x41);	/* Identifier: ASCII "Adobe" */
+  emit_byte(cinfo, 0x41);       /* Identifier: ASCII "Adobe" */
   emit_byte(cinfo, 0x64);
   emit_byte(cinfo, 0x6F);
   emit_byte(cinfo, 0x62);
   emit_byte(cinfo, 0x65);
-  emit_2bytes(cinfo, 100);	/* Version */
-  emit_2bytes(cinfo, 0);	/* Flags0 */
-  emit_2bytes(cinfo, 0);	/* Flags1 */
+  emit_2bytes(cinfo, 100);      /* Version */
+  emit_2bytes(cinfo, 0);        /* Flags0 */
+  emit_2bytes(cinfo, 0);        /* Flags1 */
   switch (cinfo->jpeg_color_space) {
   case JCS_YCbCr:
-    emit_byte(cinfo, 1);	/* Color transform = 1 */
+    emit_byte(cinfo, 1);        /* Color transform = 1 */
     break;
   case JCS_YCCK:
-    emit_byte(cinfo, 2);	/* Color transform = 2 */
+    emit_byte(cinfo, 2);        /* Color transform = 2 */
     break;
   default:
-    emit_byte(cinfo, 0);	/* Color transform = 0 */
+    emit_byte(cinfo, 0);        /* Color transform = 0 */
     break;
   }
 }
@@ -442,12 +443,12 @@
 write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
 /* Emit an arbitrary marker header */
 {
-  if (datalen > (unsigned int) 65533)		/* safety check */
+  if (datalen > (unsigned int) 65533)           /* safety check */
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
   emit_marker(cinfo, (JPEG_MARKER) marker);
 
-  emit_2bytes(cinfo, (int) (datalen + 2));	/* total length */
+  emit_2bytes(cinfo, (int) (datalen + 2));      /* total length */
 }
 
 METHODDEF(void)
@@ -474,12 +475,12 @@
 {
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
 
-  emit_marker(cinfo, M_SOI);	/* first the SOI */
+  emit_marker(cinfo, M_SOI);    /* first the SOI */
 
   /* SOI is defined to reset restart interval to 0 */
   marker->last_restart_interval = 0;
 
-  if (cinfo->write_JFIF_header)	/* next an optional JFIF APP0 */
+  if (cinfo->write_JFIF_header) /* next an optional JFIF APP0 */
     emit_jfif_app0(cinfo);
   if (cinfo->write_Adobe_marker) /* next an optional Adobe APP14 */
     emit_adobe_app14(cinfo);
@@ -500,7 +501,7 @@
   int ci, prec;
   boolean is_baseline;
   jpeg_component_info *compptr;
-  
+
   /* Emit DQT for each quantization table.
    * Note that emit_dqt() suppresses any duplicate tables.
    */
@@ -520,9 +521,9 @@
   } else {
     is_baseline = TRUE;
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
+         ci++, compptr++) {
       if (compptr->dc_tbl_no > 1 || compptr->ac_tbl_no > 1)
-	is_baseline = FALSE;
+        is_baseline = FALSE;
     }
     if (prec && is_baseline) {
       is_baseline = FALSE;
@@ -539,11 +540,11 @@
       emit_sof(cinfo, M_SOF9);  /* SOF code for sequential arithmetic */
   } else {
     if (cinfo->progressive_mode)
-      emit_sof(cinfo, M_SOF2);	/* SOF code for progressive Huffman */
+      emit_sof(cinfo, M_SOF2);  /* SOF code for progressive Huffman */
     else if (is_baseline)
-      emit_sof(cinfo, M_SOF0);	/* SOF code for baseline implementation */
+      emit_sof(cinfo, M_SOF0);  /* SOF code for baseline implementation */
     else
-      emit_sof(cinfo, M_SOF1);	/* SOF code for non-baseline Huffman file */
+      emit_sof(cinfo, M_SOF1);  /* SOF code for non-baseline Huffman file */
   }
 }
 
@@ -575,10 +576,10 @@
       compptr = cinfo->cur_comp_info[i];
       /* DC needs no table for refinement scan */
       if (cinfo->Ss == 0 && cinfo->Ah == 0)
-	emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
+        emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
       /* AC needs no table when not present */
       if (cinfo->Se)
-	emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
+        emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
     }
   }
 
@@ -627,9 +628,9 @@
   if (! cinfo->arith_code) {
     for (i = 0; i < NUM_HUFF_TBLS; i++) {
       if (cinfo->dc_huff_tbl_ptrs[i] != NULL)
-	emit_dht(cinfo, i, FALSE);
+        emit_dht(cinfo, i, FALSE);
       if (cinfo->ac_huff_tbl_ptrs[i] != NULL)
-	emit_dht(cinfo, i, TRUE);
+        emit_dht(cinfo, i, TRUE);
     }
   }
 
@@ -649,7 +650,7 @@
   /* Create the subobject */
   marker = (my_marker_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_marker_writer));
+                                sizeof(my_marker_writer));
   cinfo->marker = (struct jpeg_marker_writer *) marker;
   /* Initialize method pointers */
   marker->pub.write_file_header = write_file_header;
diff --git a/jcmaster.c b/jcmaster.c
index dca0315..d1e1ba7 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -6,11 +6,12 @@
  * Modified 2003-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains master control logic for the JPEG compressor.
  * These routines are concerned with parameter validation, initial setup,
- * and inter-pass control (determining the number of passes and the work 
+ * and inter-pass control (determining the number of passes and the work
  * to be done in each pass).
  */
 
@@ -23,23 +24,23 @@
 /* Private state */
 
 typedef enum {
-	main_pass,		/* input data, also do first output step */
-	huff_opt_pass,		/* Huffman code optimization pass */
-	output_pass		/* data output pass */
+        main_pass,              /* input data, also do first output step */
+        huff_opt_pass,          /* Huffman code optimization pass */
+        output_pass             /* data output pass */
 } c_pass_type;
 
 typedef struct {
-  struct jpeg_comp_master pub;	/* public fields */
+  struct jpeg_comp_master pub;  /* public fields */
 
-  c_pass_type pass_type;	/* the type of the current pass */
+  c_pass_type pass_type;        /* the type of the current pass */
 
-  int pass_number;		/* # of passes completed */
-  int total_passes;		/* total # of passes needed */
+  int pass_number;              /* # of passes completed */
+  int total_passes;             /* total # of passes needed */
 
-  int scan_number;		/* current index in scan_info[] */
+  int scan_number;              /* current index in scan_info[] */
 } my_comp_master;
 
-typedef my_comp_master * my_master_ptr;
+typedef my_comp_master *my_master_ptr;
 
 
 /*
@@ -105,7 +106,7 @@
   /* Check that number of components won't exceed internal array sizes */
   if (cinfo->num_components > MAX_COMPONENTS)
     ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
-	     MAX_COMPONENTS);
+             MAX_COMPONENTS);
 
   /* Compute maximum sampling factors; check factor validity */
   cinfo->max_h_samp_factor = 1;
@@ -113,12 +114,12 @@
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
-	compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+        compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
       ERREXIT(cinfo, JERR_BAD_SAMPLING);
     cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
-				   compptr->h_samp_factor);
+                                   compptr->h_samp_factor);
     cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor,
-				   compptr->v_samp_factor);
+                                   compptr->v_samp_factor);
   }
 
   /* Compute dimensions of components */
@@ -135,17 +136,17 @@
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-		    (long) cinfo->max_h_samp_factor);
+                    (long) cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-		    (long) cinfo->max_v_samp_factor);
+                    (long) cinfo->max_v_samp_factor);
     /* Mark component needed (this flag isn't actually used for compression) */
     compptr->component_needed = TRUE;
   }
@@ -155,7 +156,7 @@
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
     jdiv_round_up((long) cinfo->_jpeg_height,
-		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+                  (long) (cinfo->max_v_samp_factor*DCTSIZE));
 }
 
 
@@ -167,12 +168,12 @@
  * determine whether it uses progressive JPEG, and set cinfo->progressive_mode.
  */
 {
-  const jpeg_scan_info * scanptr;
+  const jpeg_scan_info *scanptr;
   int scanno, ncomps, ci, coefi, thisi;
   int Ss, Se, Ah, Al;
   boolean component_sent[MAX_COMPONENTS];
 #ifdef C_PROGRESSIVE_SUPPORTED
-  int * last_bitpos_ptr;
+  int *last_bitpos_ptr;
   int last_bitpos[MAX_COMPONENTS][DCTSIZE2];
   /* -1 until that coefficient has been seen; then last Al for it */
 #endif
@@ -188,15 +189,15 @@
 #ifdef C_PROGRESSIVE_SUPPORTED
     cinfo->progressive_mode = TRUE;
     last_bitpos_ptr = & last_bitpos[0][0];
-    for (ci = 0; ci < cinfo->num_components; ci++) 
+    for (ci = 0; ci < cinfo->num_components; ci++)
       for (coefi = 0; coefi < DCTSIZE2; coefi++)
-	*last_bitpos_ptr++ = -1;
+        *last_bitpos_ptr++ = -1;
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
     cinfo->progressive_mode = FALSE;
-    for (ci = 0; ci < cinfo->num_components; ci++) 
+    for (ci = 0; ci < cinfo->num_components; ci++)
       component_sent[ci] = FALSE;
   }
 
@@ -208,10 +209,10 @@
     for (ci = 0; ci < ncomps; ci++) {
       thisi = scanptr->component_index[ci];
       if (thisi < 0 || thisi >= cinfo->num_components)
-	ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+        ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
       /* Components must appear in SOF order within each scan */
       if (ci > 0 && thisi <= scanptr->component_index[ci-1])
-	ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+        ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
     }
     /* Validate progression parameters */
     Ss = scanptr->Ss;
@@ -233,43 +234,43 @@
 #define MAX_AH_AL 13
 #endif
       if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 ||
-	  Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
-	ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+          Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
+        ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       if (Ss == 0) {
-	if (Se != 0)		/* DC and AC together not OK */
-	  ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        if (Se != 0)            /* DC and AC together not OK */
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       } else {
-	if (ncomps != 1)	/* AC scans must be for only one component */
-	  ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        if (ncomps != 1)        /* AC scans must be for only one component */
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       }
       for (ci = 0; ci < ncomps; ci++) {
-	last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
-	if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
-	  ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
-	for (coefi = Ss; coefi <= Se; coefi++) {
-	  if (last_bitpos_ptr[coefi] < 0) {
-	    /* first scan of this coefficient */
-	    if (Ah != 0)
-	      ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
-	  } else {
-	    /* not first scan */
-	    if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
-	      ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
-	  }
-	  last_bitpos_ptr[coefi] = Al;
-	}
+        last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
+        if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        for (coefi = Ss; coefi <= Se; coefi++) {
+          if (last_bitpos_ptr[coefi] < 0) {
+            /* first scan of this coefficient */
+            if (Ah != 0)
+              ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+          } else {
+            /* not first scan */
+            if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
+              ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+          }
+          last_bitpos_ptr[coefi] = Al;
+        }
       }
 #endif
     } else {
       /* For sequential JPEG, all progression parameters must be these: */
       if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0)
-	ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+        ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       /* Make sure components are not sent twice */
       for (ci = 0; ci < ncomps; ci++) {
-	thisi = scanptr->component_index[ci];
-	if (component_sent[thisi])
-	  ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
-	component_sent[thisi] = TRUE;
+        thisi = scanptr->component_index[ci];
+        if (component_sent[thisi])
+          ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
+        component_sent[thisi] = TRUE;
       }
     }
   }
@@ -284,13 +285,13 @@
      */
     for (ci = 0; ci < cinfo->num_components; ci++) {
       if (last_bitpos[ci][0] < 0)
-	ERREXIT(cinfo, JERR_MISSING_DATA);
+        ERREXIT(cinfo, JERR_MISSING_DATA);
     }
 #endif
   } else {
     for (ci = 0; ci < cinfo->num_components; ci++) {
       if (! component_sent[ci])
-	ERREXIT(cinfo, JERR_MISSING_DATA);
+        ERREXIT(cinfo, JERR_MISSING_DATA);
     }
   }
 }
@@ -308,12 +309,12 @@
   if (cinfo->scan_info != NULL) {
     /* Prepare for current scan --- the script is already validated */
     my_master_ptr master = (my_master_ptr) cinfo->master;
-    const jpeg_scan_info * scanptr = cinfo->scan_info + master->scan_number;
+    const jpeg_scan_info *scanptr = cinfo->scan_info + master->scan_number;
 
     cinfo->comps_in_scan = scanptr->comps_in_scan;
     for (ci = 0; ci < scanptr->comps_in_scan; ci++) {
       cinfo->cur_comp_info[ci] =
-	&cinfo->comp_info[scanptr->component_index[ci]];
+        &cinfo->comp_info[scanptr->component_index[ci]];
     }
     cinfo->Ss = scanptr->Ss;
     cinfo->Se = scanptr->Se;
@@ -326,7 +327,7 @@
     /* Prepare for single sequential-JPEG scan containing all components */
     if (cinfo->num_components > MAX_COMPS_IN_SCAN)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
-	       MAX_COMPS_IN_SCAN);
+               MAX_COMPS_IN_SCAN);
     cinfo->comps_in_scan = cinfo->num_components;
     for (ci = 0; ci < cinfo->num_components; ci++) {
       cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
@@ -346,16 +347,16 @@
 {
   int ci, mcublks, tmp;
   jpeg_component_info *compptr;
-  
+
   if (cinfo->comps_in_scan == 1) {
-    
+
     /* Noninterleaved (single-component) scan */
     compptr = cinfo->cur_comp_info[0];
-    
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = compptr->width_in_blocks;
     cinfo->MCU_rows_in_scan = compptr->height_in_blocks;
-    
+
     /* For noninterleaved scan, always one block per MCU */
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
@@ -368,28 +369,28 @@
     tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
-    
+
     /* Prepare array describing MCU composition */
     cinfo->blocks_in_MCU = 1;
     cinfo->MCU_membership[0] = 0;
-    
+
   } else {
-    
+
     /* Interleaved (multi-component) scan */
     if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan,
-	       MAX_COMPS_IN_SCAN);
-    
+               MAX_COMPS_IN_SCAN);
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_width,
-		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor*DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
       jdiv_round_up((long) cinfo->_jpeg_height,
-		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
-    
+                    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+
     cinfo->blocks_in_MCU = 0;
-    
+
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       compptr = cinfo->cur_comp_info[ci];
       /* Sampling factors give # of blocks of component in each MCU */
@@ -407,12 +408,12 @@
       /* Prepare array describing MCU composition */
       mcublks = compptr->MCU_blocks;
       if (cinfo->blocks_in_MCU + mcublks > C_MAX_BLOCKS_IN_MCU)
-	ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
+        ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
       while (mcublks-- > 0) {
-	cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
+        cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
       }
     }
-    
+
   }
 
   /* Convert restart specified in rows to actual MCU count. */
@@ -452,8 +453,8 @@
     (*cinfo->fdct->start_pass) (cinfo);
     (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding);
     (*cinfo->coef->start_pass) (cinfo,
-				(master->total_passes > 1 ?
-				 JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
+                                (master->total_passes > 1 ?
+                                 JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
     (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU);
     if (cinfo->optimize_coding) {
       /* No immediate data output; postpone writing frame/scan headers */
@@ -581,7 +582,7 @@
 
   master = (my_master_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(my_comp_master));
+                                  sizeof(my_comp_master));
   cinfo->master = (struct jpeg_comp_master *) master;
   master->pub.prepare_for_pass = prepare_for_pass;
   master->pub.pass_startup = pass_startup;
@@ -602,7 +603,7 @@
     cinfo->num_scans = 1;
   }
 
-  if (cinfo->progressive_mode && !cinfo->arith_code)	/*  TEMPORARY HACK ??? */
+  if (cinfo->progressive_mode && !cinfo->arith_code)  /*  TEMPORARY HACK ??? */
     cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
 
   /* Initialize my private state */
diff --git a/jcomapi.c b/jcomapi.c
index 9b1fa75..6e5bf3d 100644
--- a/jcomapi.c
+++ b/jcomapi.c
@@ -1,9 +1,12 @@
 /*
  * jcomapi.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface routines that are used for both
  * compression and decompression.
@@ -72,8 +75,8 @@
   /* NB: mem pointer is NULL if memory mgr failed to initialize. */
   if (cinfo->mem != NULL)
     (*cinfo->mem->self_destruct) (cinfo);
-  cinfo->mem = NULL;		/* be safe if jpeg_destroy is called twice */
-  cinfo->global_state = 0;	/* mark it destroyed */
+  cinfo->mem = NULL;            /* be safe if jpeg_destroy is called twice */
+  cinfo->global_state = 0;      /* mark it destroyed */
 }
 
 
@@ -88,8 +91,8 @@
   JQUANT_TBL *tbl;
 
   tbl = (JQUANT_TBL *)
-    (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, SIZEOF(JQUANT_TBL));
-  tbl->sent_table = FALSE;	/* make sure this is false in any new table */
+    (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, sizeof(JQUANT_TBL));
+  tbl->sent_table = FALSE;      /* make sure this is false in any new table */
   return tbl;
 }
 
@@ -100,7 +103,7 @@
   JHUFF_TBL *tbl;
 
   tbl = (JHUFF_TBL *)
-    (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, SIZEOF(JHUFF_TBL));
-  tbl->sent_table = FALSE;	/* make sure this is false in any new table */
+    (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, sizeof(JHUFF_TBL));
+  tbl->sent_table = FALSE;      /* make sure this is false in any new table */
   return tbl;
 }
diff --git a/jconfig.h b/jconfig.h
index b619181..fe4a437 100644
--- a/jconfig.h
+++ b/jconfig.h
@@ -5,19 +5,27 @@
 #define JPEG_LIB_VERSION 62
 
 /* libjpeg-turbo version */
-#define LIBJPEG_TURBO_VERSION 1.3.1
+#define LIBJPEG_TURBO_VERSION 1.4.90  /* keep in sync with VERSION in jconfigint.h */
 
 /* Support arithmetic encoding */
-/* #undef C_ARITH_CODING_SUPPORTED */
+/* #define C_ARITH_CODING_SUPPORTED 1 */
 
 /* Support arithmetic decoding */
-/* #undef D_ARITH_CODING_SUPPORTED */
+/* #define D_ARITH_CODING_SUPPORTED 1 */
 
-/* Support in-memory source/destination managers */
-/* #undef MEM_SRCDST_SUPPORTED */
+/*
+ * Define BITS_IN_JSAMPLE as either
+ *   8   for 8-bit sample values (the usual setting)
+ *   12  for 12-bit sample values
+ * Only 8 and 12 are legal data precisions for lossy JPEG according to the
+ * JPEG standard, and the IJG code does not support anything else!
+ * We do not support run-time selection of data precision, sorry.
+ */
 
-/* Define if your compiler supports prototypes */
-#define HAVE_PROTOTYPES 1
+#define BITS_IN_JSAMPLE  8      /* use 8 or 12 */
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
 
 /* Define to 1 if you have the <stddef.h> header file. */
 #define HAVE_STDDEF_H 1
@@ -31,19 +39,21 @@
 /* Define to 1 if the system has the type `unsigned short'. */
 #define HAVE_UNSIGNED_SHORT 1
 
-/* Define if you want use complete types */
+/* Compiler does not support pointers to undefined structures. */
 /* #undef INCOMPLETE_TYPES_BROKEN */
 
-/* Define if you have BSD-like bzero and bcopy */
+/* Support in-memory source/destination managers */
+/* #undef MEM_SRCDST_SUPPORTED */
+
+/* Define if you have BSD-like bzero and bcopy in <strings.h> rather than
+   memset/memcpy in <string.h>. */
 /* #undef NEED_BSD_STRINGS */
 
-/* Define if you need short function names */
-/* #undef NEED_SHORT_EXTERNAL_NAMES */
+/* Define if you need to include <sys/types.h> to get size_t. */
+/* #undef NEED_SYS_TYPES_H */
 
-/* Define if you have sys/types.h */
-/* #undef NEED_SYS_TYPES_H */
-
-/* Define if shift is unsigned */
+/* Define if your (broken) compiler shifts signed values as if they were
+   unsigned. */
 /* #undef RIGHT_SHIFT_IS_UNSIGNED */
 
 /* Use accelerated SIMD routines. */
diff --git a/jconfig.h.in b/jconfig.h.in
new file mode 100644
index 0000000..42d86f2
--- /dev/null
+++ b/jconfig.h.in
@@ -0,0 +1,70 @@
+/* Version ID for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ */
+#define JPEG_LIB_VERSION  62	/* Version 6b */
+
+/* libjpeg-turbo version */
+#define LIBJPEG_TURBO_VERSION 0
+
+/* Support arithmetic encoding */
+#undef C_ARITH_CODING_SUPPORTED
+
+/* Support arithmetic decoding */
+#undef D_ARITH_CODING_SUPPORTED
+
+/*
+ * Define BITS_IN_JSAMPLE as either
+ *   8   for 8-bit sample values (the usual setting)
+ *   12  for 12-bit sample values
+ * Only 8 and 12 are legal data precisions for lossy JPEG according to the
+ * JPEG standard, and the IJG code does not support anything else!
+ * We do not support run-time selection of data precision, sorry.
+ */
+
+#define BITS_IN_JSAMPLE  8      /* use 8 or 12 */
+
+/* Define to 1 if you have the <locale.h> header file. */
+#undef HAVE_LOCALE_H
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#undef HAVE_STDDEF_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#undef HAVE_UNSIGNED_CHAR
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#undef HAVE_UNSIGNED_SHORT
+
+/* Compiler does not support pointers to undefined structures. */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Support in-memory source/destination managers */
+#undef MEM_SRCDST_SUPPORTED
+
+/* Define if you have BSD-like bzero and bcopy in <strings.h> rather than
+   memset/memcpy in <string.h>. */
+#undef NEED_BSD_STRINGS
+
+/* Define if you need to include <sys/types.h> to get size_t. */
+#undef NEED_SYS_TYPES_H
+
+/* Define if your (broken) compiler shifts signed values as if they were
+   unsigned. */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+/* Use accelerated SIMD routines. */
+#undef WITH_SIMD
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+# undef __CHAR_UNSIGNED__
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t
diff --git a/jconfig.txt b/jconfig.txt
new file mode 100644
index 0000000..808f87f
--- /dev/null
+++ b/jconfig.txt
@@ -0,0 +1,143 @@
+/*
+ * jconfig.txt
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1994, Thomas G. Lane.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file documents the configuration options that are required to
+ * customize the JPEG software for a particular system.
+ *
+ * The actual configuration options for a particular installation are stored
+ * in jconfig.h.  On many machines, jconfig.h can be generated automatically
+ * or copied from one of the "canned" jconfig files that we supply.  But if
+ * you need to generate a jconfig.h file by hand, this file tells you how.
+ *
+ * DO NOT EDIT THIS FILE --- IT WON'T ACCOMPLISH ANYTHING.
+ * EDIT A COPY NAMED JCONFIG.H.
+ */
+
+
+/*
+ * These symbols indicate the properties of your machine or compiler.
+ * #define the symbol if yes, #undef it if no.
+ */
+
+/* Does your compiler support the declaration "unsigned char" ?
+ * How about "unsigned short" ?
+ */
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+
+/* Define "void" as "char" if your compiler doesn't know about type void.
+ * NOTE: be sure to define void such that "void *" represents the most general
+ * pointer type, e.g., that returned by malloc().
+ */
+/* #define void char */
+
+/* Define "const" as empty if your compiler doesn't know the "const" keyword.
+ */
+/* #define const */
+
+/* Define this if an ordinary "char" type is unsigned.
+ * If you're not sure, leaving it undefined will work at some cost in speed.
+ * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
+ */
+#undef __CHAR_UNSIGNED__
+
+/* Define this if your system has an ANSI-conforming <stddef.h> file.
+ */
+#define HAVE_STDDEF_H
+
+/* Define this if your system has an ANSI-conforming <stdlib.h> file.
+ */
+#define HAVE_STDLIB_H
+
+/* Define this if your system does not have an ANSI/SysV <string.h>,
+ * but does have a BSD-style <strings.h>.
+ */
+#undef NEED_BSD_STRINGS
+
+/* Define this if your system does not provide typedef size_t in any of the
+ * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
+ * <sys/types.h> instead.
+ */
+#undef NEED_SYS_TYPES_H
+
+/* Although a real ANSI C compiler can deal perfectly well with pointers to
+ * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
+ * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
+ * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
+ * actually get "missing structure definition" warnings or errors while
+ * compiling the JPEG code.
+ */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define "boolean" as unsigned char, not int, on Windows systems.
+ */
+#ifdef _WIN32
+#ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
+#endif
+#define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
+#endif
+
+
+/*
+ * The following options affect code selection within the JPEG library,
+ * but they don't need to be visible to applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+
+/* Define this if your compiler implements ">>" on signed values as a logical
+ * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
+ * which is the normal and rational definition.
+ */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+
+#endif /* JPEG_INTERNALS */
+
+
+/*
+ * The remaining options do not affect the JPEG library proper,
+ * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
+ * Other applications can ignore these.
+ */
+
+#ifdef JPEG_CJPEG_DJPEG
+
+/* These defines indicate which image (non-JPEG) file formats are allowed. */
+
+#define BMP_SUPPORTED           /* BMP image file format */
+#define GIF_SUPPORTED           /* GIF image file format */
+#define PPM_SUPPORTED           /* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED            /* Utah RLE image file format */
+#define TARGA_SUPPORTED         /* Targa image file format */
+
+/* Define this if you want to name both input and output files on the command
+ * line, rather than using stdout and optionally stdin.  You MUST do this if
+ * your system can't cope with binary I/O to stdin/stdout.  See comments at
+ * head of cjpeg.c or djpeg.c.
+ */
+#undef TWO_FILE_COMMANDLINE
+
+/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
+ * This is necessary on systems that distinguish text files from binary files,
+ * and is harmless on most systems that don't.  If you have one of the rare
+ * systems that complains about the "b" spec, define this symbol.
+ */
+#undef DONT_USE_B_MODE
+
+/* Define this if you want percent-done progress reports from cjpeg/djpeg.
+ */
+#undef PROGRESS_REPORT
+
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfigint.h b/jconfigint.h
new file mode 100644
index 0000000..587f8ca
--- /dev/null
+++ b/jconfigint.h
@@ -0,0 +1,31 @@
+/* jconfigint.h.  Generated from jconfigint.h.in by configure.  */
+/* libjpeg-turbo build number */
+#define BUILD ""
+
+/* How to obtain function inlining. */
+#ifndef INLINE
+  #ifndef TURBO_FOR_WINDOWS
+    #define INLINE inline __attribute__((always_inline))
+  #else
+    #if defined(__GNUC__)
+      #define INLINE inline __attribute__((always_inline))
+    #elif defined(_MSC_VER)
+      #define INLINE __forceinline
+    #else
+      #define INLINE
+    #endif
+  #endif
+#endif
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libjpeg-turbo"
+
+/* Version number of package */
+#define VERSION "1.4.90"
+
+/* Size of `size_t' in bytes.  __WORDSIZE is libc-specific and reads as 0 in #if when undefined, so also test __LP64__ and _WIN64. */
+#if defined(_WIN64) || defined(__LP64__) || (defined(__WORDSIZE) && __WORDSIZE==64)
+#define SIZEOF_SIZE_T 8
+#else
+#define SIZEOF_SIZE_T 4
+#endif
diff --git a/jconfigint.h.in b/jconfigint.h.in
new file mode 100644
index 0000000..940424e
--- /dev/null
+++ b/jconfigint.h.in
@@ -0,0 +1,14 @@
+/* libjpeg-turbo build number */
+#undef BUILD
+
+/* How to obtain function inlining. */
+#undef INLINE
+
+/* Define to the full name of this package. */
+#undef PACKAGE_NAME
+
+/* Version number of package */
+#undef VERSION
+
+/* The size of `size_t', as computed by sizeof. */
+#undef SIZEOF_SIZE_T
diff --git a/jcparam.c b/jcparam.c
index 2b9a740..18b2d48 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -6,7 +6,8 @@
  * Modified 2003-2008 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains optional default-setting code for the JPEG compressor.
  * Applications do not have to use this file, but those that don't use it
@@ -16,6 +17,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jstdhuff.c"
 
 
 /*
@@ -24,15 +26,15 @@
 
 GLOBAL(void)
 jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-		      const unsigned int *basic_table,
-		      int scale_factor, boolean force_baseline)
+                      const unsigned int *basic_table,
+                      int scale_factor, boolean force_baseline)
 /* Define a quantization table equal to the basic_table times
  * a scale factor (given as a percentage).
  * If force_baseline is TRUE, the computed quantization table entries
  * are limited to 1..255 for JPEG baseline compatibility.
  */
 {
-  JQUANT_TBL ** qtblptr;
+  JQUANT_TBL **qtblptr;
   int i;
   long temp;
 
@@ -54,7 +56,7 @@
     if (temp <= 0L) temp = 1L;
     if (temp > 32767L) temp = 32767L; /* max quantizer needed for 12 bits */
     if (force_baseline && temp > 255L)
-      temp = 255L;		/* limit to baseline range if requested */
+      temp = 255L;              /* limit to baseline range if requested */
     (*qtblptr)->quantval[i] = (UINT16) temp;
   }
 
@@ -99,16 +101,16 @@
 {
   /* Set up two quantization tables using the specified scaling */
   jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-		       cinfo->q_scale_factor[0], force_baseline);
+                       cinfo->q_scale_factor[0], force_baseline);
   jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-		       cinfo->q_scale_factor[1], force_baseline);
+                       cinfo->q_scale_factor[1], force_baseline);
 }
 #endif
 
 
 GLOBAL(void)
 jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-			 boolean force_baseline)
+                         boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables
  * and a straight percentage-scaling quality scale.  In most cases it's better
  * to use jpeg_set_quality (below); this entry point is provided for
@@ -117,9 +119,9 @@
 {
   /* Set up two quantization tables using the specified scaling */
   jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-		       scale_factor, force_baseline);
+                       scale_factor, force_baseline);
   jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-		       scale_factor, force_baseline);
+                       scale_factor, force_baseline);
 }
 
 
@@ -166,116 +168,6 @@
 
 
 /*
- * Huffman table setup routines
- */
-
-LOCAL(void)
-add_huff_table (j_compress_ptr cinfo,
-		JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
-/* Define a Huffman table */
-{
-  int nsymbols, len;
-
-  if (*htblptr == NULL)
-    *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-
-  /* Copy the number-of-symbols-of-each-code-length counts */
-  MEMCOPY((*htblptr)->bits, bits, SIZEOF((*htblptr)->bits));
-
-  /* Validate the counts.  We do this here mainly so we can copy the right
-   * number of symbols from the val[] array, without risking marching off
-   * the end of memory.  jchuff.c will do a more thorough test later.
-   */
-  nsymbols = 0;
-  for (len = 1; len <= 16; len++)
-    nsymbols += bits[len];
-  if (nsymbols < 1 || nsymbols > 256)
-    ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
-
-  MEMCOPY((*htblptr)->huffval, val, nsymbols * SIZEOF(UINT8));
-
-  /* Initialize sent_table FALSE so table will be written to JPEG file. */
-  (*htblptr)->sent_table = FALSE;
-}
-
-
-LOCAL(void)
-std_huff_tables (j_compress_ptr cinfo)
-/* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
-/* IMPORTANT: these are only valid for 8-bit data precision! */
-{
-  static const UINT8 bits_dc_luminance[17] =
-    { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_luminance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
-  
-  static const UINT8 bits_dc_chrominance[17] =
-    { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_chrominance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
-  
-  static const UINT8 bits_ac_luminance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
-  static const UINT8 val_ac_luminance[] =
-    { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
-      0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
-      0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
-      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
-      0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
-      0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
-      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
-      0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
-      0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
-      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-      0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-      0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
-      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-      0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-      0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
-      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
-      0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
-      0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
-      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
-      0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
-  
-  static const UINT8 bits_ac_chrominance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
-  static const UINT8 val_ac_chrominance[] =
-    { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
-      0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
-      0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
-      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
-      0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
-      0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
-      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
-      0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
-      0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
-      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
-      0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-      0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
-      0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
-      0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
-      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
-      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
-      0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
-      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
-      0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
-  
-  add_huff_table(cinfo, &cinfo->dc_huff_tbl_ptrs[0],
-		 bits_dc_luminance, val_dc_luminance);
-  add_huff_table(cinfo, &cinfo->ac_huff_tbl_ptrs[0],
-		 bits_ac_luminance, val_ac_luminance);
-  add_huff_table(cinfo, &cinfo->dc_huff_tbl_ptrs[1],
-		 bits_dc_chrominance, val_dc_chrominance);
-  add_huff_table(cinfo, &cinfo->ac_huff_tbl_ptrs[1],
-		 bits_ac_chrominance, val_ac_chrominance);
-}
-
-
-/*
  * Default parameter setup for compression.
  *
  * Applications that don't choose to use this routine must do their
@@ -301,19 +193,19 @@
   if (cinfo->comp_info == NULL)
     cinfo->comp_info = (jpeg_component_info *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  MAX_COMPONENTS * SIZEOF(jpeg_component_info));
+                                  MAX_COMPONENTS * sizeof(jpeg_component_info));
 
   /* Initialize everything not dependent on the color space */
 
 #if JPEG_LIB_VERSION >= 70
-  cinfo->scale_num = 1;		/* 1:1 scaling */
+  cinfo->scale_num = 1;         /* 1:1 scaling */
   cinfo->scale_denom = 1;
 #endif
   cinfo->data_precision = BITS_IN_JSAMPLE;
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
   /* Set up two Huffman tables */
-  std_huff_tables(cinfo);
+  std_huff_tables((j_common_ptr) cinfo);
 
   /* Initialize default arithmetic coding conditioning */
   for (i = 0; i < NUM_ARITH_TBLS; i++) {
@@ -371,8 +263,8 @@
    */
   cinfo->JFIF_major_version = 1; /* Default JFIF version = 1.01 */
   cinfo->JFIF_minor_version = 1;
-  cinfo->density_unit = 0;	/* Pixel size is unknown by default */
-  cinfo->X_density = 1;		/* Pixel aspect ratio is square by default */
+  cinfo->density_unit = 0;      /* Pixel size is unknown by default */
+  cinfo->X_density = 1;         /* Pixel aspect ratio is square by default */
   cinfo->Y_density = 1;
 
   /* Choose JPEG colorspace based on input space, set defaults accordingly */
@@ -430,7 +322,7 @@
 GLOBAL(void)
 jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
 {
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   int ci;
 
 #define SET_COMP(index,id,hsamp,vsamp,quant,dctbl,actbl)  \
@@ -498,7 +390,7 @@
     cinfo->num_components = cinfo->input_components;
     if (cinfo->num_components < 1 || cinfo->num_components > MAX_COMPONENTS)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
-	       MAX_COMPONENTS);
+               MAX_COMPONENTS);
     for (ci = 0; ci < cinfo->num_components; ci++) {
       SET_COMP(ci, ci, 1,1, 0, 0,0);
     }
@@ -512,8 +404,8 @@
 #ifdef C_PROGRESSIVE_SUPPORTED
 
 LOCAL(jpeg_scan_info *)
-fill_a_scan (jpeg_scan_info * scanptr, int ci,
-	     int Ss, int Se, int Ah, int Al)
+fill_a_scan (jpeg_scan_info *scanptr, int ci,
+             int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for specified component */
 {
   scanptr->comps_in_scan = 1;
@@ -527,8 +419,8 @@
 }
 
 LOCAL(jpeg_scan_info *)
-fill_scans (jpeg_scan_info * scanptr, int ncomps,
-	    int Ss, int Se, int Ah, int Al)
+fill_scans (jpeg_scan_info *scanptr, int ncomps,
+            int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for each component */
 {
   int ci;
@@ -546,7 +438,7 @@
 }
 
 LOCAL(jpeg_scan_info *)
-fill_dc_scans (jpeg_scan_info * scanptr, int ncomps, int Ah, int Al)
+fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
 /* Support routine: generate interleaved DC scan if possible, else N scans */
 {
   int ci;
@@ -578,7 +470,7 @@
 {
   int ncomps = cinfo->num_components;
   int nscans;
-  jpeg_scan_info * scanptr;
+  jpeg_scan_info *scanptr;
 
   /* Safety check to ensure start_compress not called yet. */
   if (cinfo->global_state != CSTATE_START)
@@ -591,9 +483,9 @@
   } else {
     /* All-purpose script for other color spaces. */
     if (ncomps > MAX_COMPS_IN_SCAN)
-      nscans = 6 * ncomps;	/* 2 DC + 4 AC scans per component */
+      nscans = 6 * ncomps;      /* 2 DC + 4 AC scans per component */
     else
-      nscans = 2 + 4 * ncomps;	/* 2 DC scans; 4 AC scans per component */
+      nscans = 2 + 4 * ncomps;  /* 2 DC scans; 4 AC scans per component */
   }
 
   /* Allocate space for script.
@@ -607,7 +499,7 @@
     cinfo->script_space_size = MAX(nscans, 10);
     cinfo->script_space = (jpeg_scan_info *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-			cinfo->script_space_size * SIZEOF(jpeg_scan_info));
+                        cinfo->script_space_size * sizeof(jpeg_scan_info));
   }
   scanptr = cinfo->script_space;
   cinfo->scan_info = scanptr;
diff --git a/jcphuff.c b/jcphuff.c
index 3102871..046e2e1 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -1,9 +1,12 @@
 /*
  * jcphuff.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy encoding routines for progressive JPEG.
  *
@@ -15,7 +18,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jchuff.h"		/* Declarations shared with jchuff.c */
+#include "jchuff.h"             /* Declarations shared with jchuff.c */
 
 #ifdef C_PROGRESSIVE_SUPPORTED
 
@@ -30,36 +33,36 @@
   /* Bit-level coding status.
    * next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
    */
-  JOCTET * next_output_byte;	/* => next byte to write in buffer */
-  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
-  INT32 put_buffer;		/* current bit-accumulation buffer */
-  int put_bits;			/* # of bits now in it */
-  j_compress_ptr cinfo;		/* link to cinfo (needed for dump_buffer) */
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  size_t put_buffer;            /* current bit-accumulation buffer */
+  int put_bits;                 /* # of bits now in it */
+  j_compress_ptr cinfo;         /* link to cinfo (needed for dump_buffer) */
 
   /* Coding status for DC components */
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 
   /* Coding status for AC components */
-  int ac_tbl_no;		/* the table number of the single component */
-  unsigned int EOBRUN;		/* run length of EOBs */
-  unsigned int BE;		/* # of buffered correction bits before MCU */
-  char * bit_buffer;		/* buffer for correction bits (1 per char) */
+  int ac_tbl_no;                /* the table number of the single component */
+  unsigned int EOBRUN;          /* run length of EOBs */
+  unsigned int BE;              /* # of buffered correction bits before MCU */
+  char *bit_buffer;             /* buffer for correction bits (1 per char) */
   /* packing correction bits tightly would save some space but cost time... */
 
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-  int next_restart_num;		/* next restart number to write (0-7) */
+  unsigned int restarts_to_go;  /* MCUs left in this restart interval */
+  int next_restart_num;         /* next restart number to write (0-7) */
 
   /* Pointers to derived tables (these workspaces have image lifespan).
    * Since any one scan codes only DC or only AC, we only need one set
    * of tables, not one for DC and one for AC.
    */
-  c_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+  c_derived_tbl *derived_tbls[NUM_HUFF_TBLS];
 
   /* Statistics tables for optimization; again, one set is enough */
-  long * count_ptrs[NUM_HUFF_TBLS];
+  long *count_ptrs[NUM_HUFF_TBLS];
 } phuff_entropy_encoder;
 
-typedef phuff_entropy_encoder * phuff_entropy_ptr;
+typedef phuff_entropy_encoder *phuff_entropy_ptr;
 
 /* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
  * buffer can hold.  Larger sizes may slightly improve compression, but
@@ -67,35 +70,35 @@
  * The minimum safe size is 64 bits.
  */
 
-#define MAX_CORR_BITS  1000	/* Max # of correction bits I can buffer */
+#define MAX_CORR_BITS  1000     /* Max # of correction bits I can buffer */
 
-/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
- * We assume that int right shift is unsigned if INT32 right shift is,
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than JLONG.
+ * We assume that int right shift is unsigned if JLONG right shift is,
  * which should be safe.
  */
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define ISHIFT_TEMPS	int ishift_temp;
+#define ISHIFT_TEMPS    int ishift_temp;
 #define IRIGHT_SHIFT(x,shft)  \
-	((ishift_temp = (x)) < 0 ? \
-	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-	 (ishift_temp >> (shft)))
+        ((ishift_temp = (x)) < 0 ? \
+         (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+         (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
 #endif
 
 /* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first JPP((j_compress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_first JPP((j_compress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_DC_refine JPP((j_compress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_refine JPP((j_compress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_phuff JPP((j_compress_ptr cinfo));
-METHODDEF(void) finish_pass_gather_phuff JPP((j_compress_ptr cinfo));
+METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo,
+                                        JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo,
+                                        JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo,
+                                         JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo,
+                                         JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
 
 
 /*
@@ -104,11 +107,11 @@
 
 METHODDEF(void)
 start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
-{  
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   entropy->cinfo = cinfo;
   entropy->gather_statistics = gather_statistics;
@@ -130,9 +133,9 @@
       entropy->pub.encode_mcu = encode_mcu_AC_refine;
       /* AC refinement needs a correction bit buffer */
       if (entropy->bit_buffer == NULL)
-	entropy->bit_buffer = (char *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      MAX_CORR_BITS * SIZEOF(char));
+        entropy->bit_buffer = (char *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                      MAX_CORR_BITS * sizeof(char));
     }
   }
   if (gather_statistics)
@@ -149,8 +152,8 @@
     entropy->last_dc_val[ci] = 0;
     /* Get table index */
     if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-	continue;
+      if (cinfo->Ah != 0)       /* DC refinement needs no table */
+        continue;
       tbl = compptr->dc_tbl_no;
     } else {
       entropy->ac_tbl_no = tbl = compptr->ac_tbl_no;
@@ -163,15 +166,15 @@
       /* Allocate and zero the statistics tables */
       /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
       if (entropy->count_ptrs[tbl] == NULL)
-	entropy->count_ptrs[tbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long));
+        entropy->count_ptrs[tbl] = (long *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                      257 * sizeof(long));
+      MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
     } else {
       /* Compute derived values for Huffman table */
       /* We may do this more than once for a table, but it's not expensive */
       jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
-			      & entropy->derived_tbls[tbl]);
+                              & entropy->derived_tbls[tbl]);
     }
   }
 
@@ -196,16 +199,16 @@
 
 /* Emit a byte */
 #define emit_byte(entropy,val)  \
-	{ *(entropy)->next_output_byte++ = (JOCTET) (val);  \
-	  if (--(entropy)->free_in_buffer == 0)  \
-	    dump_buffer(entropy); }
+        { *(entropy)->next_output_byte++ = (JOCTET) (val);  \
+          if (--(entropy)->free_in_buffer == 0)  \
+            dump_buffer(entropy); }
 
 
 LOCAL(void)
 dump_buffer (phuff_entropy_ptr entropy)
 /* Empty the output buffer; we do not support suspension in this module. */
 {
-  struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
+  struct jpeg_destination_mgr *dest = entropy->cinfo->dest;
 
   if (! (*dest->empty_output_buffer) (entropy->cinfo))
     ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
@@ -228,7 +231,7 @@
 /* Emit some bits, unless we are in gather mode */
 {
   /* This routine is heavily used, so it's worth coding tightly. */
-  register INT32 put_buffer = (INT32) code;
+  register size_t put_buffer = (size_t) code;
   register int put_bits = entropy->put_bits;
 
   /* if size is 0, caller used an invalid Huffman table entry */
@@ -236,21 +239,21 @@
     ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
 
   if (entropy->gather_statistics)
-    return;			/* do nothing if we're only getting stats */
+    return;                     /* do nothing if we're only getting stats */
 
-  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
-  
-  put_bits += size;		/* new number of bits in buffer */
-  
+  put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */
+
+  put_bits += size;             /* new number of bits in buffer */
+
   put_buffer <<= 24 - put_bits; /* align incoming bits */
 
   put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
 
   while (put_bits >= 8) {
     int c = (int) ((put_buffer >> 16) & 0xFF);
-    
+
     emit_byte(entropy, c);
-    if (c == 0xFF) {		/* need to stuff a zero byte? */
+    if (c == 0xFF) {            /* need to stuff a zero byte? */
       emit_byte(entropy, 0);
     }
     put_buffer <<= 8;
@@ -281,7 +284,7 @@
   if (entropy->gather_statistics)
     entropy->count_ptrs[tbl_no][symbol]++;
   else {
-    c_derived_tbl * tbl = entropy->derived_tbls[tbl_no];
+    c_derived_tbl *tbl = entropy->derived_tbls[tbl_no];
     emit_bits(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
   }
 }
@@ -292,11 +295,11 @@
  */
 
 LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char * bufstart,
-		    unsigned int nbits)
+emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
+                    unsigned int nbits)
 {
   if (entropy->gather_statistics)
-    return;			/* no real work */
+    return;                     /* no real work */
 
   while (nbits > 0) {
     emit_bits(entropy, (unsigned int) (*bufstart), 1);
@@ -315,7 +318,7 @@
 {
   register int temp, nbits;
 
-  if (entropy->EOBRUN > 0) {	/* if there is any pending EOBRUN */
+  if (entropy->EOBRUN > 0) {    /* if there is any pending EOBRUN */
     temp = entropy->EOBRUN;
     nbits = 0;
     while ((temp >>= 1))
@@ -380,7 +383,7 @@
   int blkn, ci;
   int Al = cinfo->Al;
   JBLOCKROW block;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   ISHIFT_TEMPS
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
@@ -409,12 +412,12 @@
     /* Encode the DC coefficient difference per section G.1.2.1 */
     temp2 = temp;
     if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
+      temp = -temp;             /* temp is abs value of input */
       /* For a negative input, want temp2 = bitwise complement of abs(input) */
       /* This code assumes we are on a two's complement machine */
       temp2--;
     }
-    
+
     /* Find the number of bits needed for the magnitude of the coefficient */
     nbits = 0;
     while (temp) {
@@ -426,13 +429,13 @@
      */
     if (nbits > MAX_COEF_BITS+1)
       ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-    
+
     /* Count/emit the Huffman-coded symbol for the number of bits */
     emit_symbol(entropy, compptr->dc_tbl_no, nbits);
-    
+
     /* Emit that number of bits of the value, if positive, */
     /* or the complement of its magnitude, if negative. */
-    if (nbits)			/* emit_bits rejects calls with size 0 */
+    if (nbits)                  /* emit_bits rejects calls with size 0 */
       emit_bits(entropy, (unsigned int) temp2, nbits);
   }
 
@@ -481,9 +484,9 @@
   block = MCU_data[0];
 
   /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
-  
-  r = 0;			/* r = run length of zeros */
-   
+
+  r = 0;                        /* r = run length of zeros */
+
   for (k = cinfo->Ss; k <= Se; k++) {
     if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
       r++;
@@ -495,12 +498,12 @@
      * interwoven with finding the abs value (temp) and output bits (temp2).
      */
     if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
-      temp >>= Al;		/* apply the point transform */
+      temp = -temp;             /* temp is abs value of input */
+      temp >>= Al;              /* apply the point transform */
       /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
       temp2 = ~temp;
     } else {
-      temp >>= Al;		/* apply the point transform */
+      temp >>= Al;              /* apply the point transform */
       temp2 = temp;
     }
     /* Watch out for case that nonzero coef is zero after point transform */
@@ -519,7 +522,7 @@
     }
 
     /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;			/* there must be at least one 1 bit */
+    nbits = 1;                  /* there must be at least one 1 bit */
     while ((temp >>= 1))
       nbits++;
     /* Check for out-of-range coefficient values */
@@ -533,13 +536,13 @@
     /* or the complement of its magnitude, if negative. */
     emit_bits(entropy, (unsigned int) temp2, nbits);
 
-    r = 0;			/* reset zero run length */
+    r = 0;                      /* reset zero run length */
   }
 
-  if (r > 0) {			/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
+  if (r > 0) {                  /* If there are trailing zeroes, */
+    entropy->EOBRUN++;          /* count an EOB */
     if (entropy->EOBRUN == 0x7FFF)
-      emit_eobrun(entropy);	/* force it out to avoid overflow */
+      emit_eobrun(entropy);     /* force it out to avoid overflow */
   }
 
   cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -648,17 +651,17 @@
      * in C, we shift after obtaining the absolute value.
      */
     if (temp < 0)
-      temp = -temp;		/* temp is abs value of input */
-    temp >>= Al;		/* apply the point transform */
-    absvalues[k] = temp;	/* save abs value for main pass */
+      temp = -temp;             /* temp is abs value of input */
+    temp >>= Al;                /* apply the point transform */
+    absvalues[k] = temp;        /* save abs value for main pass */
     if (temp == 1)
-      EOB = k;			/* EOB = index of last newly-nonzero coef */
+      EOB = k;                  /* EOB = index of last newly-nonzero coef */
   }
 
   /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
-  
-  r = 0;			/* r = run length of zeros */
-  BR = 0;			/* BR = count of buffered bits added now */
+
+  r = 0;                        /* r = run length of zeros */
+  BR = 0;                       /* BR = count of buffered bits added now */
   BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
 
   for (k = cinfo->Ss; k <= Se; k++) {
@@ -705,12 +708,12 @@
     emit_buffered_bits(entropy, BR_buffer, BR);
     BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
     BR = 0;
-    r = 0;			/* reset zero run length */
+    r = 0;                      /* reset zero run length */
   }
 
-  if (r > 0 || BR > 0) {	/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
-    entropy->BE += BR;		/* concat my correction bits to older ones */
+  if (r > 0 || BR > 0) {        /* If there are trailing zeroes, */
+    entropy->EOBRUN++;          /* count an EOB */
+    entropy->BE += BR;          /* concat my correction bits to older ones */
     /* We force out the EOB if we risk either:
      * 1. overflow of the EOB counter;
      * 2. overflow of the correction bit buffer during the next MCU.
@@ -742,7 +745,7 @@
 
 METHODDEF(void)
 finish_pass_phuff (j_compress_ptr cinfo)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
@@ -767,7 +770,7 @@
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JHUFF_TBL **htblptr;
   boolean did[NUM_HUFF_TBLS];
 
@@ -779,13 +782,13 @@
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
-  MEMZERO(did, SIZEOF(did));
+  MEMZERO(did, sizeof(did));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-	continue;
+      if (cinfo->Ah != 0)       /* DC refinement needs no table */
+        continue;
       tbl = compptr->dc_tbl_no;
     } else {
       tbl = compptr->ac_tbl_no;
@@ -816,7 +819,7 @@
 
   entropy = (phuff_entropy_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(phuff_entropy_encoder));
+                                sizeof(phuff_entropy_encoder));
   cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
   entropy->pub.start_pass = start_pass_phuff;
 
@@ -825,7 +828,7 @@
     entropy->derived_tbls[i] = NULL;
     entropy->count_ptrs[i] = NULL;
   }
-  entropy->bit_buffer = NULL;	/* needed only in AC refinement scan */
+  entropy->bit_buffer = NULL;   /* needed only in AC refinement scan */
 }
 
 #endif /* C_PROGRESSIVE_SUPPORTED */
diff --git a/jcprepct.c b/jcprepct.c
index fa93333..e72ebd8 100644
--- a/jcprepct.c
+++ b/jcprepct.c
@@ -1,9 +1,12 @@
 /*
  * jcprepct.c
  *
+ * This file is part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the compression preprocessing controller.
  * This controller manages the color conversion, downsampling,
@@ -58,16 +61,16 @@
    */
   JSAMPARRAY color_buf[MAX_COMPONENTS];
 
-  JDIMENSION rows_to_go;	/* counts rows remaining in source image */
-  int next_buf_row;		/* index of next row to store in color_buf */
+  JDIMENSION rows_to_go;        /* counts rows remaining in source image */
+  int next_buf_row;             /* index of next row to store in color_buf */
 
-#ifdef CONTEXT_ROWS_SUPPORTED	/* only needed for context case */
-  int this_row_group;		/* starting row index of group to process */
-  int next_buf_stop;		/* downsample when we reach this index */
+#ifdef CONTEXT_ROWS_SUPPORTED   /* only needed for context case */
+  int this_row_group;           /* starting row index of group to process */
+  int next_buf_stop;            /* downsample when we reach this index */
 #endif
 } my_prep_controller;
 
-typedef my_prep_controller * my_prep_ptr;
+typedef my_prep_controller *my_prep_ptr;
 
 
 /*
@@ -104,13 +107,13 @@
 
 LOCAL(void)
 expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
-		    int input_rows, int output_rows)
+                    int input_rows, int output_rows)
 {
   register int row;
 
   for (row = input_rows; row < output_rows; row++) {
     jcopy_sample_rows(image_data, input_rows-1, image_data, row,
-		      1, num_cols);
+                      1, num_cols);
   }
 }
 
@@ -126,43 +129,43 @@
 
 METHODDEF(void)
 pre_process_data (j_compress_ptr cinfo,
-		  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-		  JDIMENSION in_rows_avail,
-		  JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
-		  JDIMENSION out_row_groups_avail)
+                  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
+                  JDIMENSION in_rows_avail,
+                  JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                  JDIMENSION out_row_groups_avail)
 {
   my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
   int numrows, ci;
   JDIMENSION inrows;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   while (*in_row_ctr < in_rows_avail &&
-	 *out_row_group_ctr < out_row_groups_avail) {
+         *out_row_group_ctr < out_row_groups_avail) {
     /* Do color conversion to fill the conversion buffer. */
     inrows = in_rows_avail - *in_row_ctr;
     numrows = cinfo->max_v_samp_factor - prep->next_buf_row;
     numrows = (int) MIN((JDIMENSION) numrows, inrows);
     (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
-				       prep->color_buf,
-				       (JDIMENSION) prep->next_buf_row,
-				       numrows);
+                                       prep->color_buf,
+                                       (JDIMENSION) prep->next_buf_row,
+                                       numrows);
     *in_row_ctr += numrows;
     prep->next_buf_row += numrows;
     prep->rows_to_go -= numrows;
     /* If at bottom of image, pad to fill the conversion buffer. */
     if (prep->rows_to_go == 0 &&
-	prep->next_buf_row < cinfo->max_v_samp_factor) {
+        prep->next_buf_row < cinfo->max_v_samp_factor) {
       for (ci = 0; ci < cinfo->num_components; ci++) {
-	expand_bottom_edge(prep->color_buf[ci], cinfo->image_width,
-			   prep->next_buf_row, cinfo->max_v_samp_factor);
+        expand_bottom_edge(prep->color_buf[ci], cinfo->image_width,
+                           prep->next_buf_row, cinfo->max_v_samp_factor);
       }
       prep->next_buf_row = cinfo->max_v_samp_factor;
     }
     /* If we've filled the conversion buffer, empty it. */
     if (prep->next_buf_row == cinfo->max_v_samp_factor) {
       (*cinfo->downsample->downsample) (cinfo,
-					prep->color_buf, (JDIMENSION) 0,
-					output_buf, *out_row_group_ctr);
+                                        prep->color_buf, (JDIMENSION) 0,
+                                        output_buf, *out_row_group_ctr);
       prep->next_buf_row = 0;
       (*out_row_group_ctr)++;
     }
@@ -170,16 +173,16 @@
      * Note we assume the caller is providing a one-iMCU-height output buffer!
      */
     if (prep->rows_to_go == 0 &&
-	*out_row_group_ctr < out_row_groups_avail) {
+        *out_row_group_ctr < out_row_groups_avail) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	   ci++, compptr++) {
-	expand_bottom_edge(output_buf[ci],
-			   compptr->width_in_blocks * DCTSIZE,
-			   (int) (*out_row_group_ctr * compptr->v_samp_factor),
-			   (int) (out_row_groups_avail * compptr->v_samp_factor));
+           ci++, compptr++) {
+        expand_bottom_edge(output_buf[ci],
+                           compptr->width_in_blocks * DCTSIZE,
+                           (int) (*out_row_group_ctr * compptr->v_samp_factor),
+                           (int) (out_row_groups_avail * compptr->v_samp_factor));
       }
       *out_row_group_ctr = out_row_groups_avail;
-      break;			/* can exit outer loop without test */
+      break;                    /* can exit outer loop without test */
     }
   }
 }
@@ -193,10 +196,10 @@
 
 METHODDEF(void)
 pre_process_context (j_compress_ptr cinfo,
-		     JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-		     JDIMENSION in_rows_avail,
-		     JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
-		     JDIMENSION out_row_groups_avail)
+                     JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
+                     JDIMENSION in_rows_avail,
+                     JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                     JDIMENSION out_row_groups_avail)
 {
   my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
   int numrows, ci;
@@ -210,19 +213,19 @@
       numrows = prep->next_buf_stop - prep->next_buf_row;
       numrows = (int) MIN((JDIMENSION) numrows, inrows);
       (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
-					 prep->color_buf,
-					 (JDIMENSION) prep->next_buf_row,
-					 numrows);
+                                         prep->color_buf,
+                                         (JDIMENSION) prep->next_buf_row,
+                                         numrows);
       /* Pad at top of image, if first time through */
       if (prep->rows_to_go == cinfo->image_height) {
-	for (ci = 0; ci < cinfo->num_components; ci++) {
-	  int row;
-	  for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
-	    jcopy_sample_rows(prep->color_buf[ci], 0,
-			      prep->color_buf[ci], -row,
-			      1, cinfo->image_width);
-	  }
-	}
+        for (ci = 0; ci < cinfo->num_components; ci++) {
+          int row;
+          for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
+            jcopy_sample_rows(prep->color_buf[ci], 0,
+                              prep->color_buf[ci], -row,
+                              1, cinfo->image_width);
+          }
+        }
       }
       *in_row_ctr += numrows;
       prep->next_buf_row += numrows;
@@ -230,29 +233,29 @@
     } else {
       /* Return for more data, unless we are at the bottom of the image. */
       if (prep->rows_to_go != 0)
-	break;
+        break;
       /* When at bottom of image, pad to fill the conversion buffer. */
       if (prep->next_buf_row < prep->next_buf_stop) {
-	for (ci = 0; ci < cinfo->num_components; ci++) {
-	  expand_bottom_edge(prep->color_buf[ci], cinfo->image_width,
-			     prep->next_buf_row, prep->next_buf_stop);
-	}
-	prep->next_buf_row = prep->next_buf_stop;
+        for (ci = 0; ci < cinfo->num_components; ci++) {
+          expand_bottom_edge(prep->color_buf[ci], cinfo->image_width,
+                             prep->next_buf_row, prep->next_buf_stop);
+        }
+        prep->next_buf_row = prep->next_buf_stop;
       }
     }
     /* If we've gotten enough data, downsample a row group. */
     if (prep->next_buf_row == prep->next_buf_stop) {
       (*cinfo->downsample->downsample) (cinfo,
-					prep->color_buf,
-					(JDIMENSION) prep->this_row_group,
-					output_buf, *out_row_group_ctr);
+                                        prep->color_buf,
+                                        (JDIMENSION) prep->this_row_group,
+                                        output_buf, *out_row_group_ctr);
       (*out_row_group_ctr)++;
       /* Advance pointers with wraparound as necessary. */
       prep->this_row_group += cinfo->max_v_samp_factor;
       if (prep->this_row_group >= buf_height)
-	prep->this_row_group = 0;
+        prep->this_row_group = 0;
       if (prep->next_buf_row >= buf_height)
-	prep->next_buf_row = 0;
+        prep->next_buf_row = 0;
       prep->next_buf_stop = prep->next_buf_row + cinfo->max_v_samp_factor;
     }
   }
@@ -269,7 +272,7 @@
   my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
   int rgroup_height = cinfo->max_v_samp_factor;
   int ci, i;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JSAMPARRAY true_buffer, fake_buffer;
 
   /* Grab enough space for fake row pointers for all the components;
@@ -277,8 +280,8 @@
    */
   fake_buffer = (JSAMPARRAY)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(cinfo->num_components * 5 * rgroup_height) *
-				SIZEOF(JSAMPROW));
+                                (cinfo->num_components * 5 * rgroup_height) *
+                                sizeof(JSAMPROW));
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -289,11 +292,11 @@
     true_buffer = (*cinfo->mem->alloc_sarray)
       ((j_common_ptr) cinfo, JPOOL_IMAGE,
        (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
-		      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+                      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
        (JDIMENSION) (3 * rgroup_height));
     /* Copy true buffer row pointers into the middle of the fake row array */
     MEMCOPY(fake_buffer + rgroup_height, true_buffer,
-	    3 * rgroup_height * SIZEOF(JSAMPROW));
+            3 * rgroup_height * sizeof(JSAMPROW));
     /* Fill in the above and below wraparound pointers */
     for (i = 0; i < rgroup_height; i++) {
       fake_buffer[i] = true_buffer[2 * rgroup_height + i];
@@ -316,14 +319,14 @@
 {
   my_prep_ptr prep;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
-  if (need_full_buffer)		/* safety check */
+  if (need_full_buffer)         /* safety check */
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 
   prep = (my_prep_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_prep_controller));
+                                sizeof(my_prep_controller));
   cinfo->prep = (struct jpeg_c_prep_controller *) prep;
   prep->pub.start_pass = start_pass_prep;
 
@@ -343,12 +346,12 @@
     /* No context, just make it tall enough for one row group */
     prep->pub.pre_process_data = pre_process_data;
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
+         ci++, compptr++) {
       prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
-			cinfo->max_h_samp_factor) / compptr->h_samp_factor),
-	 (JDIMENSION) cinfo->max_v_samp_factor);
+        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+         (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+                        cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+         (JDIMENSION) cinfo->max_v_samp_factor);
     }
   }
 }
diff --git a/jcsample.c b/jcsample.c
index eea376f..879bd51 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -1,10 +1,14 @@
 /*
  * jcsample.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2014, MIPS Technologies, Inc., California
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains downsampling routines.
  *
@@ -53,20 +57,21 @@
 
 
 /* Pointer to routine to downsample a single component */
-typedef JMETHOD(void, downsample1_ptr,
-		(j_compress_ptr cinfo, jpeg_component_info * compptr,
-		 JSAMPARRAY input_data, JSAMPARRAY output_data));
+typedef void (*downsample1_ptr) (j_compress_ptr cinfo,
+                                 jpeg_component_info *compptr,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY output_data);
 
 /* Private subobject */
 
 typedef struct {
-  struct jpeg_downsampler pub;	/* public fields */
+  struct jpeg_downsampler pub;  /* public fields */
 
   /* Downsampling method pointers, one per component */
   downsample1_ptr methods[MAX_COMPONENTS];
 } my_downsampler;
 
-typedef my_downsampler * my_downsample_ptr;
+typedef my_downsampler *my_downsample_ptr;
 
 
 /*
@@ -87,7 +92,7 @@
 
 LOCAL(void)
 expand_right_edge (JSAMPARRAY image_data, int num_rows,
-		   JDIMENSION input_cols, JDIMENSION output_cols)
+                   JDIMENSION input_cols, JDIMENSION output_cols)
 {
   register JSAMPROW ptr;
   register JSAMPLE pixval;
@@ -98,9 +103,9 @@
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];		/* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
       for (count = numcols; count > 0; count--)
-	*ptr++ = pixval;
+        *ptr++ = pixval;
     }
   }
 }
@@ -114,12 +119,12 @@
 
 METHODDEF(void)
 sep_downsample (j_compress_ptr cinfo,
-		JSAMPIMAGE input_buf, JDIMENSION in_row_index,
-		JSAMPIMAGE output_buf, JDIMENSION out_row_group_index)
+                JSAMPIMAGE input_buf, JDIMENSION in_row_index,
+                JSAMPIMAGE output_buf, JDIMENSION out_row_group_index)
 {
   my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JSAMPARRAY in_ptr, out_ptr;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -139,14 +144,14 @@
  */
 
 METHODDEF(void)
-int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-		JSAMPARRAY input_data, JSAMPARRAY output_data)
+int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
-  JDIMENSION outcol, outcol_h;	/* outcol_h == outcol*h_expand */
+  JDIMENSION outcol, outcol_h;  /* outcol_h == outcol*h_expand */
   JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
   JSAMPROW inptr, outptr;
-  INT32 outvalue;
+  JLONG outvalue;
 
   h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
   v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
@@ -158,19 +163,19 @@
    * efficient.
    */
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
-		    cinfo->image_width, output_cols * h_expand);
+                    cinfo->image_width, output_cols * h_expand);
 
   inrow = 0;
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     for (outcol = 0, outcol_h = 0; outcol < output_cols;
-	 outcol++, outcol_h += h_expand) {
+         outcol++, outcol_h += h_expand) {
       outvalue = 0;
       for (v = 0; v < v_expand; v++) {
-	inptr = input_data[inrow+v] + outcol_h;
-	for (h = 0; h < h_expand; h++) {
-	  outvalue += (INT32) GETJSAMPLE(*inptr++);
-	}
+        inptr = input_data[inrow+v] + outcol_h;
+        for (h = 0; h < h_expand; h++) {
+          outvalue += (JLONG) GETJSAMPLE(*inptr++);
+        }
       }
       *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
     }
@@ -186,15 +191,15 @@
  */
 
 METHODDEF(void)
-fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-		     JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   /* Copy the data */
   jcopy_sample_rows(input_data, 0, output_data, 0,
-		    cinfo->max_v_samp_factor, cinfo->image_width);
+                    cinfo->max_v_samp_factor, cinfo->image_width);
   /* Edge-expand */
   expand_right_edge(output_data, cinfo->max_v_samp_factor,
-		    cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+                    cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
 }
 
 
@@ -211,8 +216,8 @@
  */
 
 METHODDEF(void)
-h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-		 JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                 JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION outcol;
@@ -225,16 +230,16 @@
    * efficient.
    */
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
-		    cinfo->image_width, output_cols * 2);
+                    cinfo->image_width, output_cols * 2);
 
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr = input_data[outrow];
-    bias = 0;			/* bias = 0,1,0,1,... for successive samples */
+    bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
-			      + bias) >> 1);
-      bias ^= 1;		/* 0=>1, 1=>0 */
+                              + bias) >> 1);
+      bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
   }
@@ -248,8 +253,8 @@
  */
 
 METHODDEF(void)
-h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-		 JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                 JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION outcol;
@@ -262,19 +267,19 @@
    * efficient.
    */
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
-		    cinfo->image_width, output_cols * 2);
+                    cinfo->image_width, output_cols * 2);
 
   inrow = 0;
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
     inptr1 = input_data[inrow+1];
-    bias = 1;			/* bias = 1,2,1,2,... for successive samples */
+    bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-			      GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1])
-			      + bias) >> 2);
-      bias ^= 3;		/* 1=>2, 2=>1 */
+                              GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1])
+                              + bias) >> 2);
+      bias ^= 3;                /* 1=>2, 2=>1 */
       inptr0 += 2; inptr1 += 2;
     }
     inrow += 2;
@@ -291,21 +296,21 @@
  */
 
 METHODDEF(void)
-h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
-			JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION colctr;
   JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
   register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
-  INT32 membersum, neighsum, memberscale, neighscale;
+  JLONG membersum, neighsum, memberscale, neighscale;
 
   /* Expand input data enough to let all the output samples be generated
    * by the standard loop.  Special-casing padded output would be more
    * efficient.
    */
   expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2,
-		    cinfo->image_width, output_cols * 2);
+                    cinfo->image_width, output_cols * 2);
 
   /* We don't bother to form the individual "smoothed" input pixel values;
    * we can directly compute the output which is the average of the four
@@ -333,14 +338,14 @@
 
     /* Special case for first column: pretend column -1 is same as column 0 */
     membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-		GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
     neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-	       GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-	       GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
-	       GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
+               GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
+               GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
     neighsum += neighsum;
     neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
-		GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+                GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
     inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
@@ -348,17 +353,17 @@
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
       /* sum of pixels directly mapped to this output element */
       membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-		  GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+                  GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
       /* sum of edge-neighbor pixels */
       neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-		 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-		 GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
-		 GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+                 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
+                 GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
+                 GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
       /* The edge-neighbors count twice as much as corner-neighbors */
       neighsum += neighsum;
       /* Add in the corner-neighbors */
       neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
-		  GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+                  GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
@@ -368,14 +373,14 @@
 
     /* Special case for last column */
     membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-		GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
     neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-	       GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-	       GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
-	       GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
+               GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
+               GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
     neighsum += neighsum;
     neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
-		GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+                GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
 
@@ -392,13 +397,13 @@
 
 METHODDEF(void)
 fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-			    JSAMPARRAY input_data, JSAMPARRAY output_data)
+                            JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION colctr;
   JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
   register JSAMPROW inptr, above_ptr, below_ptr, outptr;
-  INT32 membersum, neighsum, memberscale, neighscale;
+  JLONG membersum, neighsum, memberscale, neighscale;
   int colsum, lastcolsum, nextcolsum;
 
   /* Expand input data enough to let all the output samples be generated
@@ -406,7 +411,7 @@
    * efficient.
    */
   expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2,
-		    cinfo->image_width, output_cols);
+                    cinfo->image_width, output_cols);
 
   /* Each of the eight neighbor pixels contributes a fraction SF to the
    * smoothed pixel, while the main pixel contributes (1-8*SF).  In order
@@ -425,10 +430,10 @@
 
     /* Special case for first column */
     colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
-	     GETJSAMPLE(*inptr);
+             GETJSAMPLE(*inptr);
     membersum = GETJSAMPLE(*inptr++);
     nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-		 GETJSAMPLE(*inptr);
+                 GETJSAMPLE(*inptr);
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
@@ -438,7 +443,7 @@
       membersum = GETJSAMPLE(*inptr++);
       above_ptr++; below_ptr++;
       nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-		   GETJSAMPLE(*inptr);
+                   GETJSAMPLE(*inptr);
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
       *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
@@ -467,12 +472,12 @@
 {
   my_downsample_ptr downsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   boolean smoothok = TRUE;
 
   downsample = (my_downsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_downsampler));
+                                sizeof(my_downsampler));
   cinfo->downsample = (struct jpeg_downsampler *) downsample;
   downsample->pub.start_pass = start_pass_downsample;
   downsample->pub.downsample = sep_downsample;
@@ -485,35 +490,42 @@
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     if (compptr->h_samp_factor == cinfo->max_h_samp_factor &&
-	compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+        compptr->v_samp_factor == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
       if (cinfo->smoothing_factor) {
-	downsample->methods[ci] = fullsize_smooth_downsample;
-	downsample->pub.need_context_rows = TRUE;
+        downsample->methods[ci] = fullsize_smooth_downsample;
+        downsample->pub.need_context_rows = TRUE;
       } else
 #endif
-	downsample->methods[ci] = fullsize_downsample;
+        downsample->methods[ci] = fullsize_downsample;
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+               compptr->v_samp_factor == cinfo->max_v_samp_factor) {
       smoothok = FALSE;
       if (jsimd_can_h2v1_downsample())
         downsample->methods[ci] = jsimd_h2v1_downsample;
       else
         downsample->methods[ci] = h2v1_downsample;
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
+               compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
       if (cinfo->smoothing_factor) {
-	downsample->methods[ci] = h2v2_smooth_downsample;
-	downsample->pub.need_context_rows = TRUE;
+#if defined(__mips__)
+        if (jsimd_can_h2v2_smooth_downsample())
+          downsample->methods[ci] = jsimd_h2v2_smooth_downsample;
+        else
+#endif
+          downsample->methods[ci] = h2v2_smooth_downsample;
+        downsample->pub.need_context_rows = TRUE;
       } else
 #endif
-	if (jsimd_can_h2v2_downsample())
-	  downsample->methods[ci] = jsimd_h2v2_downsample;
-	else
-	  downsample->methods[ci] = h2v2_downsample;
+      {
+        if (jsimd_can_h2v2_downsample())
+          downsample->methods[ci] = jsimd_h2v2_downsample;
+        else
+          downsample->methods[ci] = h2v2_downsample;
+      }
     } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
-	       (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
+               (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
       smoothok = FALSE;
       downsample->methods[ci] = int_downsample;
     } else
diff --git a/jcstest.c b/jcstest.c
new file mode 100644
index 0000000..358ed25
--- /dev/null
+++ b/jcstest.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This program demonstrates how to check for the colorspace extension
+   capabilities of libjpeg-turbo at both compile time and run time. */
+
+#include <stdio.h>
+#include <jpeglib.h>
+#include <jerror.h>
+#include <setjmp.h>
+
+#ifndef JCS_EXTENSIONS
+#define JCS_EXT_RGB 6
+#endif
+#if !defined(JCS_EXTENSIONS) || !defined(JCS_ALPHA_EXTENSIONS)
+#define JCS_EXT_RGBA 12
+#endif
+
+static char lasterror[JMSG_LENGTH_MAX] = "No error";
+
+typedef struct _error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf jb;
+} error_mgr;
+
+static void my_error_exit(j_common_ptr cinfo)
+{
+  error_mgr *myerr = (error_mgr *)cinfo->err;
+  (*cinfo->err->output_message)(cinfo);
+  longjmp(myerr->jb, 1);
+}
+
+static void my_output_message(j_common_ptr cinfo)
+{
+  (*cinfo->err->format_message)(cinfo, lasterror);
+}
+
+int main(void)
+{
+  int jcs_valid = -1, jcs_alpha_valid = -1;
+  struct jpeg_compress_struct cinfo;
+  error_mgr jerr;
+
+  printf("libjpeg-turbo colorspace extensions:\n");
+  #if JCS_EXTENSIONS
+  printf("  Present at compile time\n");
+  #else
+  printf("  Not present at compile time\n");
+  #endif
+
+  cinfo.err = jpeg_std_error(&jerr.pub);
+  jerr.pub.error_exit = my_error_exit;
+  jerr.pub.output_message = my_output_message;
+
+  if(setjmp(jerr.jb)) {
+    /* this will execute if libjpeg has an error */
+    jcs_valid = 0;
+    goto done;
+  }
+
+  jpeg_create_compress(&cinfo);
+  cinfo.input_components = 3;
+  jpeg_set_defaults(&cinfo);
+  cinfo.in_color_space = JCS_EXT_RGB;
+  jpeg_default_colorspace(&cinfo);
+  jcs_valid = 1;
+
+  done:
+  if (jcs_valid)
+    printf("  Working properly\n");
+  else
+    printf("  Not working properly.  Error returned was:\n    %s\n",
+           lasterror);
+
+  printf("libjpeg-turbo alpha colorspace extensions:\n");
+  #if JCS_ALPHA_EXTENSIONS
+  printf("  Present at compile time\n");
+  #else
+  printf("  Not present at compile time\n");
+  #endif
+
+  if(setjmp(jerr.jb)) {
+    /* this will execute if libjpeg has an error */
+    jcs_alpha_valid = 0;
+    goto done2;
+  }
+
+  cinfo.in_color_space = JCS_EXT_RGBA;
+  jpeg_default_colorspace(&cinfo);
+  jcs_alpha_valid = 1;
+
+  done2:
+  if (jcs_alpha_valid)
+    printf("  Working properly\n");
+  else
+    printf("  Not working properly.  Error returned was:\n    %s\n",
+           lasterror);
+
+  jpeg_destroy_compress(&cinfo);
+  return 0;
+}
diff --git a/jctrans.c b/jctrans.c
index 916e872..6f16b05 100644
--- a/jctrans.c
+++ b/jctrans.c
@@ -1,10 +1,13 @@
 /*
  * jctrans.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1998, Thomas G. Lane.
  * Modified 2000-2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains library routines for transcoding compression,
  * that is, writing raw DCT coefficient arrays to an output JPEG file.
@@ -18,9 +21,9 @@
 
 /* Forward declarations */
 LOCAL(void) transencode_master_selection
-	JPP((j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays));
+        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
 LOCAL(void) transencode_coef_controller
-	JPP((j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays));
+        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
 
 
 /*
@@ -36,7 +39,7 @@
  */
 
 GLOBAL(void)
-jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)
+jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -48,7 +51,7 @@
   /* Perform master selection of active modules */
   transencode_master_selection(cinfo, coef_arrays);
   /* Wait for jpeg_finish_compress() call */
-  cinfo->next_scanline = 0;	/* so jpeg_write_marker works */
+  cinfo->next_scanline = 0;     /* so jpeg_write_marker works */
   cinfo->global_state = CSTATE_WRCOEFS;
 }
 
@@ -62,9 +65,9 @@
 
 GLOBAL(void)
 jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
-			       j_compress_ptr dstinfo)
+                               j_compress_ptr dstinfo)
 {
-  JQUANT_TBL ** qtblptr;
+  JQUANT_TBL **qtblptr;
   jpeg_component_info *incomp, *outcomp;
   JQUANT_TBL *c_quant, *slot_quant;
   int tblno, ci, coefi;
@@ -96,10 +99,10 @@
     if (srcinfo->quant_tbl_ptrs[tblno] != NULL) {
       qtblptr = & dstinfo->quant_tbl_ptrs[tblno];
       if (*qtblptr == NULL)
-	*qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo);
+        *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo);
       MEMCOPY((*qtblptr)->quantval,
-	      srcinfo->quant_tbl_ptrs[tblno]->quantval,
-	      SIZEOF((*qtblptr)->quantval));
+              srcinfo->quant_tbl_ptrs[tblno]->quantval,
+              sizeof((*qtblptr)->quantval));
       (*qtblptr)->sent_table = FALSE;
     }
   }
@@ -109,7 +112,7 @@
   dstinfo->num_components = srcinfo->num_components;
   if (dstinfo->num_components < 1 || dstinfo->num_components > MAX_COMPONENTS)
     ERREXIT2(dstinfo, JERR_COMPONENT_COUNT, dstinfo->num_components,
-	     MAX_COMPONENTS);
+             MAX_COMPONENTS);
   for (ci = 0, incomp = srcinfo->comp_info, outcomp = dstinfo->comp_info;
        ci < dstinfo->num_components; ci++, incomp++, outcomp++) {
     outcomp->component_id = incomp->component_id;
@@ -122,14 +125,14 @@
      */
     tblno = outcomp->quant_tbl_no;
     if (tblno < 0 || tblno >= NUM_QUANT_TBLS ||
-	srcinfo->quant_tbl_ptrs[tblno] == NULL)
+        srcinfo->quant_tbl_ptrs[tblno] == NULL)
       ERREXIT1(dstinfo, JERR_NO_QUANT_TABLE, tblno);
     slot_quant = srcinfo->quant_tbl_ptrs[tblno];
     c_quant = incomp->quant_table;
     if (c_quant != NULL) {
       for (coefi = 0; coefi < DCTSIZE2; coefi++) {
-	if (c_quant->quantval[coefi] != slot_quant->quantval[coefi])
-	  ERREXIT1(dstinfo, JERR_MISMATCHED_QUANT_TABLE, tblno);
+        if (c_quant->quantval[coefi] != slot_quant->quantval[coefi])
+          ERREXIT1(dstinfo, JERR_MISMATCHED_QUANT_TABLE, tblno);
       }
     }
     /* Note: we do not copy the source's Huffman table assignments;
@@ -163,7 +166,7 @@
 
 LOCAL(void)
 transencode_master_selection (j_compress_ptr cinfo,
-			      jvirt_barray_ptr * coef_arrays)
+                              jvirt_barray_ptr *coef_arrays)
 {
   /* Although we don't actually use input_components for transcoding,
    * jcmaster.c's initial_setup will complain if input_components is 0.
@@ -219,19 +222,19 @@
 typedef struct {
   struct jpeg_c_coef_controller pub; /* public fields */
 
-  JDIMENSION iMCU_row_num;	/* iMCU row # within image */
-  JDIMENSION mcu_ctr;		/* counts MCUs processed in current row */
-  int MCU_vert_offset;		/* counts MCU rows within iMCU row */
-  int MCU_rows_per_iMCU_row;	/* number of such rows needed */
+  JDIMENSION iMCU_row_num;      /* iMCU row # within image */
+  JDIMENSION mcu_ctr;           /* counts MCUs processed in current row */
+  int MCU_vert_offset;          /* counts MCU rows within iMCU row */
+  int MCU_rows_per_iMCU_row;    /* number of such rows needed */
 
   /* Virtual block array for each component. */
-  jvirt_barray_ptr * whole_image;
+  jvirt_barray_ptr *whole_image;
 
   /* Workspace for constructing dummy blocks at right/bottom edges. */
   JBLOCKROW dummy_buffer[C_MAX_BLOCKS_IN_MCU];
 } my_coef_controller;
 
-typedef my_coef_controller * my_coef_ptr;
+typedef my_coef_controller *my_coef_ptr;
 
 
 LOCAL(void)
@@ -289,7 +292,7 @@
 compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
   my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-  JDIMENSION MCU_col_num;	/* index of current MCU within row */
+  JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   int blkn, ci, xindex, yindex, yoffset, blockcnt;
@@ -312,44 +315,44 @@
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
     for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row;
-	 MCU_col_num++) {
+         MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
-      blkn = 0;			/* index of current DCT block within MCU */
+      blkn = 0;                 /* index of current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	start_col = MCU_col_num * compptr->MCU_width;
-	blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-						: compptr->last_col_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  if (coef->iMCU_row_num < last_iMCU_row ||
-	      yindex+yoffset < compptr->last_row_height) {
-	    /* Fill in pointers to real blocks in this row */
-	    buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	    for (xindex = 0; xindex < blockcnt; xindex++)
-	      MCU_buffer[blkn++] = buffer_ptr++;
-	  } else {
-	    /* At bottom of image, need a whole row of dummy blocks */
-	    xindex = 0;
-	  }
-	  /* Fill in any dummy blocks needed in this row.
-	   * Dummy blocks are filled in the same way as in jccoefct.c:
-	   * all zeroes in the AC entries, DC entries equal to previous
-	   * block's DC value.  The init routine has already zeroed the
-	   * AC entries, so we need only set the DC entries correctly.
-	   */
-	  for (; xindex < compptr->MCU_width; xindex++) {
-	    MCU_buffer[blkn] = coef->dummy_buffer[blkn];
-	    MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0];
-	    blkn++;
-	  }
-	}
+        compptr = cinfo->cur_comp_info[ci];
+        start_col = MCU_col_num * compptr->MCU_width;
+        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
+                                                : compptr->last_col_width;
+        for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+          if (coef->iMCU_row_num < last_iMCU_row ||
+              yindex+yoffset < compptr->last_row_height) {
+            /* Fill in pointers to real blocks in this row */
+            buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+            for (xindex = 0; xindex < blockcnt; xindex++)
+              MCU_buffer[blkn++] = buffer_ptr++;
+          } else {
+            /* At bottom of image, need a whole row of dummy blocks */
+            xindex = 0;
+          }
+          /* Fill in any dummy blocks needed in this row.
+           * Dummy blocks are filled in the same way as in jccoefct.c:
+           * all zeroes in the AC entries, DC entries equal to previous
+           * block's DC value.  The init routine has already zeroed the
+           * AC entries, so we need only set the DC entries correctly.
+           */
+          for (; xindex < compptr->MCU_width; xindex++) {
+            MCU_buffer[blkn] = coef->dummy_buffer[blkn];
+            MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0];
+            blkn++;
+          }
+        }
       }
       /* Try to write the MCU. */
       if (! (*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->mcu_ctr = MCU_col_num;
-	return FALSE;
+        /* Suspension forced; update state counters and exit */
+        coef->MCU_vert_offset = yoffset;
+        coef->mcu_ctr = MCU_col_num;
+        return FALSE;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -372,7 +375,7 @@
 
 LOCAL(void)
 transencode_coef_controller (j_compress_ptr cinfo,
-			     jvirt_barray_ptr * coef_arrays)
+                             jvirt_barray_ptr *coef_arrays)
 {
   my_coef_ptr coef;
   JBLOCKROW buffer;
@@ -380,7 +383,7 @@
 
   coef = (my_coef_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_coef_controller));
+                                sizeof(my_coef_controller));
   cinfo->coef = (struct jpeg_c_coef_controller *) coef;
   coef->pub.start_pass = start_pass_coef;
   coef->pub.compress_data = compress_output;
@@ -391,8 +394,8 @@
   /* Allocate and pre-zero space for dummy DCT blocks. */
   buffer = (JBLOCKROW)
     (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-  jzero_far((void FAR *) buffer, C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
+                                C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
+  jzero_far((void *) buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
   for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
     coef->dummy_buffer[i] = buffer + i;
   }
diff --git a/jdapimin.c b/jdapimin.c
index cadb59f..f80a146 100644
--- a/jdapimin.c
+++ b/jdapimin.c
@@ -1,9 +1,12 @@
 /*
  * jdapimin.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the decompression half
  * of the JPEG library.  These are the "minimum" API routines that may be
@@ -19,6 +22,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jdmaster.h"
 
 
 /*
@@ -32,12 +36,12 @@
   int i;
 
   /* Guard against version mismatches between library and caller. */
-  cinfo->mem = NULL;		/* so jpeg_destroy knows mem mgr not called */
+  cinfo->mem = NULL;            /* so jpeg_destroy knows mem mgr not called */
   if (version != JPEG_LIB_VERSION)
     ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
-  if (structsize != SIZEOF(struct jpeg_decompress_struct))
-    ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, 
-	     (int) SIZEOF(struct jpeg_decompress_struct), (int) structsize);
+  if (structsize != sizeof(struct jpeg_decompress_struct))
+    ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
+             (int) sizeof(struct jpeg_decompress_struct), (int) structsize);
 
   /* For debugging purposes, we zero the whole master structure.
    * But the application has already set the err pointer, and may have set
@@ -48,7 +52,7 @@
   {
     struct jpeg_error_mgr * err = cinfo->err;
     void * client_data = cinfo->client_data; /* ignore Purify complaint here */
-    MEMZERO(cinfo, SIZEOF(struct jpeg_decompress_struct));
+    MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
   }
@@ -80,6 +84,14 @@
 
   /* OK, I'm ready */
   cinfo->global_state = DSTATE_START;
+
+  /* The master struct is used to store extension parameters, so we allocate it
+   * here.
+   */
+  cinfo->master = (struct jpeg_decomp_master *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_decomp_master));
+  MEMZERO(cinfo->master, sizeof(my_decomp_master));
 }
 
 
@@ -121,22 +133,22 @@
     cinfo->jpeg_color_space = JCS_GRAYSCALE;
     cinfo->out_color_space = JCS_GRAYSCALE;
     break;
-    
+
   case 3:
     if (cinfo->saw_JFIF_marker) {
       cinfo->jpeg_color_space = JCS_YCbCr; /* JFIF implies YCbCr */
     } else if (cinfo->saw_Adobe_marker) {
       switch (cinfo->Adobe_transform) {
       case 0:
-	cinfo->jpeg_color_space = JCS_RGB;
-	break;
+        cinfo->jpeg_color_space = JCS_RGB;
+        break;
       case 1:
-	cinfo->jpeg_color_space = JCS_YCbCr;
-	break;
+        cinfo->jpeg_color_space = JCS_YCbCr;
+        break;
       default:
-	WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
-	cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */
-	break;
+        WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
+        cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */
+        break;
       }
     } else {
       /* Saw no special markers, try to guess from the component IDs */
@@ -145,31 +157,31 @@
       int cid2 = cinfo->comp_info[2].component_id;
 
       if (cid0 == 1 && cid1 == 2 && cid2 == 3)
-	cinfo->jpeg_color_space = JCS_YCbCr; /* assume JFIF w/out marker */
+        cinfo->jpeg_color_space = JCS_YCbCr; /* assume JFIF w/out marker */
       else if (cid0 == 82 && cid1 == 71 && cid2 == 66)
-	cinfo->jpeg_color_space = JCS_RGB; /* ASCII 'R', 'G', 'B' */
+        cinfo->jpeg_color_space = JCS_RGB; /* ASCII 'R', 'G', 'B' */
       else {
-	TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, cid0, cid1, cid2);
-	cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */
+        TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, cid0, cid1, cid2);
+        cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */
       }
     }
     /* Always guess RGB is proper output colorspace. */
     cinfo->out_color_space = JCS_RGB;
     break;
-    
+
   case 4:
     if (cinfo->saw_Adobe_marker) {
       switch (cinfo->Adobe_transform) {
       case 0:
-	cinfo->jpeg_color_space = JCS_CMYK;
-	break;
+        cinfo->jpeg_color_space = JCS_CMYK;
+        break;
       case 2:
-	cinfo->jpeg_color_space = JCS_YCCK;
-	break;
+        cinfo->jpeg_color_space = JCS_YCCK;
+        break;
       default:
-	WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
-	cinfo->jpeg_color_space = JCS_YCCK; /* assume it's YCCK */
-	break;
+        WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
+        cinfo->jpeg_color_space = JCS_YCCK; /* assume it's YCCK */
+        break;
       }
     } else {
       /* No special markers, assume straight CMYK. */
@@ -177,7 +189,7 @@
     }
     cinfo->out_color_space = JCS_CMYK;
     break;
-    
+
   default:
     cinfo->jpeg_color_space = JCS_UNKNOWN;
     cinfo->out_color_space = JCS_UNKNOWN;
@@ -185,7 +197,7 @@
   }
 
   /* Set defaults for other decompression parameters. */
-  cinfo->scale_num = 1;		/* 1:1 scaling */
+  cinfo->scale_num = 1;         /* 1:1 scaling */
   cinfo->scale_denom = 1;
   cinfo->output_gamma = 1.0;
   cinfo->buffered_image = FALSE;
@@ -253,7 +265,7 @@
     retcode = JPEG_HEADER_OK;
     break;
   case JPEG_REACHED_EOI:
-    if (require_image)		/* Complain if application wanted an image */
+    if (require_image)          /* Complain if application wanted an image */
       ERREXIT(cinfo, JERR_NO_IMAGE);
     /* Reset to start state; it would be safer to require the application to
      * call jpeg_abort, but we can't change it now for compatibility reasons.
@@ -385,7 +397,7 @@
   /* Read until EOI */
   while (! cinfo->inputctl->eoi_reached) {
     if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
-      return FALSE;		/* Suspend, come back later */
+      return FALSE;             /* Suspend, come back later */
   }
   /* Do final cleanup */
   (*cinfo->src->term_source) (cinfo);
diff --git a/jdapistd.c b/jdapistd.c
index 056fa1d..37afc84 100644
--- a/jdapistd.c
+++ b/jdapistd.c
@@ -4,9 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains application interface code for the decompression half
  * of the JPEG library.  These are the "standard" API routines that are
@@ -17,14 +18,14 @@
  * whole decompression library into a transcoder.
  */
 
+#include "jinclude.h"
 #include "jdmainct.h"
 #include "jdcoefct.h"
 #include "jdsample.h"
 #include "jmemsys.h"
 
-
 /* Forward declarations */
-LOCAL(boolean) output_pass_setup JPP((j_decompress_ptr cinfo));
+LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
 
 
 /*
@@ -56,24 +57,24 @@
     if (cinfo->inputctl->has_multiple_scans) {
 #ifdef D_MULTISCAN_FILES_SUPPORTED
       for (;;) {
-	int retcode;
-	/* Call progress monitor hook if present */
-	if (cinfo->progress != NULL)
-	  (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
-	/* Absorb some more input */
-	retcode = (*cinfo->inputctl->consume_input) (cinfo);
-	if (retcode == JPEG_SUSPENDED)
-	  return FALSE;
-	if (retcode == JPEG_REACHED_EOI)
-	  break;
-	/* Advance progress counter if appropriate */
-	if (cinfo->progress != NULL &&
-	    (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
-	  if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
-	    /* jdmaster underestimated number of scans; ratchet up one scan */
-	    cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
-	  }
-	}
+        int retcode;
+        /* Call progress monitor hook if present */
+        if (cinfo->progress != NULL)
+          (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        /* Absorb some more input */
+        retcode = (*cinfo->inputctl->consume_input) (cinfo);
+        if (retcode == JPEG_SUSPENDED)
+          return FALSE;
+        if (retcode == JPEG_REACHED_EOI)
+          break;
+        /* Advance progress counter if appropriate */
+        if (cinfo->progress != NULL &&
+            (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
+          if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
+            /* jdmaster underestimated number of scans; ratchet up one scan */
+            cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+          }
+        }
       }
 #else
       ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -112,16 +113,16 @@
       JDIMENSION last_scanline;
       /* Call progress monitor hook if present */
       if (cinfo->progress != NULL) {
-	cinfo->progress->pass_counter = (long) cinfo->output_scanline;
-	cinfo->progress->pass_limit = (long) cinfo->output_height;
-	(*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        cinfo->progress->pass_counter = (long) cinfo->output_scanline;
+        cinfo->progress->pass_limit = (long) cinfo->output_height;
+        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
       }
       /* Process some data */
       last_scanline = cinfo->output_scanline;
       (*cinfo->main->process_data) (cinfo, (JSAMPARRAY) NULL,
-				    &cinfo->output_scanline, (JDIMENSION) 0);
+                                    &cinfo->output_scanline, (JDIMENSION) 0);
       if (cinfo->output_scanline == last_scanline)
-	return FALSE;		/* No progress made, must suspend */
+        return FALSE;           /* No progress made, must suspend */
     }
     /* Finish up dummy pass, and set up for another one */
     (*cinfo->master->finish_output_pass) (cinfo);
@@ -140,6 +141,110 @@
 
 
 /*
+ * Enable partial scanline decompression
+ *
+ * Must be called after jpeg_start_decompress() and before any calls to
+ * jpeg_read_scanlines() or jpeg_skip_scanlines().
+ *
+ * Refer to libjpeg.txt for more information.
+ */
+
+GLOBAL(void)
+jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                    JDIMENSION *width)
+{
+  int ci, align, orig_downsampled_width;
+  JDIMENSION input_xoffset;
+  boolean reinit_upsampler = FALSE;
+  jpeg_component_info *compptr;
+
+  if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  if (!xoffset || !width)
+    ERREXIT(cinfo, JERR_BAD_CROP_SPEC);
+
+  /* xoffset and width must fall within the output image dimensions.  Do not
+   * add them before comparing: their sum can overflow, which would let an
+   * out-of-range request slip past the check.
+   */
+  if (*width == 0 || *xoffset >= cinfo->output_width ||
+      *width > cinfo->output_width - *xoffset)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+
+  /* No need to do anything if the caller wants the entire width. */
+  if (*width == cinfo->output_width)
+    return;
+
+  /* Ensuring the proper alignment of xoffset is tricky.  At minimum, it
+   * must align with an MCU boundary, because:
+   *
+   *   (1) The IDCT is performed in blocks, and it is not feasible to modify
+   *       the algorithm so that it can transform partial blocks.
+   *   (2) Because of the SIMD extensions, any input buffer passed to the
+   *       upsampling and color conversion routines must be aligned to the
+   *       SIMD word size (for instance, 128-bit in the case of SSE2.)  The
+   *       easiest way to accomplish this without copying data is to ensure
+   *       that upsampling and color conversion begin at the start of the
+   *       first MCU column that will be inverse transformed.
+   *
+   * In practice, we actually impose a stricter alignment requirement.  We
+   * require that xoffset be a multiple of the maximum MCU column width of all
+   * of the components (the "iMCU column width.")  This is to simplify the
+   * single-pass decompression case, allowing us to use the same MCU column
+   * width for all of the components.
+   */
+  align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
+
+  /* Adjust xoffset to the nearest iMCU boundary <= the requested value */
+  input_xoffset = *xoffset;
+  *xoffset = (input_xoffset / align) * align;
+
+  /* Adjust the width so that the right edge of the output image is as
+   * requested (only the left edge is altered.)  It is important that calling
+   * programs check this value after this function returns, so that they can
+   * allocate an output buffer with the appropriate size.
+   */
+  *width = *width + input_xoffset - *xoffset;
+  cinfo->output_width = *width;
+
+  /* First and last iMCU columns to decompress (single-scan decompression) */
+  cinfo->master->first_iMCU_col =
+    (JDIMENSION) (long) (*xoffset) / (long) align;
+  cinfo->master->last_iMCU_col =
+    (JDIMENSION) jdiv_round_up((long) (*xoffset + cinfo->output_width),
+                               (long) align) - 1;
+
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    /* Set downsampled_width to the new output width. */
+    orig_downsampled_width = compptr->downsampled_width;
+    compptr->downsampled_width =
+      (JDIMENSION) jdiv_round_up((long) (cinfo->output_width *
+                                         compptr->h_samp_factor),
+                                 (long) cinfo->max_h_samp_factor);
+    if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
+      reinit_upsampler = TRUE;
+
+    /* First and last MCU columns of this component (multi-scan case) */
+    cinfo->master->first_MCU_col[ci] =
+      (JDIMENSION) (long) (*xoffset * compptr->h_samp_factor) /
+                   (long) align;
+    cinfo->master->last_MCU_col[ci] =
+      (JDIMENSION) jdiv_round_up((long) ((*xoffset + cinfo->output_width) *
+                                         compptr->h_samp_factor),
+                                 (long) align) - 1;
+  }
+
+  if (reinit_upsampler) {
+    cinfo->master->jinit_upsampler_no_alloc = TRUE;
+    jinit_upsampler(cinfo);
+    cinfo->master->jinit_upsampler_no_alloc = FALSE;
+  }
+}
+
+
+/*
  * Read some scanlines of data from the JPEG decompressor.
  *
  * The return value will be the number of lines actually read.
@@ -154,7 +259,7 @@
 
 GLOBAL(JDIMENSION)
 jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
-		     JDIMENSION max_lines)
+                     JDIMENSION max_lines)
 {
   JDIMENSION row_ctr;
 
@@ -180,7 +285,6 @@
 }
 
 
-
 /* Dummy color convert function used by jpeg_skip_scanlines() */
 LOCAL(void)
 noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
@@ -196,6 +300,7 @@
  * we set up and tear down a dummy color converter in order to avoid valgrind
  * errors and to achieve the best possible performance.
  */
+
 LOCAL(void)
 read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
@@ -213,6 +318,7 @@
   cinfo->cconvert->color_convert = color_convert;
 }
 
+
 /*
  * Called by jpeg_skip_scanlines().  This partially skips a decompress block by
  * incrementing the rowgroup counter.
@@ -416,7 +522,7 @@
 
 GLOBAL(JDIMENSION)
 jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
-		    JDIMENSION max_lines)
+                    JDIMENSION max_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
@@ -441,7 +547,7 @@
 
   /* Decompress directly into user's buffer. */
   if (! (*cinfo->coef->decompress_data) (cinfo, data))
-    return 0;			/* suspension forced, can do nothing more */
+    return 0;                   /* suspension forced, can do nothing more */
 
   /* OK, we processed one iMCU row. */
   cinfo->output_scanline += lines_per_iMCU_row;
@@ -497,9 +603,9 @@
   }
   /* Read markers looking for SOS or EOI */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
-	 ! cinfo->inputctl->eoi_reached) {
+         ! cinfo->inputctl->eoi_reached) {
     if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
-      return FALSE;		/* Suspend, come back later */
+      return FALSE;             /* Suspend, come back later */
   }
   cinfo->global_state = DSTATE_BUFIMAGE;
   return TRUE;
diff --git a/jdarith.c b/jdarith.c
index 9b29728..98d5fad 100644
--- a/jdarith.c
+++ b/jdarith.c
@@ -1,11 +1,12 @@
 /*
  * jdarith.c
  *
- * This file is part of the Independent JPEG Group's software:
- * Developed 1997-2009 by Guido Vollbeding.
+ * This file was part of the Independent JPEG Group's software:
+ * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains portable arithmetic entropy decoding routines for JPEG
  * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
@@ -25,8 +26,8 @@
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
-  INT32 c;       /* C register, base of coding interval + input bit buffer */
-  INT32 a;               /* A register, normalized size of coding interval */
+  JLONG c;       /* C register, base of coding interval + input bit buffer */
+  JLONG a;               /* A register, normalized size of coding interval */
   int ct;     /* bit shift counter, # of bits left in bit buffer part of C */
                                                          /* init: ct = -16 */
                                                          /* run: ct = 0..7 */
@@ -34,17 +35,17 @@
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
   int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
 
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  unsigned int restarts_to_go;  /* MCUs left in this restart interval */
 
   /* Pointers to statistics areas (these workspaces have image lifespan) */
-  unsigned char * dc_stats[NUM_ARITH_TBLS];
-  unsigned char * ac_stats[NUM_ARITH_TBLS];
+  unsigned char *dc_stats[NUM_ARITH_TBLS];
+  unsigned char *ac_stats[NUM_ARITH_TBLS];
 
   /* Statistics bin for coding with fixed probability 0.5 */
   unsigned char fixed_bin[4];
 } arith_entropy_decoder;
 
-typedef arith_entropy_decoder * arith_entropy_ptr;
+typedef arith_entropy_decoder *arith_entropy_ptr;
 
 /* The following two definitions specify the allocation chunk size
  * for the statistics area.
@@ -67,7 +68,7 @@
 get_byte (j_decompress_ptr cinfo)
 /* Read next input byte; we do not support suspension in this module. */
 {
-  struct jpeg_source_mgr * src = cinfo->src;
+  struct jpeg_source_mgr *src = cinfo->src;
 
   if (src->bytes_in_buffer == 0)
     if (! (*src->fill_input_buffer) (cinfo))
@@ -96,7 +97,7 @@
  * (instead of fixed) with the bit shift counter CT.
  * Thus, we also need only one (variable instead of
  * fixed size) shift for the LPS/MPS decision, and
- * we can get away with any renormalization update
+ * we can do away with any renormalization update
  * of C (except for new data insertion, of course).
  *
  * I've also introduced a new scheme for accessing
@@ -109,7 +110,7 @@
 {
   register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
   register unsigned char nl, nm;
-  register INT32 qe, temp;
+  register JLONG qe, temp;
   register int sv, data;
 
   /* Renormalization & data input per section D.2.6 */
@@ -117,32 +118,32 @@
     if (--e->ct < 0) {
       /* Need to fetch next data byte */
       if (cinfo->unread_marker)
-	data = 0;		/* stuff zero data */
+        data = 0;               /* stuff zero data */
       else {
-	data = get_byte(cinfo);	/* read next input byte */
-	if (data == 0xFF) {	/* zero stuff or marker code */
-	  do data = get_byte(cinfo);
-	  while (data == 0xFF);	/* swallow extra 0xFF bytes */
-	  if (data == 0)
-	    data = 0xFF;	/* discard stuffed zero byte */
-	  else {
-	    /* Note: Different from the Huffman decoder, hitting
-	     * a marker while processing the compressed data
-	     * segment is legal in arithmetic coding.
-	     * The convention is to supply zero data
-	     * then until decoding is complete.
-	     */
-	    cinfo->unread_marker = data;
-	    data = 0;
-	  }
-	}
+        data = get_byte(cinfo); /* read next input byte */
+        if (data == 0xFF) {     /* zero stuff or marker code */
+          do data = get_byte(cinfo);
+          while (data == 0xFF); /* swallow extra 0xFF bytes */
+          if (data == 0)
+            data = 0xFF;        /* discard stuffed zero byte */
+          else {
+            /* Note: Different from the Huffman decoder, hitting
+             * a marker while processing the compressed data
+             * segment is legal in arithmetic coding.
+             * The convention is to supply zero data
+             * then until decoding is complete.
+             */
+            cinfo->unread_marker = data;
+            data = 0;
+          }
+        }
       }
       e->c = (e->c << 8) | data; /* insert data into C register */
-      if ((e->ct += 8) < 0)	 /* update bit shift counter */
-	/* Need more initial bytes */
-	if (++e->ct == 0)
-	  /* Got 2 initial bytes -> re-init A and exit loop */
-	  e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
+      if ((e->ct += 8) < 0)      /* update bit shift counter */
+        /* Need more initial bytes */
+        if (++e->ct == 0)
+          /* Got 2 initial bytes -> re-init A and exit loop */
+          e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
     }
     e->a <<= 1;
   }
@@ -151,9 +152,9 @@
    * Qe values and probability estimation state machine
    */
   sv = *st;
-  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
-  nl = (unsigned char) qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
-  nm = (unsigned char) qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+  qe = jpeg_aritab[sv & 0x7F];  /* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;     /* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;     /* Next_Index_MPS */
 
   /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
   temp = e->a - qe;
@@ -164,19 +165,19 @@
     /* Conditional LPS (less probable symbol) exchange */
     if (e->a < qe) {
       e->a = qe;
-      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+      *st = (sv & 0x80) ^ nm;   /* Estimate_after_MPS */
     } else {
       e->a = qe;
-      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
-      sv ^= 0x80;		/* Exchange LPS/MPS */
+      *st = (sv & 0x80) ^ nl;   /* Estimate_after_LPS */
+      sv ^= 0x80;               /* Exchange LPS/MPS */
     }
   } else if (e->a < 0x8000L) {
     /* Conditional MPS (more probable symbol) exchange */
     if (e->a < qe) {
-      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
-      sv ^= 0x80;		/* Exchange LPS/MPS */
+      *st = (sv & 0x80) ^ nl;   /* Estimate_after_LPS */
+      sv ^= 0x80;               /* Exchange LPS/MPS */
     } else {
-      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+      *st = (sv & 0x80) ^ nm;   /* Estimate_after_MPS */
     }
   }
 
@@ -193,7 +194,7 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   /* Advance past the RSTn marker */
   if (! (*cinfo->marker->read_restart_marker) (cinfo))
@@ -202,13 +203,13 @@
   /* Re-initialize statistics areas */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
-    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+    if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
       MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
       /* Reset DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
-    if (! cinfo->progressive_mode || cinfo->Ss) {
+    if (!cinfo->progressive_mode || cinfo->Ss) {
       MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
     }
   }
@@ -216,7 +217,7 @@
   /* Reset arithmetic decoding variables */
   entropy->c = 0;
   entropy->a = 0;
-  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+  entropy->ct = -16;    /* force reading 2 initial bytes to fill C */
 
   /* Reset restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -255,7 +256,7 @@
     entropy->restarts_to_go--;
   }
 
-  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+  if (entropy->ct == -1) return TRUE;   /* if error do nothing */
 
   /* Outer loop handles each block in the MCU */
 
@@ -279,34 +280,34 @@
       st += 2; st += sign;
       /* Figure F.23: Decoding the magnitude category of v */
       if ((m = arith_decode(cinfo, st)) != 0) {
-	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
-	while (arith_decode(cinfo, st)) {
-	  if ((m <<= 1) == 0x8000) {
-	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
-	    entropy->ct = -1;			/* magnitude overflow */
-	    return TRUE;
-	  }
-	  st += 1;
-	}
+        st = entropy->dc_stats[tbl] + 20;       /* Table F.4: X1 = 20 */
+        while (arith_decode(cinfo, st)) {
+          if ((m <<= 1) == 0x8000) {
+            WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+            entropy->ct = -1;                   /* magnitude overflow */
+            return TRUE;
+          }
+          st += 1;
+        }
       }
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
       if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
-	entropy->dc_context[ci] = 0;		   /* zero diff category */
+        entropy->dc_context[ci] = 0;               /* zero diff category */
       else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
-	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+        entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
       else
-	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+        entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
       v = m;
       /* Figure F.24: Decoding the magnitude bit pattern of v */
       st += 14;
       while (m >>= 1)
-	if (arith_decode(cinfo, st)) v |= m;
+        if (arith_decode(cinfo, st)) v |= m;
       v += 1; if (sign) v = -v;
       entropy->last_dc_val[ci] += v;
     }
 
     /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
-    (*block)[0] = (JCOEF) (entropy->last_dc_val[ci] << cinfo->Al);
+    (*block)[0] = (JCOEF) LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
   }
 
   return TRUE;
@@ -334,7 +335,7 @@
     entropy->restarts_to_go--;
   }
 
-  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+  if (entropy->ct == -1) return TRUE;   /* if error do nothing */
 
   /* There is always only one block per MCU */
   block = MCU_data[0];
@@ -345,13 +346,13 @@
   /* Figure F.20: Decode_AC_coefficients */
   for (k = cinfo->Ss; k <= cinfo->Se; k++) {
     st = entropy->ac_stats[tbl] + 3 * (k - 1);
-    if (arith_decode(cinfo, st)) break;		/* EOB flag */
+    if (arith_decode(cinfo, st)) break;         /* EOB flag */
     while (arith_decode(cinfo, st + 1) == 0) {
       st += 3; k++;
       if (k > cinfo->Se) {
-	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
-	entropy->ct = -1;			/* spectral overflow */
-	return TRUE;
+        WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+        entropy->ct = -1;                       /* spectral overflow */
+        return TRUE;
       }
     }
     /* Figure F.21: Decoding nonzero value v */
@@ -361,17 +362,17 @@
     /* Figure F.23: Decoding the magnitude category of v */
     if ((m = arith_decode(cinfo, st)) != 0) {
       if (arith_decode(cinfo, st)) {
-	m <<= 1;
-	st = entropy->ac_stats[tbl] +
-	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
-	while (arith_decode(cinfo, st)) {
-	  if ((m <<= 1) == 0x8000) {
-	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
-	    entropy->ct = -1;			/* magnitude overflow */
-	    return TRUE;
-	  }
-	  st += 1;
-	}
+        m <<= 1;
+        st = entropy->ac_stats[tbl] +
+             (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+        while (arith_decode(cinfo, st)) {
+          if ((m <<= 1) == 0x8000) {
+            WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+            entropy->ct = -1;                   /* magnitude overflow */
+            return TRUE;
+          }
+          st += 1;
+        }
       }
     }
     v = m;
@@ -406,8 +407,8 @@
     entropy->restarts_to_go--;
   }
 
-  st = entropy->fixed_bin;	/* use fixed probability estimation */
-  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+  st = entropy->fixed_bin;      /* use fixed probability estimation */
+  p1 = 1 << cinfo->Al;          /* 1 in the bit position being coded */
 
   /* Outer loop handles each block in the MCU */
 
@@ -442,14 +443,14 @@
     entropy->restarts_to_go--;
   }
 
-  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+  if (entropy->ct == -1) return TRUE;   /* if error do nothing */
 
   /* There is always only one block per MCU */
   block = MCU_data[0];
   tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
 
-  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
-  m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+  p1 = 1 << cinfo->Al;          /* 1 in the bit position being coded */
+  m1 = (-1) << cinfo->Al;       /* -1 in the bit position being coded */
 
   /* Establish EOBx (previous stage end-of-block) index */
   for (kex = cinfo->Se; kex > 0; kex--)
@@ -458,30 +459,30 @@
   for (k = cinfo->Ss; k <= cinfo->Se; k++) {
     st = entropy->ac_stats[tbl] + 3 * (k - 1);
     if (k > kex)
-      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+      if (arith_decode(cinfo, st)) break;       /* EOB flag */
     for (;;) {
       thiscoef = *block + jpeg_natural_order[k];
-      if (*thiscoef) {				/* previously nonzero coef */
-	if (arith_decode(cinfo, st + 2)) {
-	  if (*thiscoef < 0)
-	    *thiscoef += m1;
-	  else
-	    *thiscoef += p1;
-	}
-	break;
+      if (*thiscoef) {                          /* previously nonzero coef */
+        if (arith_decode(cinfo, st + 2)) {
+          if (*thiscoef < 0)
+            *thiscoef += m1;
+          else
+            *thiscoef += p1;
+        }
+        break;
       }
-      if (arith_decode(cinfo, st + 1)) {	/* newly nonzero coef */
-	if (arith_decode(cinfo, entropy->fixed_bin))
-	  *thiscoef = m1;
-	else
-	  *thiscoef = p1;
-	break;
+      if (arith_decode(cinfo, st + 1)) {        /* newly nonzero coef */
+        if (arith_decode(cinfo, entropy->fixed_bin))
+          *thiscoef = m1;
+        else
+          *thiscoef = p1;
+        break;
       }
       st += 3; k++;
       if (k > cinfo->Se) {
-	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
-	entropy->ct = -1;			/* spectral overflow */
-	return TRUE;
+        WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+        entropy->ct = -1;                       /* spectral overflow */
+        return TRUE;
       }
     }
   }
@@ -498,7 +499,7 @@
 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl, sign, k;
@@ -511,7 +512,7 @@
     entropy->restarts_to_go--;
   }
 
-  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+  if (entropy->ct == -1) return TRUE;   /* if error do nothing */
 
   /* Outer loop handles each block in the MCU */
 
@@ -537,28 +538,28 @@
       st += 2; st += sign;
       /* Figure F.23: Decoding the magnitude category of v */
       if ((m = arith_decode(cinfo, st)) != 0) {
-	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
-	while (arith_decode(cinfo, st)) {
-	  if ((m <<= 1) == 0x8000) {
-	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
-	    entropy->ct = -1;			/* magnitude overflow */
-	    return TRUE;
-	  }
-	  st += 1;
-	}
+        st = entropy->dc_stats[tbl] + 20;       /* Table F.4: X1 = 20 */
+        while (arith_decode(cinfo, st)) {
+          if ((m <<= 1) == 0x8000) {
+            WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+            entropy->ct = -1;                   /* magnitude overflow */
+            return TRUE;
+          }
+          st += 1;
+        }
       }
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
       if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
-	entropy->dc_context[ci] = 0;		   /* zero diff category */
+        entropy->dc_context[ci] = 0;               /* zero diff category */
       else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
-	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+        entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
       else
-	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+        entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
       v = m;
       /* Figure F.24: Decoding the magnitude bit pattern of v */
       st += 14;
       while (m >>= 1)
-	if (arith_decode(cinfo, st)) v |= m;
+        if (arith_decode(cinfo, st)) v |= m;
       v += 1; if (sign) v = -v;
       entropy->last_dc_val[ci] += v;
     }
@@ -573,14 +574,14 @@
     /* Figure F.20: Decode_AC_coefficients */
     for (k = 1; k <= DCTSIZE2 - 1; k++) {
       st = entropy->ac_stats[tbl] + 3 * (k - 1);
-      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+      if (arith_decode(cinfo, st)) break;       /* EOB flag */
       while (arith_decode(cinfo, st + 1) == 0) {
-	st += 3; k++;
-	if (k > DCTSIZE2 - 1) {
-	  WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
-	  entropy->ct = -1;			/* spectral overflow */
-	  return TRUE;
-	}
+        st += 3; k++;
+        if (k > DCTSIZE2 - 1) {
+          WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+          entropy->ct = -1;                     /* spectral overflow */
+          return TRUE;
+        }
       }
       /* Figure F.21: Decoding nonzero value v */
       /* Figure F.22: Decoding the sign of v */
@@ -588,25 +589,25 @@
       st += 2;
       /* Figure F.23: Decoding the magnitude category of v */
       if ((m = arith_decode(cinfo, st)) != 0) {
-	if (arith_decode(cinfo, st)) {
-	  m <<= 1;
-	  st = entropy->ac_stats[tbl] +
-	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
-	  while (arith_decode(cinfo, st)) {
-	    if ((m <<= 1) == 0x8000) {
-	      WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
-	      entropy->ct = -1;			/* magnitude overflow */
-	      return TRUE;
-	    }
-	    st += 1;
-	  }
-	}
+        if (arith_decode(cinfo, st)) {
+          m <<= 1;
+          st = entropy->ac_stats[tbl] +
+               (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+          while (arith_decode(cinfo, st)) {
+            if ((m <<= 1) == 0x8000) {
+              WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+              entropy->ct = -1;                 /* magnitude overflow */
+              return TRUE;
+            }
+            st += 1;
+          }
+        }
       }
       v = m;
       /* Figure F.24: Decoding the magnitude bit pattern of v */
       st += 14;
       while (m >>= 1)
-	if (arith_decode(cinfo, st)) v |= m;
+        if (arith_decode(cinfo, st)) v |= m;
       v += 1; if (sign) v = -v;
       if (block)
         (*block)[jpeg_natural_order[k]] = (JCOEF) v;
@@ -626,30 +627,30 @@
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   int ci, tbl;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   if (cinfo->progressive_mode) {
     /* Validate progressive scan parameters */
     if (cinfo->Ss == 0) {
       if (cinfo->Se != 0)
-	goto bad;
+        goto bad;
     } else {
       /* need not check Ss/Se < 0 since they came from unsigned bytes */
       if (cinfo->Se < cinfo->Ss || cinfo->Se > DCTSIZE2 - 1)
-	goto bad;
+        goto bad;
       /* AC scans may have only one component */
       if (cinfo->comps_in_scan != 1)
-	goto bad;
+        goto bad;
     }
     if (cinfo->Ah != 0) {
       /* Successive approximation refinement scan: must have Al = Ah-1. */
       if (cinfo->Ah-1 != cinfo->Al)
-	goto bad;
+        goto bad;
     }
-    if (cinfo->Al > 13) {	/* need not check for < 0 */
+    if (cinfo->Al > 13) {       /* need not check for < 0 */
       bad:
       ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
-	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+               cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
     }
     /* Update progression status, and verify that scan order is legal.
      * Note that inter-scan inconsistencies are treated as warnings
@@ -659,32 +660,32 @@
       int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
       int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
       if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
-	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+        WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
       for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
-	int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
-	if (cinfo->Ah != expected)
-	  WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
-	coef_bit_ptr[coefi] = cinfo->Al;
+        int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+        if (cinfo->Ah != expected)
+          WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+        coef_bit_ptr[coefi] = cinfo->Al;
       }
     }
     /* Select MCU decoding routine */
     if (cinfo->Ah == 0) {
       if (cinfo->Ss == 0)
-	entropy->pub.decode_mcu = decode_mcu_DC_first;
+        entropy->pub.decode_mcu = decode_mcu_DC_first;
       else
-	entropy->pub.decode_mcu = decode_mcu_AC_first;
+        entropy->pub.decode_mcu = decode_mcu_AC_first;
     } else {
       if (cinfo->Ss == 0)
-	entropy->pub.decode_mcu = decode_mcu_DC_refine;
+        entropy->pub.decode_mcu = decode_mcu_DC_refine;
       else
-	entropy->pub.decode_mcu = decode_mcu_AC_refine;
+        entropy->pub.decode_mcu = decode_mcu_AC_refine;
     }
   } else {
     /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
      * This ought to be an error condition, but we make it a warning.
      */
     if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
-	(cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
+        (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
       WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
     /* Select MCU decoding routine */
     entropy->pub.decode_mcu = decode_mcu;
@@ -693,25 +694,25 @@
   /* Allocate & initialize requested statistics areas */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
-    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+    if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
       tbl = compptr->dc_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
-	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->dc_stats[tbl] == NULL)
-	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
       MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
-    if (! cinfo->progressive_mode || cinfo->Ss) {
+    if (!cinfo->progressive_mode || cinfo->Ss) {
       tbl = compptr->ac_tbl_no;
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
-	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->ac_stats[tbl] == NULL)
-	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
       MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
     }
   }
@@ -719,7 +720,7 @@
   /* Initialize arithmetic decoding variables */
   entropy->c = 0;
   entropy->a = 0;
-  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+  entropy->ct = -16;    /* force reading 2 initial bytes to fill C */
 
   /* Initialize restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -738,7 +739,7 @@
 
   entropy = (arith_entropy_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(arith_entropy_decoder));
+                                sizeof(arith_entropy_decoder));
   cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
   entropy->pub.start_pass = start_pass;
 
@@ -756,10 +757,10 @@
     int *coef_bit_ptr, ci;
     cinfo->coef_bits = (int (*)[DCTSIZE2])
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+                                  cinfo->num_components*DCTSIZE2*sizeof(int));
     coef_bit_ptr = & cinfo->coef_bits[0][0];
-    for (ci = 0; ci < cinfo->num_components; ci++) 
+    for (ci = 0; ci < cinfo->num_components; ci++)
       for (i = 0; i < DCTSIZE2; i++)
-	*coef_bit_ptr++ = -1;
+        *coef_bit_ptr++ = -1;
   }
 }
diff --git a/jdatadst-tj.c b/jdatadst-tj.c
index a8bf240..5d4260a 100644
--- a/jdatadst-tj.c
+++ b/jdatadst-tj.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2011, 2014 D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains compression data destination routines for the case of
  * emitting JPEG data to memory or to a file (or any stdio stream).
@@ -22,13 +23,13 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
+extern void *malloc (size_t size);
+extern void free (void *ptr);
 #endif
 
 
-#define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */
+#define OUTPUT_BUF_SIZE  4096   /* choose an efficiently fwrite'able size */
 
 
 /* Expanded data destination object for memory output */
@@ -36,15 +37,15 @@
 typedef struct {
   struct jpeg_destination_mgr pub; /* public fields */
 
-  unsigned char ** outbuffer;	/* target buffer */
-  unsigned long * outsize;
-  unsigned char * newbuffer;	/* newly allocated buffer */
-  JOCTET * buffer;		/* start of buffer */
+  unsigned char **outbuffer;    /* target buffer */
+  unsigned long *outsize;
+  unsigned char *newbuffer;     /* newly allocated buffer */
+  JOCTET *buffer;               /* start of buffer */
   size_t bufsize;
   boolean alloc;
 } my_mem_destination_mgr;
 
-typedef my_mem_destination_mgr * my_mem_dest_ptr;
+typedef my_mem_destination_mgr *my_mem_dest_ptr;
 
 
 /*
@@ -86,7 +87,7 @@
 empty_mem_output_buffer (j_compress_ptr cinfo)
 {
   size_t nextsize;
-  JOCTET * nextbuffer;
+  JOCTET *nextbuffer;
   my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
 
   if (!dest->alloc) ERREXIT(cinfo, JERR_BUFFER_SIZE);
@@ -147,29 +148,33 @@
 
 GLOBAL(void)
 jpeg_mem_dest_tj (j_compress_ptr cinfo,
-	       unsigned char ** outbuffer, unsigned long * outsize,
-	       boolean alloc)
+               unsigned char **outbuffer, unsigned long *outsize,
+               boolean alloc)
 {
+  boolean reused = FALSE;
   my_mem_dest_ptr dest;
 
-  if (outbuffer == NULL || outsize == NULL)	/* sanity check */
+  if (outbuffer == NULL || outsize == NULL)     /* sanity check */
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* The destination object is made permanent so that multiple JPEG images
    * can be written to the same buffer without re-executing jpeg_mem_dest.
    */
-  if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
+  if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(my_mem_destination_mgr));
+                                  sizeof(my_mem_destination_mgr));
     dest = (my_mem_dest_ptr) cinfo->dest;
     dest->newbuffer = NULL;
+    dest->buffer = NULL;
   }
 
   dest = (my_mem_dest_ptr) cinfo->dest;
   dest->pub.init_destination = init_mem_destination;
   dest->pub.empty_output_buffer = empty_mem_output_buffer;
   dest->pub.term_destination = term_mem_destination;
+  if (dest->buffer == *outbuffer && *outbuffer != NULL && alloc)
+    reused = TRUE;
   dest->outbuffer = outbuffer;
   dest->outsize = outsize;
   dest->alloc = alloc;
@@ -186,5 +191,7 @@
   }
 
   dest->pub.next_output_byte = dest->buffer = *outbuffer;
-  dest->pub.free_in_buffer = dest->bufsize = *outsize;
+  if (!reused)
+    dest->bufsize = *outsize;
+  dest->pub.free_in_buffer = dest->bufsize;
 }
diff --git a/jdatadst.c b/jdatadst.c
index 1b89fab..21018b0 100644
--- a/jdatadst.c
+++ b/jdatadst.c
@@ -6,7 +6,8 @@
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains compression data destination routines for the case of
  * emitting JPEG data to memory or to a file (or any stdio stream).
@@ -22,9 +23,9 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
+extern void *malloc (size_t size);
+extern void free (void *ptr);
 #endif
 
 
@@ -33,13 +34,13 @@
 typedef struct {
   struct jpeg_destination_mgr pub; /* public fields */
 
-  FILE * outfile;		/* target stream */
-  JOCTET * buffer;		/* start of buffer */
+  FILE *outfile;                /* target stream */
+  JOCTET *buffer;               /* start of buffer */
 } my_destination_mgr;
 
-typedef my_destination_mgr * my_dest_ptr;
+typedef my_destination_mgr *my_dest_ptr;
 
-#define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */
+#define OUTPUT_BUF_SIZE  4096   /* choose an efficiently fwrite'able size */
 
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
@@ -48,14 +49,14 @@
 typedef struct {
   struct jpeg_destination_mgr pub; /* public fields */
 
-  unsigned char ** outbuffer;	/* target buffer */
-  unsigned long * outsize;
-  unsigned char * newbuffer;	/* newly allocated buffer */
-  JOCTET * buffer;		/* start of buffer */
+  unsigned char **outbuffer;    /* target buffer */
+  unsigned long *outsize;
+  unsigned char *newbuffer;     /* newly allocated buffer */
+  JOCTET *buffer;               /* start of buffer */
   size_t bufsize;
 } my_mem_destination_mgr;
 
-typedef my_mem_destination_mgr * my_mem_dest_ptr;
+typedef my_mem_destination_mgr *my_mem_dest_ptr;
 #endif
 
 
@@ -72,7 +73,7 @@
   /* Allocate the output buffer --- it will be released when done with image */
   dest->buffer = (JOCTET *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  OUTPUT_BUF_SIZE * SIZEOF(JOCTET));
+                                  OUTPUT_BUF_SIZE * sizeof(JOCTET));
 
   dest->pub.next_output_byte = dest->buffer;
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
@@ -130,7 +131,7 @@
 empty_mem_output_buffer (j_compress_ptr cinfo)
 {
   size_t nextsize;
-  JOCTET * nextbuffer;
+  JOCTET *nextbuffer;
   my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
 
   /* Try to allocate new buffer with double size */
@@ -203,7 +204,7 @@
  */
 
 GLOBAL(void)
-jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile)
+jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
 {
   my_dest_ptr dest;
 
@@ -213,10 +214,10 @@
    * manager serially with the same JPEG object, because their private object
    * sizes may be different.  Caveat programmer.
    */
-  if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
+  if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(my_destination_mgr));
+                                  sizeof(my_destination_mgr));
   }
 
   dest = (my_dest_ptr) cinfo->dest;
@@ -237,24 +238,27 @@
  * larger memory, so the buffer is available to the application after
  * finishing compression, and then the application is responsible for
  * freeing the requested memory.
+ * Note:  An initial buffer supplied by the caller is expected to be
+ * managed by the application.  The library does not free such a buffer
+ * when allocating a larger buffer.
  */
 
 GLOBAL(void)
 jpeg_mem_dest (j_compress_ptr cinfo,
-	       unsigned char ** outbuffer, unsigned long * outsize)
+               unsigned char **outbuffer, unsigned long *outsize)
 {
   my_mem_dest_ptr dest;
 
-  if (outbuffer == NULL || outsize == NULL)	/* sanity check */
+  if (outbuffer == NULL || outsize == NULL)     /* sanity check */
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* The destination object is made permanent so that multiple JPEG images
    * can be written to the same buffer without re-executing jpeg_mem_dest.
    */
-  if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
+  if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(my_mem_destination_mgr));
+                                  sizeof(my_mem_destination_mgr));
   }
 
   dest = (my_mem_dest_ptr) cinfo->dest;
diff --git a/jdatasrc-tj.c b/jdatasrc-tj.c
index 259c6de..0b99ee1 100644
--- a/jdatasrc-tj.c
+++ b/jdatasrc-tj.c
@@ -6,7 +6,8 @@
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains decompression data source routines for the case of
  * reading JPEG data from memory or from a file (or any stdio stream).
@@ -105,7 +106,7 @@
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  struct jpeg_source_mgr * src = cinfo->src;
+  struct jpeg_source_mgr *src = cinfo->src;
 
   /* Just a dumb implementation for now.  Could use fseek() except
    * it doesn't work on pipes.  Not clear that being smart is worth
@@ -157,21 +158,21 @@
 
 GLOBAL(void)
 jpeg_mem_src_tj (j_decompress_ptr cinfo,
-	      unsigned char * inbuffer, unsigned long insize)
+                 const unsigned char *inbuffer, unsigned long insize)
 {
-  struct jpeg_source_mgr * src;
+  struct jpeg_source_mgr *src;
 
-  if (inbuffer == NULL || insize == 0)	/* Treat empty input as fatal error */
+  if (inbuffer == NULL || insize == 0)  /* Treat empty input as fatal error */
     ERREXIT(cinfo, JERR_INPUT_EMPTY);
 
   /* The source object is made permanent so that a series of JPEG images
    * can be read from the same buffer by calling jpeg_mem_src only before
    * the first one.
    */
-  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+  if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(struct jpeg_source_mgr));
+                                  sizeof(struct jpeg_source_mgr));
   }
 
   src = cinfo->src;
@@ -181,5 +182,5 @@
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
   src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (JOCTET *) inbuffer;
+  src->next_input_byte = (const JOCTET *) inbuffer;
 }
diff --git a/jdatasrc.c b/jdatasrc.c
index 1e9c8ad..acbeb8a 100644
--- a/jdatasrc.c
+++ b/jdatasrc.c
@@ -6,7 +6,8 @@
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains decompression data source routines for the case of
  * reading JPEG data from memory or from a file (or any stdio stream).
@@ -26,16 +27,16 @@
 /* Expanded data source object for stdio input */
 
 typedef struct {
-  struct jpeg_source_mgr pub;	/* public fields */
+  struct jpeg_source_mgr pub;   /* public fields */
 
-  FILE * infile;		/* source stream */
-  JOCTET * buffer;		/* start of buffer */
-  boolean start_of_file;	/* have we gotten any data yet? */
+  FILE *infile;                 /* source stream */
+  JOCTET *buffer;               /* start of buffer */
+  boolean start_of_file;        /* have we gotten any data yet? */
 } my_source_mgr;
 
-typedef my_source_mgr * my_src_ptr;
+typedef my_source_mgr *my_src_ptr;
 
-#define INPUT_BUF_SIZE  4096	/* choose an efficiently fread'able size */
+#define INPUT_BUF_SIZE  4096    /* choose an efficiently fread'able size */
 
 
 /*
@@ -106,7 +107,7 @@
   nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
 
   if (nbytes <= 0) {
-    if (src->start_of_file)	/* Treat empty input file as fatal error */
+    if (src->start_of_file)     /* Treat empty input file as fatal error */
       ERREXIT(cinfo, JERR_INPUT_EMPTY);
     WARNMS(cinfo, JWRN_JPEG_EOF);
     /* Insert a fake EOI marker */
@@ -161,7 +162,7 @@
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  struct jpeg_source_mgr * src = cinfo->src;
+  struct jpeg_source_mgr *src = cinfo->src;
 
   /* Just a dumb implementation for now.  Could use fseek() except
    * it doesn't work on pipes.  Not clear that being smart is worth
@@ -213,7 +214,7 @@
  */
 
 GLOBAL(void)
-jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
+jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
 {
   my_src_ptr src;
 
@@ -224,14 +225,14 @@
    * This makes it unsafe to use this manager and a different source
    * manager serially with the same JPEG object.  Caveat programmer.
    */
-  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+  if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(my_source_mgr));
+                                  sizeof(my_source_mgr));
     src = (my_src_ptr) cinfo->src;
     src->buffer = (JOCTET *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  INPUT_BUF_SIZE * SIZEOF(JOCTET));
+                                  INPUT_BUF_SIZE * sizeof(JOCTET));
   }
 
   src = (my_src_ptr) cinfo->src;
@@ -254,21 +255,21 @@
 
 GLOBAL(void)
 jpeg_mem_src (j_decompress_ptr cinfo,
-	      unsigned char * inbuffer, unsigned long insize)
+              const unsigned char *inbuffer, unsigned long insize)
 {
-  struct jpeg_source_mgr * src;
+  struct jpeg_source_mgr *src;
 
-  if (inbuffer == NULL || insize == 0)	/* Treat empty input as fatal error */
+  if (inbuffer == NULL || insize == 0)  /* Treat empty input as fatal error */
     ERREXIT(cinfo, JERR_INPUT_EMPTY);
 
   /* The source object is made permanent so that a series of JPEG images
    * can be read from the same buffer by calling jpeg_mem_src only before
    * the first one.
    */
-  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+  if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(struct jpeg_source_mgr));
+                                  sizeof(struct jpeg_source_mgr));
   }
 
   src = cinfo->src;
@@ -278,6 +279,6 @@
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
   src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (JOCTET *) inbuffer;
+  src->next_input_byte = (const JOCTET *) inbuffer;
 }
 #endif
diff --git a/jdcoefct.c b/jdcoefct.c
index 4309462..1a48969 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -5,8 +5,10 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the coefficient buffer controller for decompression.
  * This controller is the top level of the JPEG decompressor proper.
@@ -17,21 +19,22 @@
  * Also, the input side (only) is used when reading a file for transcoding.
  */
 
+#include "jinclude.h"
 #include "jdcoefct.h"
 #include "jpegcomp.h"
 
 
 /* Forward declarations */
 METHODDEF(int) decompress_onepass
-	JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf));
+        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
 #ifdef D_MULTISCAN_FILES_SUPPORTED
 METHODDEF(int) decompress_data
-	JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf));
+        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
 #endif
 #ifdef BLOCK_SMOOTHING_SUPPORTED
-LOCAL(boolean) smoothing_ok JPP((j_decompress_ptr cinfo));
+LOCAL(boolean) smoothing_ok (j_decompress_ptr cinfo);
 METHODDEF(int) decompress_smooth_data
-	JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf));
+        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
 #endif
 
 
@@ -83,7 +86,7 @@
 decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
   my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-  JDIMENSION MCU_col_num;	/* index of current MCU within row */
+  JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   int blkn, ci, xindex, yindex, yoffset, useful_width;
@@ -96,49 +99,57 @@
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
-	 MCU_col_num++) {
+         MCU_col_num++) {
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
-      jzero_far((void FAR *) coef->MCU_buffer[0],
-		(size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
+      jzero_far((void *) coef->MCU_buffer[0],
+                (size_t) (cinfo->blocks_in_MCU * sizeof(JBLOCK)));
       if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->MCU_ctr = MCU_col_num;
-	return JPEG_SUSPENDED;
+        /* Suspension forced; update state counters and exit */
+        coef->MCU_vert_offset = yoffset;
+        coef->MCU_ctr = MCU_col_num;
+        return JPEG_SUSPENDED;
       }
-      /* Determine where data should go in output_buf and do the IDCT thing.
-       * We skip dummy blocks at the right and bottom edges (but blkn gets
-       * incremented past them!).  Note the inner loop relies on having
-       * allocated the MCU_buffer[] blocks sequentially.
+
+      /* Only perform the IDCT on blocks that are contained within the desired
+       * cropping region.
        */
-      blkn = 0;			/* index of current DCT block within MCU */
-      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	/* Don't bother to IDCT an uninteresting component. */
-	if (! compptr->component_needed) {
-	  blkn += compptr->MCU_blocks;
-	  continue;
-	}
-	inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-						    : compptr->last_col_width;
-	output_ptr = output_buf[compptr->component_index] +
-	  yoffset * compptr->_DCT_scaled_size;
-	start_col = MCU_col_num * compptr->MCU_sample_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  if (cinfo->input_iMCU_row < last_iMCU_row ||
-	      yoffset+yindex < compptr->last_row_height) {
-	    output_col = start_col;
-	    for (xindex = 0; xindex < useful_width; xindex++) {
-	      (*inverse_DCT) (cinfo, compptr,
-			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
-			      output_ptr, output_col);
-	      output_col += compptr->_DCT_scaled_size;
-	    }
-	  }
-	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->_DCT_scaled_size;
-	}
+      if (MCU_col_num >= cinfo->master->first_iMCU_col &&
+          MCU_col_num <= cinfo->master->last_iMCU_col) {
+        /* Determine where data should go in output_buf and do the IDCT thing.
+         * We skip dummy blocks at the right and bottom edges (but blkn gets
+         * incremented past them!).  Note the inner loop relies on having
+         * allocated the MCU_buffer[] blocks sequentially.
+         */
+        blkn = 0;                 /* index of current DCT block within MCU */
+        for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+          compptr = cinfo->cur_comp_info[ci];
+          /* Don't bother to IDCT an uninteresting component. */
+          if (! compptr->component_needed) {
+            blkn += compptr->MCU_blocks;
+            continue;
+          }
+          inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+          useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
+                                                      : compptr->last_col_width;
+          output_ptr = output_buf[compptr->component_index] +
+            yoffset * compptr->_DCT_scaled_size;
+          start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
+              compptr->MCU_sample_width;
+          for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+            if (cinfo->input_iMCU_row < last_iMCU_row ||
+                yoffset+yindex < compptr->last_row_height) {
+              output_col = start_col;
+              for (xindex = 0; xindex < useful_width; xindex++) {
+                (*inverse_DCT) (cinfo, compptr,
+                                (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+                                output_ptr, output_col);
+                output_col += compptr->_DCT_scaled_size;
+              }
+            }
+            blkn += compptr->MCU_width;
+            output_ptr += compptr->_DCT_scaled_size;
+          }
+        }
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -163,7 +174,7 @@
 METHODDEF(int)
 dummy_consume_data (j_decompress_ptr cinfo)
 {
-  return JPEG_SUSPENDED;	/* Always indicate nothing was done */
+  return JPEG_SUSPENDED;        /* Always indicate nothing was done */
 }
 
 
@@ -180,7 +191,7 @@
 consume_data (j_decompress_ptr cinfo)
 {
   my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-  JDIMENSION MCU_col_num;	/* index of current MCU within row */
+  JDIMENSION MCU_col_num;       /* index of current MCU within row */
   int blkn, ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
   JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
@@ -204,25 +215,25 @@
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row;
-	 MCU_col_num++) {
+         MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
-      blkn = 0;			/* index of current DCT block within MCU */
+      blkn = 0;                 /* index of current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	start_col = MCU_col_num * compptr->MCU_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
-	    coef->MCU_buffer[blkn++] = buffer_ptr++;
-	  }
-	}
+        compptr = cinfo->cur_comp_info[ci];
+        start_col = MCU_col_num * compptr->MCU_width;
+        for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+          buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+          for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
+            coef->MCU_buffer[blkn++] = buffer_ptr++;
+          }
+        }
       }
       /* Try to fetch the MCU. */
       if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->MCU_ctr = MCU_col_num;
-	return JPEG_SUSPENDED;
+        /* Suspension forced; update state counters and exit */
+        coef->MCU_vert_offset = yoffset;
+        coef->MCU_ctr = MCU_col_num;
+        return JPEG_SUSPENDED;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -263,8 +274,8 @@
 
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number < cinfo->output_scan_number ||
-	 (cinfo->input_scan_number == cinfo->output_scan_number &&
-	  cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
+         (cinfo->input_scan_number == cinfo->output_scan_number &&
+          cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
     if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
       return JPEG_SUSPENDED;
   }
@@ -292,13 +303,14 @@
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
-      buffer_ptr = buffer[block_row];
+      buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
       output_col = 0;
-      for (block_num = 0; block_num < compptr->width_in_blocks; block_num++) {
-	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
-			output_ptr, output_col);
-	buffer_ptr++;
-	output_col += compptr->_DCT_scaled_size;
+      for (block_num = cinfo->master->first_MCU_col[ci];
+           block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
+        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
+                        output_ptr, output_col);
+        buffer_ptr++;
+        output_col += compptr->_DCT_scaled_size;
       }
       output_ptr += compptr->_DCT_scaled_size;
     }
@@ -344,9 +356,9 @@
   boolean smoothing_useful = FALSE;
   int ci, coefi;
   jpeg_component_info *compptr;
-  JQUANT_TBL * qtable;
-  int * coef_bits;
-  int * coef_bits_latch;
+  JQUANT_TBL *qtable;
+  int *coef_bits;
+  int *coef_bits_latch;
 
   if (! cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
@@ -355,8 +367,8 @@
   if (coef->coef_bits_latch == NULL)
     coef->coef_bits_latch = (int *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  cinfo->num_components *
-				  (SAVED_COEFS * SIZEOF(int)));
+                                  cinfo->num_components *
+                                  (SAVED_COEFS * sizeof(int)));
   coef_bits_latch = coef->coef_bits_latch;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -366,11 +378,11 @@
       return FALSE;
     /* Verify DC & first 5 AC quantizers are nonzero to avoid zero-divide. */
     if (qtable->quantval[0] == 0 ||
-	qtable->quantval[Q01_POS] == 0 ||
-	qtable->quantval[Q10_POS] == 0 ||
-	qtable->quantval[Q20_POS] == 0 ||
-	qtable->quantval[Q11_POS] == 0 ||
-	qtable->quantval[Q02_POS] == 0)
+        qtable->quantval[Q01_POS] == 0 ||
+        qtable->quantval[Q10_POS] == 0 ||
+        qtable->quantval[Q20_POS] == 0 ||
+        qtable->quantval[Q11_POS] == 0 ||
+        qtable->quantval[Q02_POS] == 0)
       return FALSE;
     /* DC values must be at least partly known for all components. */
     coef_bits = cinfo->coef_bits[ci];
@@ -380,7 +392,7 @@
     for (coefi = 1; coefi <= 5; coefi++) {
       coef_bits_latch[coefi] = coef_bits[coefi];
       if (coef_bits[coefi] != 0)
-	smoothing_useful = TRUE;
+        smoothing_useful = TRUE;
     }
     coef_bits_latch += SAVED_COEFS;
   }
@@ -407,10 +419,10 @@
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
   boolean first_row, last_row;
-  JCOEF * workspace;
+  JCOEF *workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
-  INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
+  JLONG Q00,Q01,Q02,Q10,Q11,Q20, num;
   int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
   int Al, pred;
 
@@ -419,7 +431,7 @@
 
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
-	 ! cinfo->inputctl->eoi_reached) {
+         ! cinfo->inputctl->eoi_reached) {
     if (cinfo->input_scan_number == cinfo->output_scan_number) {
       /* If input is working on current scan, we ordinarily want it to
        * have completed the current row.  But if input scan is DC,
@@ -428,7 +440,7 @@
        */
       JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0;
       if (cinfo->input_iMCU_row > cinfo->output_iMCU_row+delta)
-	break;
+        break;
     }
     if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
       return JPEG_SUSPENDED;
@@ -456,15 +468,15 @@
     if (cinfo->output_iMCU_row > 0) {
       access_rows += compptr->v_samp_factor; /* prior iMCU row too */
       buffer = (*cinfo->mem->access_virt_barray)
-	((j_common_ptr) cinfo, coef->whole_image[ci],
-	 (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
-	 (JDIMENSION) access_rows, FALSE);
-      buffer += compptr->v_samp_factor;	/* point to current iMCU row */
+        ((j_common_ptr) cinfo, coef->whole_image[ci],
+         (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
+         (JDIMENSION) access_rows, FALSE);
+      buffer += compptr->v_samp_factor; /* point to current iMCU row */
       first_row = FALSE;
     } else {
       buffer = (*cinfo->mem->access_virt_barray)
-	((j_common_ptr) cinfo, coef->whole_image[ci],
-	 (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE);
+        ((j_common_ptr) cinfo, coef->whole_image[ci],
+         (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE);
       first_row = TRUE;
     }
     /* Fetch component-dependent info */
@@ -480,15 +492,15 @@
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
-      buffer_ptr = buffer[block_row];
+      buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
       if (first_row && block_row == 0)
-	prev_block_row = buffer_ptr;
+        prev_block_row = buffer_ptr;    /* already column-offset */
       else
-	prev_block_row = buffer[block_row-1];
+        prev_block_row = buffer[block_row-1] + cinfo->master->first_MCU_col[ci];
       if (last_row && block_row == block_rows-1)
-	next_block_row = buffer_ptr;
+        next_block_row = buffer_ptr;    /* already column-offset */
       else
-	next_block_row = buffer[block_row+1];
+        next_block_row = buffer[block_row+1] + cinfo->master->first_MCU_col[ci];
       /* We fetch the surrounding DC values using a sliding-register approach.
        * Initialize all nine here so as to do the right thing on narrow pics.
        */
@@ -497,103 +509,104 @@
       DC7 = DC8 = DC9 = (int) next_block_row[0][0];
       output_col = 0;
       last_block_column = compptr->width_in_blocks - 1;
-      for (block_num = 0; block_num <= last_block_column; block_num++) {
-	/* Fetch current DCT block into workspace so we can modify it. */
-	jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
-	/* Update DC values */
-	if (block_num < last_block_column) {
-	  DC3 = (int) prev_block_row[1][0];
-	  DC6 = (int) buffer_ptr[1][0];
-	  DC9 = (int) next_block_row[1][0];
-	}
-	/* Compute coefficient estimates per K.8.
-	 * An estimate is applied only if coefficient is still zero,
-	 * and is not known to be fully accurate.
-	 */
-	/* AC01 */
-	if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
-	  num = 36 * Q00 * (DC4 - DC6);
-	  if (num >= 0) {
-	    pred = (int) (((Q01<<7) + num) / (Q01<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	  } else {
-	    pred = (int) (((Q01<<7) - num) / (Q01<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	    pred = -pred;
-	  }
-	  workspace[1] = (JCOEF) pred;
-	}
-	/* AC10 */
-	if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
-	  num = 36 * Q00 * (DC2 - DC8);
-	  if (num >= 0) {
-	    pred = (int) (((Q10<<7) + num) / (Q10<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	  } else {
-	    pred = (int) (((Q10<<7) - num) / (Q10<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	    pred = -pred;
-	  }
-	  workspace[8] = (JCOEF) pred;
-	}
-	/* AC20 */
-	if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
-	  num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
-	  if (num >= 0) {
-	    pred = (int) (((Q20<<7) + num) / (Q20<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	  } else {
-	    pred = (int) (((Q20<<7) - num) / (Q20<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	    pred = -pred;
-	  }
-	  workspace[16] = (JCOEF) pred;
-	}
-	/* AC11 */
-	if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
-	  num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
-	  if (num >= 0) {
-	    pred = (int) (((Q11<<7) + num) / (Q11<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	  } else {
-	    pred = (int) (((Q11<<7) - num) / (Q11<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	    pred = -pred;
-	  }
-	  workspace[9] = (JCOEF) pred;
-	}
-	/* AC02 */
-	if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
-	  num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
-	  if (num >= 0) {
-	    pred = (int) (((Q02<<7) + num) / (Q02<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	  } else {
-	    pred = (int) (((Q02<<7) - num) / (Q02<<8));
-	    if (Al > 0 && pred >= (1<<Al))
-	      pred = (1<<Al)-1;
-	    pred = -pred;
-	  }
-	  workspace[2] = (JCOEF) pred;
-	}
-	/* OK, do the IDCT */
-	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
-			output_ptr, output_col);
-	/* Advance for next column */
-	DC1 = DC2; DC2 = DC3;
-	DC4 = DC5; DC5 = DC6;
-	DC7 = DC8; DC8 = DC9;
-	buffer_ptr++, prev_block_row++, next_block_row++;
-	output_col += compptr->_DCT_scaled_size;
+      for (block_num = cinfo->master->first_MCU_col[ci];
+           block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
+        /* Fetch current DCT block into workspace so we can modify it. */
+        jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
+        /* Update DC values */
+        if (block_num < last_block_column) {
+          DC3 = (int) prev_block_row[1][0];
+          DC6 = (int) buffer_ptr[1][0];
+          DC9 = (int) next_block_row[1][0];
+        }
+        /* Compute coefficient estimates per K.8.
+         * An estimate is applied only if coefficient is still zero,
+         * and is not known to be fully accurate.
+         */
+        /* AC01 */
+        if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
+          num = 36 * Q00 * (DC4 - DC6);
+          if (num >= 0) {
+            pred = (int) (((Q01<<7) + num) / (Q01<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+          } else {
+            pred = (int) (((Q01<<7) - num) / (Q01<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+            pred = -pred;
+          }
+          workspace[1] = (JCOEF) pred;
+        }
+        /* AC10 */
+        if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
+          num = 36 * Q00 * (DC2 - DC8);
+          if (num >= 0) {
+            pred = (int) (((Q10<<7) + num) / (Q10<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+          } else {
+            pred = (int) (((Q10<<7) - num) / (Q10<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+            pred = -pred;
+          }
+          workspace[8] = (JCOEF) pred;
+        }
+        /* AC20 */
+        if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
+          num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
+          if (num >= 0) {
+            pred = (int) (((Q20<<7) + num) / (Q20<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+          } else {
+            pred = (int) (((Q20<<7) - num) / (Q20<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+            pred = -pred;
+          }
+          workspace[16] = (JCOEF) pred;
+        }
+        /* AC11 */
+        if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
+          num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
+          if (num >= 0) {
+            pred = (int) (((Q11<<7) + num) / (Q11<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+          } else {
+            pred = (int) (((Q11<<7) - num) / (Q11<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+            pred = -pred;
+          }
+          workspace[9] = (JCOEF) pred;
+        }
+        /* AC02 */
+        if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
+          num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
+          if (num >= 0) {
+            pred = (int) (((Q02<<7) + num) / (Q02<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+          } else {
+            pred = (int) (((Q02<<7) - num) / (Q02<<8));
+            if (Al > 0 && pred >= (1<<Al))
+              pred = (1<<Al)-1;
+            pred = -pred;
+          }
+          workspace[2] = (JCOEF) pred;
+        }
+        /* OK, do the IDCT */
+        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
+                        output_ptr, output_col);
+        /* Advance for next column */
+        DC1 = DC2; DC2 = DC3;
+        DC4 = DC5; DC5 = DC6;
+        DC7 = DC8; DC8 = DC9;
+        buffer_ptr++, prev_block_row++, next_block_row++;
+        output_col += compptr->_DCT_scaled_size;
       }
       output_ptr += compptr->_DCT_scaled_size;
     }
@@ -618,7 +631,7 @@
 
   coef = (my_coef_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_coef_controller));
+                                sizeof(my_coef_controller));
   cinfo->coef = (struct jpeg_d_coef_controller *) coef;
   coef->pub.start_input_pass = start_input_pass;
   coef->pub.start_output_pass = start_output_pass;
@@ -636,20 +649,20 @@
     jpeg_component_info *compptr;
 
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
+         ci++, compptr++) {
       access_rows = compptr->v_samp_factor;
 #ifdef BLOCK_SMOOTHING_SUPPORTED
       /* If block smoothing could be used, need a bigger window */
       if (cinfo->progressive_mode)
-	access_rows *= 3;
+        access_rows *= 3;
 #endif
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE,
-	 (JDIMENSION) jround_up((long) compptr->width_in_blocks,
-				(long) compptr->h_samp_factor),
-	 (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-				(long) compptr->v_samp_factor),
-	 (JDIMENSION) access_rows);
+        ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE,
+         (JDIMENSION) jround_up((long) compptr->width_in_blocks,
+                                (long) compptr->h_samp_factor),
+         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
+                                (long) compptr->v_samp_factor),
+         (JDIMENSION) access_rows);
     }
     coef->pub.consume_data = consume_data;
     coef->pub.decompress_data = decompress_data;
@@ -664,7 +677,7 @@
 
     buffer = (JBLOCKROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
+                                  D_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
     for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
     }
@@ -676,5 +689,5 @@
   /* Allocate the workspace buffer */
   coef->workspace = (JCOEF *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                SIZEOF(JCOEF) * DCTSIZE2);
+                                sizeof(JCOEF) * DCTSIZE2);
 }
diff --git a/jdcoefct.h b/jdcoefct.h
index 2f7bbe5..bf6beb2 100644
--- a/jdcoefct.h
+++ b/jdcoefct.h
@@ -5,11 +5,11 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  */
 
 #define JPEG_INTERNALS
-#include "jinclude.h"
 #include "jpeglib.h"
 
 
@@ -41,7 +41,7 @@
   JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
 
   /* Temporary workspace for one MCU */
-  JCOEF * workspace;
+  JCOEF *workspace;
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* In multi-pass modes, we need a virtual block array for each component. */
@@ -50,12 +50,12 @@
 
 #ifdef BLOCK_SMOOTHING_SUPPORTED
   /* When doing block smoothing, we latch coefficient Al values here */
-  int * coef_bits_latch;
+  int *coef_bits_latch;
 #define SAVED_COEFS  6          /* we save coef_bits[0..5] */
 #endif
 } my_coef_controller;
 
-typedef my_coef_controller * my_coef_ptr;
+typedef my_coef_controller *my_coef_ptr;
 
 
 LOCAL(void)
diff --git a/jdcol565.c b/jdcol565.c
index 695f262..349fce4 100644
--- a/jdcol565.c
+++ b/jdcol565.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains output colorspace conversion routines.
  */
@@ -30,12 +31,12 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   register int * Crrtab = cconvert->Cr_r_tab;
   register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  register JLONG * Crgtab = cconvert->Cr_g_tab;
+  register JLONG * Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
     inptr0 = input_buf[0][input_row];
     inptr1 = input_buf[1][input_row];
@@ -52,7 +53,7 @@
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -87,7 +88,7 @@
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -109,13 +110,13 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   register int * Crrtab = cconvert->Cr_r_tab;
   register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  register JLONG * Crgtab = cconvert->Cr_g_tab;
+  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
 
     inptr0 = input_buf[0][input_row];
@@ -133,7 +134,7 @@
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -173,7 +174,7 @@
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -192,7 +193,7 @@
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
 
     inptr0 = input_buf[0][input_row];
@@ -205,7 +206,7 @@
       g = GETJSAMPLE(*inptr1++);
       b = GETJSAMPLE(*inptr2++);
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -228,7 +229,7 @@
       g = GETJSAMPLE(*inptr1);
       b = GETJSAMPLE(*inptr2);
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -245,11 +246,11 @@
   register JDIMENSION col;
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int r, g, b;
 
     inptr0 = input_buf[0][input_row];
@@ -262,7 +263,7 @@
       g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
       b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -287,7 +288,7 @@
       g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
       b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -304,7 +305,7 @@
   JDIMENSION num_cols = cinfo->output_width;
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int g;
 
     inptr = input_buf[0][input_row++];
@@ -312,7 +313,7 @@
     if (PACK_NEED_ALIGNMENT(outptr)) {
       g = *inptr++;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -327,7 +328,7 @@
     if (num_cols & 1) {
       g = *inptr;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
@@ -343,10 +344,10 @@
   register JDIMENSION col;
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
 
   while (--num_rows >= 0) {
-    INT32 rgb;
+    JLONG rgb;
     unsigned int g;
 
     inptr = input_buf[0][input_row++];
@@ -355,7 +356,7 @@
       g = *inptr++;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -377,7 +378,7 @@
       g = *inptr;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = rgb;
+      *(INT16*)outptr = (INT16)rgb;
     }
   }
 }
diff --git a/jdcolext.c b/jdcolext.c
index 6e9e31a..59b676c 100644
--- a/jdcolext.c
+++ b/jdcolext.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009, 2011, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains output colorspace conversion routines.
  */
@@ -41,8 +42,8 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   register int * Crrtab = cconvert->Cr_r_tab;
   register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  register JLONG * Crgtab = cconvert->Cr_g_tab;
+  register JLONG * Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -58,8 +59,8 @@
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
-			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
-						 SCALEBITS))];
+                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                 SCALEBITS))];
       outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
diff --git a/jdcolor.c b/jdcolor.c
index 2c68ed8..ab8fa24 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -6,9 +6,10 @@
  * Modified 2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011-2012, 2014, D. R. Commander.
+ * Copyright (C) 2009, 2011-2012, 2014-2015, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains output colorspace conversion routines.
  */
@@ -17,7 +18,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jsimd.h"
-#include "config.h"
+#include "jconfigint.h"
 
 
 /* Private subobject */
@@ -26,16 +27,16 @@
   struct jpeg_color_deconverter pub; /* public fields */
 
   /* Private state for YCC->RGB conversion */
-  int * Cr_r_tab;		/* => table for Cr to R conversion */
-  int * Cb_b_tab;		/* => table for Cb to B conversion */
-  INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
-  INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
+  int *Cr_r_tab;                /* => table for Cr to R conversion */
+  int *Cb_b_tab;                /* => table for Cb to B conversion */
+  JLONG *Cr_g_tab;              /* => table for Cr to G conversion */
+  JLONG *Cb_g_tab;              /* => table for Cb to G conversion */
 
   /* Private state for RGB->Y conversion */
-  INT32 * rgb_y_tab;		/* => table for RGB to Y conversion */
+  JLONG *rgb_y_tab;             /* => table for RGB to Y conversion */
 } my_color_deconverter;
 
-typedef my_color_deconverter * my_cconvert_ptr;
+typedef my_color_deconverter *my_cconvert_ptr;
 
 
 /**************** YCbCr -> RGB conversion: most common case **************/
@@ -46,11 +47,11 @@
  * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
  * The conversion equations to be implemented are therefore
  *
- *	R = Y                + 1.40200 * Cr
- *	G = Y - 0.34414 * Cb - 0.71414 * Cr
- *	B = Y + 1.77200 * Cb
+ *      R = Y                + 1.40200 * Cr
+ *      G = Y - 0.34414 * Cb - 0.71414 * Cr
+ *      B = Y + 1.77200 * Cb
  *
- *	Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *      Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
  *
  * where Cb and Cr represent the incoming values less CENTERJSAMPLE.
  * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
@@ -72,9 +73,9 @@
  * together before rounding.
  */
 
-#define SCALEBITS	16	/* speediest right-shift on some machines */
-#define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
-#define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
+#define SCALEBITS       16      /* speediest right-shift on some machines */
+#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
+#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
 
 /* We allocate one big table for RGB->Y conversion and divide it up into
  * three parts, instead of doing three alloc_small requests.  This lets us
@@ -83,10 +84,10 @@
  * anyway).
  */
 
-#define R_Y_OFF		0			/* offset to R => Y section */
-#define G_Y_OFF		(1*(MAXJSAMPLE+1))	/* offset to G => Y section */
-#define B_Y_OFF		(2*(MAXJSAMPLE+1))	/* etc. */
-#define TABLE_SIZE	(3*(MAXJSAMPLE+1))
+#define R_Y_OFF         0                       /* offset to R => Y section */
+#define G_Y_OFF         (1*(MAXJSAMPLE+1))      /* offset to G => Y section */
+#define B_Y_OFF         (2*(MAXJSAMPLE+1))      /* etc. */
+#define TABLE_SIZE      (3*(MAXJSAMPLE+1))
 
 
 /* Include inline routines for colorspace extensions */
@@ -211,31 +212,31 @@
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   int i;
-  INT32 x;
+  JLONG x;
   SHIFT_TEMPS
 
   cconvert->Cr_r_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(int));
+                                (MAXJSAMPLE+1) * sizeof(int));
   cconvert->Cb_b_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(int));
-  cconvert->Cr_g_tab = (INT32 *)
+                                (MAXJSAMPLE+1) * sizeof(int));
+  cconvert->Cr_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(INT32));
-  cconvert->Cb_g_tab = (INT32 *)
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
+  cconvert->Cb_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(INT32));
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
     /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
     /* Cr=>R value is nearest int to 1.40200 * x */
     cconvert->Cr_r_tab[i] = (int)
-		    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
+                    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
     /* Cb=>B value is nearest int to 1.77200 * x */
     cconvert->Cb_b_tab[i] = (int)
-		    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
+                    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
     /* Cr=>G value is scaled-up -0.71414 * x */
     cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x;
     /* Cb=>G value is scaled-up -0.34414 * x */
@@ -251,8 +252,8 @@
 
 METHODDEF(void)
 ycc_rgb_convert (j_decompress_ptr cinfo,
-		 JSAMPIMAGE input_buf, JDIMENSION input_row,
-		 JSAMPARRAY output_buf, int num_rows)
+                 JSAMPIMAGE input_buf, JDIMENSION input_row,
+                 JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
     case JCS_EXT_RGB:
@@ -302,13 +303,13 @@
 build_rgb_y_table (j_decompress_ptr cinfo)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  INT32 * rgb_y_tab;
-  INT32 i;
+  JLONG *rgb_y_tab;
+  JLONG i;
 
   /* Allocate and fill in the conversion tables. */
-  cconvert->rgb_y_tab = rgb_y_tab = (INT32 *)
+  cconvert->rgb_y_tab = rgb_y_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(TABLE_SIZE * SIZEOF(INT32)));
+                                (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
     rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
@@ -324,12 +325,12 @@
 
 METHODDEF(void)
 rgb_gray_convert (j_decompress_ptr cinfo,
-		  JSAMPIMAGE input_buf, JDIMENSION input_row,
-		  JSAMPARRAY output_buf, int num_rows)
+                  JSAMPIMAGE input_buf, JDIMENSION input_row,
+                  JSAMPARRAY output_buf, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_y_tab;
+  register JLONG *ctab = cconvert->rgb_y_tab;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
@@ -347,8 +348,8 @@
       b = GETJSAMPLE(inptr2[col]);
       /* Y */
       outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+                 >> SCALEBITS);
     }
   }
 }
@@ -361,26 +362,56 @@
 
 METHODDEF(void)
 null_convert (j_decompress_ptr cinfo,
-	      JSAMPIMAGE input_buf, JDIMENSION input_row,
-	      JSAMPARRAY output_buf, int num_rows)
+              JSAMPIMAGE input_buf, JDIMENSION input_row,
+              JSAMPARRAY output_buf, int num_rows)
 {
-  register JSAMPROW inptr, outptr;
-  register JDIMENSION count;
+  register JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr;
+  register JDIMENSION col;      /* column index within the current row */
   register int num_components = cinfo->num_components;
   JDIMENSION num_cols = cinfo->output_width;
   int ci;
 
-  while (--num_rows >= 0) {
-    for (ci = 0; ci < num_components; ci++) {
-      inptr = input_buf[ci][input_row];
-      outptr = output_buf[0] + ci;
-      for (count = num_cols; count > 0; count--) {
-	*outptr = *inptr++;	/* needn't bother with GETJSAMPLE() here */
-	outptr += num_components;
+  if (num_components == 3) {    /* 3 components: one pass, 3 samples/column */
+    while (--num_rows >= 0) {
+      inptr0 = input_buf[0][input_row];
+      inptr1 = input_buf[1][input_row];
+      inptr2 = input_buf[2][input_row];
+      input_row++;
+      outptr = *output_buf++;   /* take this output row, then advance */
+      for (col = 0; col < num_cols; col++) {
+        *outptr++ = inptr0[col];
+        *outptr++ = inptr1[col];
+        *outptr++ = inptr2[col];
       }
     }
-    input_row++;
-    output_buf++;
+  } else if (num_components == 4) {     /* same scheme, 4 samples/column */
+    while (--num_rows >= 0) {
+      inptr0 = input_buf[0][input_row];
+      inptr1 = input_buf[1][input_row];
+      inptr2 = input_buf[2][input_row];
+      inptr3 = input_buf[3][input_row];
+      input_row++;
+      outptr = *output_buf++;   /* take this output row, then advance */
+      for (col = 0; col < num_cols; col++) {
+        *outptr++ = inptr0[col];
+        *outptr++ = inptr1[col];
+        *outptr++ = inptr2[col];
+        *outptr++ = inptr3[col];
+      }
+    }
+  } else {      /* any other component count: one pass per component */
+    while (--num_rows >= 0) {
+      for (ci = 0; ci < num_components; ci++) {
+        inptr = input_buf[ci][input_row];
+        outptr = *output_buf;   /* restart at row start for each component */
+        for (col = 0; col < num_cols; col++) {
+          outptr[ci] = inptr[col];      /* slot ci of output pixel col */
+          outptr += num_components;     /* step to the next output pixel */
+        }
+      }
+      output_buf++;
+      input_row++;
+    }
   }
 }
 
@@ -393,11 +424,11 @@
 
 METHODDEF(void)
 grayscale_convert (j_decompress_ptr cinfo,
-		   JSAMPIMAGE input_buf, JDIMENSION input_row,
-		   JSAMPARRAY output_buf, int num_rows)
+                   JSAMPIMAGE input_buf, JDIMENSION input_row,
+                   JSAMPARRAY output_buf, int num_rows)
 {
   jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0,
-		    num_rows, cinfo->output_width);
+                    num_rows, cinfo->output_width);
 }
 
 
@@ -407,8 +438,8 @@
 
 METHODDEF(void)
 gray_rgb_convert (j_decompress_ptr cinfo,
-		  JSAMPIMAGE input_buf, JDIMENSION input_row,
-		  JSAMPARRAY output_buf, int num_rows)
+                  JSAMPIMAGE input_buf, JDIMENSION input_row,
+                  JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
     case JCS_EXT_RGB:
@@ -453,8 +484,8 @@
 
 METHODDEF(void)
 rgb_rgb_convert (j_decompress_ptr cinfo,
-		  JSAMPIMAGE input_buf, JDIMENSION input_row,
-		  JSAMPARRAY output_buf, int num_rows)
+                  JSAMPIMAGE input_buf, JDIMENSION input_row,
+                  JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
     case JCS_EXT_RGB:
@@ -502,8 +533,8 @@
 
 METHODDEF(void)
 ycck_cmyk_convert (j_decompress_ptr cinfo,
-		   JSAMPIMAGE input_buf, JDIMENSION input_row,
-		   JSAMPARRAY output_buf, int num_rows)
+                   JSAMPIMAGE input_buf, JDIMENSION input_row,
+                   JSAMPARRAY output_buf, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int y, cb, cr;
@@ -512,11 +543,11 @@
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register INT32 * Crgtab = cconvert->Cr_g_tab;
-  register INT32 * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -531,13 +562,13 @@
       cb = GETJSAMPLE(inptr1[col]);
       cr = GETJSAMPLE(inptr2[col]);
       /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];	/* red */
-      outptr[1] = range_limit[MAXJSAMPLE - (y +			/* green */
-			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
-						 SCALEBITS)))];
-      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];	/* blue */
+      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
+      outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
+                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                 SCALEBITS)))];
+      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
-      outptr[3] = inptr3[col];	/* don't need GETJSAMPLE here */
+      outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
       outptr += 4;
     }
   }
@@ -573,8 +604,8 @@
  */
 
 #define DITHER_MASK       0x3
-#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
-static const INT32 dither_matrix[4] = {
+#define DITHER_ROTATE(x)  ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF))
+static const JLONG dither_matrix[4] = {
   0x0008020A,
   0x0C040E06,
   0x030B0109,
@@ -725,7 +756,7 @@
 
   cconvert = (my_cconvert_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_color_deconverter));
+                                sizeof(my_color_deconverter));
   cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
   cconvert->pub.start_pass = start_pass_dcolor;
 
@@ -748,7 +779,7 @@
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     break;
 
-  default:			/* JCS_UNKNOWN can be anything */
+  default:                      /* JCS_UNKNOWN can be anything */
     if (cinfo->num_components < 1)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     break;
@@ -763,11 +794,11 @@
   case JCS_GRAYSCALE:
     cinfo->out_color_components = 1;
     if (cinfo->jpeg_color_space == JCS_GRAYSCALE ||
-	cinfo->jpeg_color_space == JCS_YCbCr) {
+        cinfo->jpeg_color_space == JCS_YCbCr) {
       cconvert->pub.color_convert = grayscale_convert;
       /* For color->grayscale conversion, only the Y (0) component is needed */
       for (ci = 1; ci < cinfo->num_components; ci++)
-	cinfo->comp_info[ci].component_needed = FALSE;
+        cinfo->comp_info[ci].component_needed = FALSE;
     } else if (cinfo->jpeg_color_space == JCS_RGB) {
       cconvert->pub.color_convert = rgb_gray_convert;
       build_rgb_y_table(cinfo);
@@ -812,11 +843,11 @@
     cinfo->out_color_components = 3;
     if (cinfo->dither_mode == JDITHER_NONE) {
       if (cinfo->jpeg_color_space == JCS_YCbCr) {
-        if (jsimd_can_ycc_rgb565())
-          cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
-        else {
-          cconvert->pub.color_convert = ycc_rgb565_convert;
-          build_ycc_rgb_table(cinfo);
+         if (jsimd_can_ycc_rgb565())
+           cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+         else {
+           cconvert->pub.color_convert = ycc_rgb565_convert;
+           build_ycc_rgb_table(cinfo);
         }
       } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
         cconvert->pub.color_convert = gray_rgb565_convert;
@@ -838,7 +869,7 @@
     }
     break;
 
-case JCS_CMYK:
+  case JCS_CMYK:
     cinfo->out_color_components = 4;
     if (cinfo->jpeg_color_space == JCS_YCCK) {
       cconvert->pub.color_convert = ycck_cmyk_convert;
@@ -854,7 +885,7 @@
     if (cinfo->out_color_space == cinfo->jpeg_color_space) {
       cinfo->out_color_components = cinfo->num_components;
       cconvert->pub.color_convert = null_convert;
-    } else			/* unsupported non-null conversion */
+    } else                      /* unsupported non-null conversion */
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
   }
diff --git a/jdct.h b/jdct.h
index 3637448..faf8e1c 100644
--- a/jdct.h
+++ b/jdct.h
@@ -1,14 +1,17 @@
 /*
  * jdct.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This include file contains common declarations for the forward and
  * inverse DCT modules.  These declarations are private to the DCT managers
  * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms.
- * The individual DCT algorithms are kept in separate files to ease 
+ * The individual DCT algorithms are kept in separate files to ease
  * machine-dependent tuning (e.g., assembly coding).
  */
 
@@ -16,7 +19,7 @@
 /*
  * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
  * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
- * for 8-bit samples, INT32 for 12-bit samples.  (NOTE: Floating-point DCT
+ * for 8-bit samples, JLONG for 12-bit samples.  (NOTE: Floating-point DCT
  * implementations use an array of type FAST_FLOAT, instead.)
  * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
  * The DCT outputs are returned scaled up by a factor of 8; they therefore
@@ -29,7 +32,7 @@
 
 #if BITS_IN_JSAMPLE == 8
 #ifndef WITH_SIMD
-typedef int DCTELEM;		/* 16 or 32 bits is fine */
+typedef int DCTELEM;            /* 16 or 32 bits is fine */
 typedef unsigned int UDCTELEM;
 typedef unsigned long long UDCTELEM2;
 #else
@@ -38,8 +41,7 @@
 typedef unsigned int UDCTELEM2;
 #endif
 #else
-typedef INT32 DCTELEM;		/* must have 32 bits */
-typedef UINT32 UDCTELEM;
+typedef JLONG DCTELEM;          /* must have 32 bits */
 typedef unsigned long long UDCTELEM2;
 #endif
 
@@ -64,10 +66,10 @@
 typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
 #if BITS_IN_JSAMPLE == 8
 typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
-#define IFAST_SCALE_BITS  2	/* fractional bits in scale factors */
+#define IFAST_SCALE_BITS  2     /* fractional bits in scale factors */
 #else
-typedef INT32 IFAST_MULT_TYPE;	/* need 32 bits for scaled quantizers */
-#define IFAST_SCALE_BITS  13	/* fractional bits in scale factors */
+typedef JLONG IFAST_MULT_TYPE;  /* need 32 bits for scaled quantizers */
+#define IFAST_SCALE_BITS  13    /* fractional bits in scale factors */
 #endif
 typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 
@@ -86,105 +88,79 @@
 #define RANGE_MASK  (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */
 
 
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_fdct_islow		jFDislow
-#define jpeg_fdct_ifast		jFDifast
-#define jpeg_fdct_float		jFDfloat
-#define jpeg_idct_islow		jRDislow
-#define jpeg_idct_ifast		jRDifast
-#define jpeg_idct_float		jRDfloat
-#define jpeg_idct_7x7		jRD7x7
-#define jpeg_idct_6x6		jRD6x6
-#define jpeg_idct_5x5		jRD5x5
-#define jpeg_idct_4x4		jRD4x4
-#define jpeg_idct_3x3		jRD3x3
-#define jpeg_idct_2x2		jRD2x2
-#define jpeg_idct_1x1		jRD1x1
-#define jpeg_idct_9x9		jRD9x9
-#define jpeg_idct_10x10		jRD10x10
-#define jpeg_idct_11x11		jRD11x11
-#define jpeg_idct_12x12		jRD12x12
-#define jpeg_idct_13x13		jRD13x13
-#define jpeg_idct_14x14		jRD14x14
-#define jpeg_idct_15x15		jRD15x15
-#define jpeg_idct_16x16		jRD16x16
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
 /* Extern declarations for the forward and inverse DCT routines. */
 
-EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));
+EXTERN(void) jpeg_fdct_islow (DCTELEM *data);
+EXTERN(void) jpeg_fdct_ifast (DCTELEM *data);
+EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data);
 
 EXTERN(void) jpeg_idct_islow
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_ifast
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_float
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_7x7
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_6x6
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_5x5
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_4x4
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_3x3
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_2x2
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_1x1
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_9x9
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_10x10
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_11x11
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_12x12
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_13x13
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_14x14
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_15x15
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 EXTERN(void) jpeg_idct_16x16
-    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
 
 
 /*
  * Macros for handling fixed-point arithmetic; these are used by many
  * but not all of the DCT/IDCT modules.
  *
- * All values are expected to be of type INT32.
+ * All values are expected to be of type JLONG.
  * Fractional constants are scaled left by CONST_BITS bits.
  * CONST_BITS is defined within each module using these macros,
  * and may differ from one module to the next.
  */
 
-#define ONE	((INT32) 1)
+#define ONE     ((JLONG) 1)
 #define CONST_SCALE (ONE << CONST_BITS)
 
 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
@@ -192,16 +168,16 @@
  * thus causing a lot of useless floating-point operations at run time.
  */
 
-#define FIX(x)	((INT32) ((x) * CONST_SCALE + 0.5))
+#define FIX(x)  ((JLONG) ((x) * CONST_SCALE + 0.5))
 
-/* Descale and correctly round an INT32 value that's scaled by N bits.
+/* Descale and correctly round a JLONG value that's scaled by N bits.
  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
  * the fudge factor is correct for either sign of X.
  */
 
 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * This macro is used only when the two inputs will actually be no more than
  * 16 bits wide, so that a 16x16->32 bit multiply can be used instead of a
  * full 32x32 multiply.  This provides a useful speedup on many machines.
@@ -210,23 +186,23 @@
  * correct combination of casts.
  */
 
-#ifdef SHORTxSHORT_32		/* may work if 'int' is 32 bits */
+#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
 #define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT16) (const)))
 #endif
-#ifdef SHORTxLCONST_32		/* known to work with Microsoft C 6.0 */
-#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT32) (const)))
+#ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
+#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((JLONG) (const)))
 #endif
 
-#ifndef MULTIPLY16C16		/* default definition */
+#ifndef MULTIPLY16C16           /* default definition */
 #define MULTIPLY16C16(var,const)  ((var) * (const))
 #endif
 
 /* Same except both inputs are variables. */
 
-#ifdef SHORTxSHORT_32		/* may work if 'int' is 32 bits */
+#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
 #define MULTIPLY16V16(var1,var2)  (((INT16) (var1)) * ((INT16) (var2)))
 #endif
 
-#ifndef MULTIPLY16V16		/* default definition */
+#ifndef MULTIPLY16V16           /* default definition */
 #define MULTIPLY16V16(var1,var2)  ((var1) * (var2))
 #endif
diff --git a/jddctmgr.c b/jddctmgr.c
index 0a5decb..bdf7c53 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -6,8 +6,10 @@
  * Modified 2002-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2015, D. R. Commander.
+ * Copyright (C) 2013, MIPS Technologies, Inc., California
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the inverse-DCT management logic.
  * This code selects a particular IDCT implementation to be used,
@@ -22,7 +24,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 #include "jsimddct.h"
 #include "jpegcomp.h"
 
@@ -47,7 +49,7 @@
 /* Private subobject for this module */
 
 typedef struct {
-  struct jpeg_inverse_dct pub;	/* public fields */
+  struct jpeg_inverse_dct pub;  /* public fields */
 
   /* This array contains the IDCT method code that each multiplier table
    * is currently set up for, or -1 if it's not yet set up.
@@ -57,7 +59,7 @@
   int cur_method[MAX_COMPONENTS];
 } my_idct_controller;
 
-typedef my_idct_controller * my_idct_ptr;
+typedef my_idct_controller *my_idct_ptr;
 
 
 /* Allocated multiplier tables: big enough for any supported variant */
@@ -99,7 +101,7 @@
   jpeg_component_info *compptr;
   int method = 0;
   inverse_DCT_method_ptr method_ptr = NULL;
-  JQUANT_TBL * qtbl;
+  JQUANT_TBL *qtbl;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -108,105 +110,117 @@
 #ifdef IDCT_SCALING_SUPPORTED
     case 1:
       method_ptr = jpeg_idct_1x1;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctred uses islow-style table */
       break;
     case 2:
       if (jsimd_can_idct_2x2())
         method_ptr = jsimd_idct_2x2;
       else
         method_ptr = jpeg_idct_2x2;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctred uses islow-style table */
       break;
     case 3:
       method_ptr = jpeg_idct_3x3;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 4:
       if (jsimd_can_idct_4x4())
         method_ptr = jsimd_idct_4x4;
       else
         method_ptr = jpeg_idct_4x4;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctred uses islow-style table */
       break;
     case 5:
       method_ptr = jpeg_idct_5x5;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 6:
+#if defined(__mips__)
+      if (jsimd_can_idct_6x6())
+        method_ptr = jsimd_idct_6x6;
+      else
+#endif
       method_ptr = jpeg_idct_6x6;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 7:
       method_ptr = jpeg_idct_7x7;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
 #endif
     case DCTSIZE:
       switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
-	if (jsimd_can_idct_islow())
-	  method_ptr = jsimd_idct_islow;
-	else
-	  method_ptr = jpeg_idct_islow;
-	method = JDCT_ISLOW;
-	break;
+        if (jsimd_can_idct_islow())
+          method_ptr = jsimd_idct_islow;
+        else
+          method_ptr = jpeg_idct_islow;
+        method = JDCT_ISLOW;
+        break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
-	if (jsimd_can_idct_ifast())
-	  method_ptr = jsimd_idct_ifast;
-	else
-	  method_ptr = jpeg_idct_ifast;
-	method = JDCT_IFAST;
-	break;
+        if (jsimd_can_idct_ifast())
+          method_ptr = jsimd_idct_ifast;
+        else
+          method_ptr = jpeg_idct_ifast;
+        method = JDCT_IFAST;
+        break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
-	if (jsimd_can_idct_float())
-	  method_ptr = jsimd_idct_float;
-	else
-	  method_ptr = jpeg_idct_float;
-	method = JDCT_FLOAT;
-	break;
+        if (jsimd_can_idct_float())
+          method_ptr = jsimd_idct_float;
+        else
+          method_ptr = jpeg_idct_float;
+        method = JDCT_FLOAT;
+        break;
 #endif
       default:
-	ERREXIT(cinfo, JERR_NOT_COMPILED);
-	break;
+        ERREXIT(cinfo, JERR_NOT_COMPILED);
+        break;
       }
       break;
+#ifdef IDCT_SCALING_SUPPORTED
     case 9:
       method_ptr = jpeg_idct_9x9;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 10:
       method_ptr = jpeg_idct_10x10;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 11:
       method_ptr = jpeg_idct_11x11;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 12:
+#if defined(__mips__)
+      if (jsimd_can_idct_12x12())
+        method_ptr = jsimd_idct_12x12;
+      else
+#endif
       method_ptr = jpeg_idct_12x12;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 13:
       method_ptr = jpeg_idct_13x13;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 14:
       method_ptr = jpeg_idct_14x14;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 15:
       method_ptr = jpeg_idct_15x15;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 16:
       method_ptr = jpeg_idct_16x16;
-      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
+#endif
     default:
       ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->_DCT_scaled_size);
       break;
@@ -222,81 +236,81 @@
     if (! compptr->component_needed || idct->cur_method[ci] == method)
       continue;
     qtbl = compptr->quant_table;
-    if (qtbl == NULL)		/* happens if no data yet for component */
+    if (qtbl == NULL)           /* happens if no data yet for component */
       continue;
     idct->cur_method[ci] = method;
     switch (method) {
 #ifdef PROVIDE_ISLOW_TABLES
     case JDCT_ISLOW:
       {
-	/* For LL&M IDCT method, multipliers are equal to raw quantization
-	 * coefficients, but are stored as ints to ensure access efficiency.
-	 */
-	ISLOW_MULT_TYPE * ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
-	for (i = 0; i < DCTSIZE2; i++) {
-	  ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
-	}
+        /* For LL&M IDCT method, multipliers are equal to raw quantization
+         * coefficients, but are stored as ints to ensure access efficiency.
+         */
+        ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+        for (i = 0; i < DCTSIZE2; i++) {
+          ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
+        }
       }
       break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
     case JDCT_IFAST:
       {
-	/* For AA&N IDCT method, multipliers are equal to quantization
-	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
-	 *   scalefactor[0] = 1
-	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-	 * For integer operation, the multiplier table is to be scaled by
-	 * IFAST_SCALE_BITS.
-	 */
-	IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
+        /* For AA&N IDCT method, multipliers are equal to quantization
+         * coefficients scaled by scalefactor[row]*scalefactor[col], where
+         *   scalefactor[0] = 1
+         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+         * For integer operation, the multiplier table is to be scaled by
+         * IFAST_SCALE_BITS.
+         */
+        IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
 #define CONST_BITS 14
-	static const INT16 aanscales[DCTSIZE2] = {
-	  /* precomputed values scaled up by 14 bits */
-	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-	  22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
-	  21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
-	  19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
-	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-	  12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
-	   8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
-	   4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
-	};
-	SHIFT_TEMPS
+        static const INT16 aanscales[DCTSIZE2] = {
+          /* precomputed values scaled up by 14 bits */
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
+          21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
+          19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
+           8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
+           4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
+        };
+        SHIFT_TEMPS
 
-	for (i = 0; i < DCTSIZE2; i++) {
-	  ifmtbl[i] = (IFAST_MULT_TYPE)
-	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
-				  (INT32) aanscales[i]),
-		    CONST_BITS-IFAST_SCALE_BITS);
-	}
+        for (i = 0; i < DCTSIZE2; i++) {
+          ifmtbl[i] = (IFAST_MULT_TYPE)
+            DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
+                                  (JLONG) aanscales[i]),
+                    CONST_BITS-IFAST_SCALE_BITS);
+        }
       }
       break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
     case JDCT_FLOAT:
       {
-	/* For float AA&N IDCT method, multipliers are equal to quantization
-	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
-	 *   scalefactor[0] = 1
-	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-	 */
-	FLOAT_MULT_TYPE * fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
-	int row, col;
-	static const double aanscalefactor[DCTSIZE] = {
-	  1.0, 1.387039845, 1.306562965, 1.175875602,
-	  1.0, 0.785694958, 0.541196100, 0.275899379
-	};
+        /* For float AA&N IDCT method, multipliers are equal to quantization
+         * coefficients scaled by scalefactor[row]*scalefactor[col], where
+         *   scalefactor[0] = 1
+         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+         */
+        FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
+        int row, col;
+        static const double aanscalefactor[DCTSIZE] = {
+          1.0, 1.387039845, 1.306562965, 1.175875602,
+          1.0, 0.785694958, 0.541196100, 0.275899379
+        };
 
-	i = 0;
-	for (row = 0; row < DCTSIZE; row++) {
-	  for (col = 0; col < DCTSIZE; col++) {
-	    fmtbl[i] = (FLOAT_MULT_TYPE)
-	      ((double) qtbl->quantval[i] *
-	       aanscalefactor[row] * aanscalefactor[col]);
-	    i++;
-	  }
-	}
+        i = 0;
+        for (row = 0; row < DCTSIZE; row++) {
+          for (col = 0; col < DCTSIZE; col++) {
+            fmtbl[i] = (FLOAT_MULT_TYPE)
+              ((double) qtbl->quantval[i] *
+               aanscalefactor[row] * aanscalefactor[col]);
+            i++;
+          }
+        }
       }
       break;
 #endif
@@ -321,7 +335,7 @@
 
   idct = (my_idct_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_idct_controller));
+                                sizeof(my_idct_controller));
   cinfo->idct = (struct jpeg_inverse_dct *) idct;
   idct->pub.start_pass = start_pass;
 
@@ -330,8 +344,8 @@
     /* Allocate and pre-zero a multiplier table for each component */
     compptr->dct_table =
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(multiplier_table));
-    MEMZERO(compptr->dct_table, SIZEOF(multiplier_table));
+                                  sizeof(multiplier_table));
+    MEMZERO(compptr->dct_table, sizeof(multiplier_table));
     /* Mark multiplier table not yet set up for any method */
     idct->cur_method[ci] = -1;
   }
diff --git a/jdhuff.c b/jdhuff.c
index e98dc52..3c6fdaf 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2015, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy decoding routines.
  *
@@ -19,8 +20,9 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdhuff.h"		/* Declarations shared with jdphuff.c */
+#include "jdhuff.h"             /* Declarations shared with jdphuff.c */
 #include "jpegcomp.h"
+#include "jstdhuff.c"
 
 
 /*
@@ -44,10 +46,10 @@
 #else
 #if MAX_COMPS_IN_SCAN == 4
 #define ASSIGN_STATE(dest,src)  \
-	((dest).last_dc_val[0] = (src).last_dc_val[0], \
-	 (dest).last_dc_val[1] = (src).last_dc_val[1], \
-	 (dest).last_dc_val[2] = (src).last_dc_val[2], \
-	 (dest).last_dc_val[3] = (src).last_dc_val[3])
+        ((dest).last_dc_val[0] = (src).last_dc_val[0], \
+         (dest).last_dc_val[1] = (src).last_dc_val[1], \
+         (dest).last_dc_val[2] = (src).last_dc_val[2], \
+         (dest).last_dc_val[3] = (src).last_dc_val[3])
 #endif
 #endif
 
@@ -58,27 +60,27 @@
   /* These fields are loaded into local variables at start of each MCU.
    * In case of suspension, we exit WITHOUT updating them.
    */
-  bitread_perm_state bitstate;	/* Bit buffer at start of MCU */
-  savable_state saved;		/* Other state at start of MCU */
+  bitread_perm_state bitstate;  /* Bit buffer at start of MCU */
+  savable_state saved;          /* Other state at start of MCU */
 
   /* These fields are NOT loaded into local working state. */
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  unsigned int restarts_to_go;  /* MCUs left in this restart interval */
 
   /* Pointers to derived tables (these workspaces have image lifespan) */
-  d_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
-  d_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
+  d_derived_tbl *dc_derived_tbls[NUM_HUFF_TBLS];
+  d_derived_tbl *ac_derived_tbls[NUM_HUFF_TBLS];
 
   /* Precalculated info set up by start_pass for use in decode_mcu: */
 
   /* Pointers to derived tables to be used for each block within an MCU */
-  d_derived_tbl * dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
-  d_derived_tbl * ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
+  d_derived_tbl *dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
+  d_derived_tbl *ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
   /* Whether we care about the DC and AC coefficient values for each block */
   boolean dc_needed[D_MAX_BLOCKS_IN_MCU];
   boolean ac_needed[D_MAX_BLOCKS_IN_MCU];
 } huff_entropy_decoder;
 
-typedef huff_entropy_decoder * huff_entropy_ptr;
+typedef huff_entropy_decoder *huff_entropy_ptr;
 
 
 /*
@@ -90,7 +92,8 @@
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int ci, blkn, dctbl, actbl;
-  jpeg_component_info * compptr;
+  d_derived_tbl **pdtbl;
+  jpeg_component_info *compptr;
 
   /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
    * This ought to be an error condition, but we make it a warning because
@@ -106,10 +109,10 @@
     actbl = compptr->ac_tbl_no;
     /* Compute derived values for Huffman tables */
     /* We may do this more than once for a table, but it's not expensive */
-    jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl,
-			    & entropy->dc_derived_tbls[dctbl]);
-    jpeg_make_d_derived_tbl(cinfo, FALSE, actbl,
-			    & entropy->ac_derived_tbls[actbl]);
+    pdtbl = entropy->dc_derived_tbls + dctbl;
+    jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl, pdtbl);
+    pdtbl = entropy->ac_derived_tbls + actbl;
+    jpeg_make_d_derived_tbl(cinfo, FALSE, actbl, pdtbl);
     /* Initialize DC predictions to 0 */
     entropy->saved.last_dc_val[ci] = 0;
   }
@@ -150,7 +153,7 @@
 
 GLOBAL(void)
 jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
-			 d_derived_tbl ** pdtbl)
+                         d_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   d_derived_tbl *dtbl;
@@ -176,26 +179,26 @@
   if (*pdtbl == NULL)
     *pdtbl = (d_derived_tbl *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(d_derived_tbl));
+                                  sizeof(d_derived_tbl));
   dtbl = *pdtbl;
-  dtbl->pub = htbl;		/* fill in back link */
-  
+  dtbl->pub = htbl;             /* fill in back link */
+
   /* Figure C.1: make table of Huffman code length for each symbol */
 
   p = 0;
   for (l = 1; l <= 16; l++) {
     i = (int) htbl->bits[l];
-    if (i < 0 || p + i > 256)	/* protect against table overrun */
+    if (i < 0 || p + i > 256)   /* protect against table overrun */
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     while (i--)
       huffsize[p++] = (char) l;
   }
   huffsize[p] = 0;
   numsymbols = p;
-  
+
   /* Figure C.2: generate the codes themselves */
   /* We also validate that the counts represent a legal Huffman code tree. */
-  
+
   code = 0;
   si = huffsize[0];
   p = 0;
@@ -207,7 +210,7 @@
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((INT32) code) >= (((INT32) 1) << si))
+    if (((JLONG) code) >= (((JLONG) 1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
@@ -221,11 +224,11 @@
       /* valoffset[l] = huffval[] index of 1st symbol of code length l,
        * minus the minimum code of length l
        */
-      dtbl->valoffset[l] = (INT32) p - (INT32) huffcode[p];
+      dtbl->valoffset[l] = (JLONG) p - (JLONG) huffcode[p];
       p += htbl->bits[l];
       dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */
     } else {
-      dtbl->maxcode[l] = -1;	/* -1 if no codes of this length */
+      dtbl->maxcode[l] = -1;    /* -1 if no codes of this length */
     }
   }
   dtbl->valoffset[17] = 0;
@@ -248,8 +251,8 @@
       /* Generate left-justified code followed by all possible bit sequences */
       lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
       for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
-	dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
-	lookbits++;
+        dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
+        lookbits++;
       }
     }
   }
@@ -264,7 +267,7 @@
     for (i = 0; i < numsymbols; i++) {
       int sym = htbl->huffval[i];
       if (sym < 0 || sym > 15)
-	ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+        ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     }
   }
 }
@@ -286,20 +289,20 @@
  */
 
 #ifdef SLOW_SHIFT_32
-#define MIN_GET_BITS  15	/* minimum allowable value */
+#define MIN_GET_BITS  15        /* minimum allowable value */
 #else
 #define MIN_GET_BITS  (BIT_BUF_SIZE-7)
 #endif
 
 
 GLOBAL(boolean)
-jpeg_fill_bit_buffer (bitread_working_state * state,
-		      register bit_buf_type get_buffer, register int bits_left,
-		      int nbits)
+jpeg_fill_bit_buffer (bitread_working_state *state,
+                      register bit_buf_type get_buffer, register int bits_left,
+                      int nbits)
 /* Load up the bit buffer to a depth of at least nbits */
 {
   /* Copy heavily used state fields into locals (hopefully registers) */
-  register const JOCTET * next_input_byte = state->next_input_byte;
+  register const JOCTET *next_input_byte = state->next_input_byte;
   register size_t bytes_in_buffer = state->bytes_in_buffer;
   j_decompress_ptr cinfo = state->cinfo;
 
@@ -307,54 +310,54 @@
   /* (It is assumed that no request will be for more than that many bits.) */
   /* We fail to do so only if we hit a marker or are forced to suspend. */
 
-  if (cinfo->unread_marker == 0) {	/* cannot advance past a marker */
+  if (cinfo->unread_marker == 0) {      /* cannot advance past a marker */
     while (bits_left < MIN_GET_BITS) {
       register int c;
 
       /* Attempt to read a byte */
       if (bytes_in_buffer == 0) {
-	if (! (*cinfo->src->fill_input_buffer) (cinfo))
-	  return FALSE;
-	next_input_byte = cinfo->src->next_input_byte;
-	bytes_in_buffer = cinfo->src->bytes_in_buffer;
+        if (! (*cinfo->src->fill_input_buffer) (cinfo))
+          return FALSE;
+        next_input_byte = cinfo->src->next_input_byte;
+        bytes_in_buffer = cinfo->src->bytes_in_buffer;
       }
       bytes_in_buffer--;
       c = GETJOCTET(*next_input_byte++);
 
       /* If it's 0xFF, check and discard stuffed zero byte */
       if (c == 0xFF) {
-	/* Loop here to discard any padding FF's on terminating marker,
-	 * so that we can save a valid unread_marker value.  NOTE: we will
-	 * accept multiple FF's followed by a 0 as meaning a single FF data
-	 * byte.  This data pattern is not valid according to the standard.
-	 */
-	do {
-	  if (bytes_in_buffer == 0) {
-	    if (! (*cinfo->src->fill_input_buffer) (cinfo))
-	      return FALSE;
-	    next_input_byte = cinfo->src->next_input_byte;
-	    bytes_in_buffer = cinfo->src->bytes_in_buffer;
-	  }
-	  bytes_in_buffer--;
-	  c = GETJOCTET(*next_input_byte++);
-	} while (c == 0xFF);
+        /* Loop here to discard any padding FF's on terminating marker,
+         * so that we can save a valid unread_marker value.  NOTE: we will
+         * accept multiple FF's followed by a 0 as meaning a single FF data
+         * byte.  This data pattern is not valid according to the standard.
+         */
+        do {
+          if (bytes_in_buffer == 0) {
+            if (! (*cinfo->src->fill_input_buffer) (cinfo))
+              return FALSE;
+            next_input_byte = cinfo->src->next_input_byte;
+            bytes_in_buffer = cinfo->src->bytes_in_buffer;
+          }
+          bytes_in_buffer--;
+          c = GETJOCTET(*next_input_byte++);
+        } while (c == 0xFF);
 
-	if (c == 0) {
-	  /* Found FF/00, which represents an FF data byte */
-	  c = 0xFF;
-	} else {
-	  /* Oops, it's actually a marker indicating end of compressed data.
-	   * Save the marker code for later use.
-	   * Fine point: it might appear that we should save the marker into
-	   * bitread working state, not straight into permanent state.  But
-	   * once we have hit a marker, we cannot need to suspend within the
-	   * current MCU, because we will read no more bytes from the data
-	   * source.  So it is OK to update permanent state right away.
-	   */
-	  cinfo->unread_marker = c;
-	  /* See if we need to insert some fake zero bits. */
-	  goto no_more_bytes;
-	}
+        if (c == 0) {
+          /* Found FF/00, which represents an FF data byte */
+          c = 0xFF;
+        } else {
+          /* Oops, it's actually a marker indicating end of compressed data.
+           * Save the marker code for later use.
+           * Fine point: it might appear that we should save the marker into
+           * bitread working state, not straight into permanent state.  But
+           * once we have hit a marker, we cannot need to suspend within the
+           * current MCU, because we will read no more bytes from the data
+           * source.  So it is OK to update permanent state right away.
+           */
+          cinfo->unread_marker = c;
+          /* See if we need to insert some fake zero bits. */
+          goto no_more_bytes;
+        }
       }
 
       /* OK, load c into get_buffer */
@@ -374,8 +377,8 @@
        * appears per data segment.
        */
       if (! cinfo->entropy->insufficient_data) {
-	WARNMS(cinfo, JWRN_HIT_MARKER);
-	cinfo->entropy->insufficient_data = TRUE;
+        WARNMS(cinfo, JWRN_HIT_MARKER);
+        cinfo->entropy->insufficient_data = TRUE;
       }
       /* Fill the buffer with zero bits */
       get_buffer <<= MIN_GET_BITS - bits_left;
@@ -418,11 +421,11 @@
   } \
 }
 
-#if __WORDSIZE == 64 || defined(_WIN64)
+#if SIZEOF_SIZE_T==8 || defined(_WIN64)
 
 /* Pre-fetch 48 bytes, because the holding register is 64-bit */
 #define FILL_BIT_BUFFER_FAST \
-  if (bits_left < 16) { \
+  if (bits_left <= 16) { \
     GET_BYTE GET_BYTE GET_BYTE GET_BYTE GET_BYTE GET_BYTE \
   }
 
@@ -430,7 +433,7 @@
 
 /* Pre-fetch 16 bytes, because the holding register is 32-bit */
 #define FILL_BIT_BUFFER_FAST \
-  if (bits_left < 16) { \
+  if (bits_left <= 16) { \
     GET_BYTE GET_BYTE \
   }
 
@@ -443,12 +446,12 @@
  */
 
 GLOBAL(int)
-jpeg_huff_decode (bitread_working_state * state,
-		  register bit_buf_type get_buffer, register int bits_left,
-		  d_derived_tbl * htbl, int min_bits)
+jpeg_huff_decode (bitread_working_state *state,
+                  register bit_buf_type get_buffer, register int bits_left,
+                  d_derived_tbl *htbl, int min_bits)
 {
   register int l = min_bits;
-  register INT32 code;
+  register JLONG code;
 
   /* HUFF_DECODE has determined that the code is at least min_bits */
   /* bits long, so fetch that many bits in one swoop. */
@@ -474,7 +477,7 @@
 
   if (l > 16) {
     WARNMS(state->cinfo, JWRN_HUFF_BAD_CODE);
-    return 0;			/* fake a zero as the safest result */
+    return 0;                   /* fake a zero as the safest result */
   }
 
   return htbl->pub->huffval[ (int) (code + htbl->valoffset[l]) ];
@@ -489,7 +492,8 @@
 #define AVOID_TABLES
 #ifdef AVOID_TABLES
 
-#define HUFF_EXTEND(x,s)  ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((-1)<<(s)) + 1)))
+#define NEG_1 ((unsigned int)-1)
+#define HUFF_EXTEND(x,s)  ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((NEG_1)<<(s)) + 1)))
 
 #else
 
@@ -562,8 +566,8 @@
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
-    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
     register int s, k, r;
 
     /* Decode a single block's worth of coefficients */
@@ -596,7 +600,7 @@
 
         r = s >> 4;
         s &= 15;
-      
+
         if (s) {
           k += r;
           CHECK_BIT_BUFFER(br_state, s, return FALSE);
@@ -661,8 +665,8 @@
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
-    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
     register int s, k, r, l;
 
     HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu);
@@ -686,7 +690,7 @@
         HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
         r = s >> 4;
         s &= 15;
-      
+
         if (s) {
           k += r;
           FILL_BIT_BUFFER_FAST
@@ -747,7 +751,7 @@
  * this module, since we'll just re-assign them on the next call.)
  */
 
-#define BUFSIZE (DCTSIZE2 * 2u)
+#define BUFSIZE (DCTSIZE2 * 8)
 
 METHODDEF(boolean)
 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
@@ -759,7 +763,7 @@
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
       if (! process_restart(cinfo))
-	return FALSE;
+        return FALSE;
     usefast = 0;
   }
 
@@ -799,9 +803,15 @@
   huff_entropy_ptr entropy;
   int i;
 
+  /* Motion JPEG frames typically do not include the Huffman tables if they
+     are the default tables.  Thus, if the tables are not set by the time
+     the Huffman decoder is initialized (usually within the body of
+     jpeg_start_decompress()), we set them to default values. */
+  std_huff_tables((j_common_ptr) cinfo);
+
   entropy = (huff_entropy_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(huff_entropy_decoder));
+                                sizeof(huff_entropy_decoder));
   cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
   entropy->pub.start_pass = start_pass_huff_decoder;
   entropy->pub.decode_mcu = decode_mcu;
diff --git a/jdhuff.h b/jdhuff.h
index 027177b..3f15d71 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -3,33 +3,28 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modifications:
- * Copyright (C) 2010-2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains declarations for Huffman entropy decoding routines
  * that are shared between the sequential decoder (jdhuff.c) and the
  * progressive decoder (jdphuff.c).  No other modules need to see these.
  */
 
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_d_derived_tbl	jMkDDerived
-#define jpeg_fill_bit_buffer	jFilBitBuf
-#define jpeg_huff_decode	jHufDecode
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
+#include "jconfigint.h"
 
 
 /* Derived data constructed for each Huffman table */
 
-#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
+#define HUFF_LOOKAHEAD  8       /* # of bits of lookahead */
 
 typedef struct {
   /* Basic tables: (element [0] of each array is unused) */
-  INT32 maxcode[18];		/* largest code of length k (-1 if none) */
+  JLONG maxcode[18];            /* largest code of length k (-1 if none) */
   /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
-  INT32 valoffset[18];		/* huffval[] offset for codes of length k */
+  JLONG valoffset[18];          /* huffval[] offset for codes of length k */
   /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
    * the smallest code of length k; so given a code of length k, the
    * corresponding symbol is huffval[code + valoffset[k]]
@@ -53,8 +48,8 @@
 
 /* Expand a Huffman table definition into the derived format */
 EXTERN(void) jpeg_make_d_derived_tbl
-	JPP((j_decompress_ptr cinfo, boolean isDC, int tblno,
-	     d_derived_tbl ** pdtbl));
+        (j_decompress_ptr cinfo, boolean isDC, int tblno,
+         d_derived_tbl ** pdtbl);
 
 
 /*
@@ -75,15 +70,19 @@
  * necessary.
  */
 
-#if __WORDSIZE == 64 || defined(_WIN64)
+#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
+#error Cannot determine word size
+#endif
 
-typedef size_t bit_buf_type;	/* type of bit-extraction buffer */
-#define BIT_BUF_SIZE  64		/* size of buffer in bits */
+#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+
+typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64                /* size of buffer in bits */
 
 #else
 
-typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
-#define BIT_BUF_SIZE  32		/* size of buffer in bits */
+typedef unsigned long bit_buf_type;     /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  32                /* size of buffer in bits */
 
 #endif
 
@@ -94,43 +93,43 @@
  * because not all machines measure sizeof in 8-bit bytes.
  */
 
-typedef struct {		/* Bitreading state saved across MCUs */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
+typedef struct {                /* Bitreading state saved across MCUs */
+  bit_buf_type get_buffer;      /* current bit-extraction buffer */
+  int bits_left;                /* # of unused bits in it */
 } bitread_perm_state;
 
-typedef struct {		/* Bitreading working state within an MCU */
+typedef struct {                /* Bitreading working state within an MCU */
   /* Current data source location */
   /* We need a copy, rather than munging the original, in case of suspension */
-  const JOCTET * next_input_byte; /* => next byte to read from source */
-  size_t bytes_in_buffer;	/* # of bytes remaining in source buffer */
+  const JOCTET *next_input_byte; /* => next byte to read from source */
+  size_t bytes_in_buffer;       /* # of bytes remaining in source buffer */
   /* Bit input buffer --- note these values are kept in register variables,
    * not in this struct, inside the inner loops.
    */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
+  bit_buf_type get_buffer;      /* current bit-extraction buffer */
+  int bits_left;                /* # of unused bits in it */
   /* Pointer needed by jpeg_fill_bit_buffer. */
-  j_decompress_ptr cinfo;	/* back link to decompress master record */
+  j_decompress_ptr cinfo;       /* back link to decompress master record */
 } bitread_working_state;
 
 /* Macros to declare and load/save bitread local variables. */
 #define BITREAD_STATE_VARS  \
-	register bit_buf_type get_buffer;  \
-	register int bits_left;  \
-	bitread_working_state br_state
+        register bit_buf_type get_buffer;  \
+        register int bits_left;  \
+        bitread_working_state br_state
 
 #define BITREAD_LOAD_STATE(cinfop,permstate)  \
-	br_state.cinfo = cinfop; \
-	br_state.next_input_byte = cinfop->src->next_input_byte; \
-	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
-	get_buffer = permstate.get_buffer; \
-	bits_left = permstate.bits_left;
+        br_state.cinfo = cinfop; \
+        br_state.next_input_byte = cinfop->src->next_input_byte; \
+        br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+        get_buffer = permstate.get_buffer; \
+        bits_left = permstate.bits_left;
 
 #define BITREAD_SAVE_STATE(cinfop,permstate)  \
-	cinfop->src->next_input_byte = br_state.next_input_byte; \
-	cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
-	permstate.get_buffer = get_buffer; \
-	permstate.bits_left = bits_left
+        cinfop->src->next_input_byte = br_state.next_input_byte; \
+        cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+        permstate.get_buffer = get_buffer; \
+        permstate.bits_left = bits_left
 
 /*
  * These macros provide the in-line portion of bit fetching.
@@ -138,37 +137,37 @@
  * before using GET_BITS, PEEK_BITS, or DROP_BITS.
  * The variables get_buffer and bits_left are assumed to be locals,
  * but the state struct might not be (jpeg_huff_decode needs this).
- *	CHECK_BIT_BUFFER(state,n,action);
- *		Ensure there are N bits in get_buffer; if suspend, take action.
+ *      CHECK_BIT_BUFFER(state,n,action);
+ *              Ensure there are N bits in get_buffer; if suspend, take action.
  *      val = GET_BITS(n);
- *		Fetch next N bits.
+ *              Fetch next N bits.
  *      val = PEEK_BITS(n);
- *		Fetch next N bits without removing them from the buffer.
- *	DROP_BITS(n);
- *		Discard next N bits.
+ *              Fetch next N bits without removing them from the buffer.
+ *      DROP_BITS(n);
+ *              Discard next N bits.
  * The value N should be a simple variable, not an expression, because it
  * is evaluated multiple times.
  */
 
 #define CHECK_BIT_BUFFER(state,nbits,action) \
-	{ if (bits_left < (nbits)) {  \
-	    if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
-	      { action; }  \
-	    get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+        { if (bits_left < (nbits)) {  \
+            if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
+              { action; }  \
+            get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
 
 #define GET_BITS(nbits) \
-	(((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
+        (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
 
 #define PEEK_BITS(nbits) \
-	(((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
+        (((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
 
 #define DROP_BITS(nbits) \
-	(bits_left -= (nbits))
+        (bits_left -= (nbits))
 
 /* Load up the bit buffer to a depth of at least nbits */
 EXTERN(boolean) jpeg_fill_bit_buffer
-	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-	     register int bits_left, int nbits));
+        (bitread_working_state *state, register bit_buf_type get_buffer,
+         register int bits_left, int nbits);
 
 
 /*
@@ -204,7 +203,7 @@
   } else { \
 slowlabel: \
     if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-	{ failaction; } \
+        { failaction; } \
     get_buffer = state.get_buffer; bits_left = state.bits_left; \
   } \
 }
@@ -233,5 +232,5 @@
 
 /* Out-of-line case for Huffman code fetching */
 EXTERN(int) jpeg_huff_decode
-	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-	     register int bits_left, d_derived_tbl * htbl, int min_bits));
+        (bitread_working_state *state, register bit_buf_type get_buffer,
+         register int bits_left, d_derived_tbl *htbl, int min_bits);
diff --git a/jdinput.c b/jdinput.c
index e7ba33f..32a6b42 100644
--- a/jdinput.c
+++ b/jdinput.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains input control logic for the JPEG decompressor.
  * These routines are concerned with controlling the decompressor's input
@@ -24,14 +26,14 @@
 typedef struct {
   struct jpeg_input_controller pub; /* public fields */
 
-  boolean inheaders;		/* TRUE until first SOS is reached */
+  boolean inheaders;            /* TRUE until first SOS is reached */
 } my_input_controller;
 
-typedef my_input_controller * my_inputctl_ptr;
+typedef my_input_controller *my_inputctl_ptr;
 
 
 /* Forward declarations */
-METHODDEF(int) consume_markers JPP((j_decompress_ptr cinfo));
+METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
 
 
 /*
@@ -57,7 +59,7 @@
   /* Check that number of components won't exceed internal array sizes */
   if (cinfo->num_components > MAX_COMPONENTS)
     ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
-	     MAX_COMPONENTS);
+             MAX_COMPONENTS);
 
   /* Compute maximum sampling factors; check factor validity */
   cinfo->max_h_samp_factor = 1;
@@ -65,12 +67,12 @@
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
-	compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+        compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
       ERREXIT(cinfo, JERR_BAD_SAMPLING);
     cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
-				   compptr->h_samp_factor);
+                                   compptr->h_samp_factor);
     cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor,
-				   compptr->v_samp_factor);
+                                   compptr->v_samp_factor);
   }
 
 #if JPEG_LIB_VERSION >=80
@@ -100,10 +102,15 @@
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+    /* Set the first and last MCU columns to decompress from multi-scan images.
+     * By default, decompress all of the MCU columns.
+     */
+    cinfo->master->first_MCU_col[ci] = 0;
+    cinfo->master->last_MCU_col[ci] = compptr->width_in_blocks - 1;
     /* downsampled_width and downsampled_height will also be overridden by
      * jdmaster.c if we are doing full decompression.  The transcoder library
      * doesn't use these values, but the calling application might.
@@ -111,10 +118,10 @@
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) cinfo->max_h_samp_factor);
+                    (long) cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) cinfo->max_v_samp_factor);
+                    (long) cinfo->max_v_samp_factor);
     /* Mark component needed, until color conversion says otherwise */
     compptr->component_needed = TRUE;
     /* Mark no quantization table yet saved for component */
@@ -124,7 +131,7 @@
   /* Compute number of fully interleaved MCU rows. */
   cinfo->total_iMCU_rows = (JDIMENSION)
     jdiv_round_up((long) cinfo->image_height,
-		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+                  (long) (cinfo->max_v_samp_factor*DCTSIZE));
 
   /* Decide whether file contains multiple scans */
   if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -141,16 +148,16 @@
 {
   int ci, mcublks, tmp;
   jpeg_component_info *compptr;
-  
+
   if (cinfo->comps_in_scan == 1) {
-    
+
     /* Noninterleaved (single-component) scan */
     compptr = cinfo->cur_comp_info[0];
-    
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = compptr->width_in_blocks;
     cinfo->MCU_rows_in_scan = compptr->height_in_blocks;
-    
+
     /* For noninterleaved scan, always one block per MCU */
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
@@ -163,28 +170,28 @@
     tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
-    
+
     /* Prepare array describing MCU composition */
     cinfo->blocks_in_MCU = 1;
     cinfo->MCU_membership[0] = 0;
-    
+
   } else {
-    
+
     /* Interleaved (multi-component) scan */
     if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan,
-	       MAX_COMPS_IN_SCAN);
-    
+               MAX_COMPS_IN_SCAN);
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width,
-		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor*DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height,
-		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
-    
+                    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+
     cinfo->blocks_in_MCU = 0;
-    
+
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       compptr = cinfo->cur_comp_info[ci];
       /* Sampling factors give # of blocks of component in each MCU */
@@ -202,12 +209,12 @@
       /* Prepare array describing MCU composition */
       mcublks = compptr->MCU_blocks;
       if (cinfo->blocks_in_MCU + mcublks > D_MAX_BLOCKS_IN_MCU)
-	ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
+        ERREXIT(cinfo, JERR_BAD_MCU_SIZE);
       while (mcublks-- > 0) {
-	cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
+        cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
       }
     }
-    
+
   }
 }
 
@@ -238,7 +245,7 @@
 {
   int ci, qtblno;
   jpeg_component_info *compptr;
-  JQUANT_TBL * qtbl;
+  JQUANT_TBL *qtbl;
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
@@ -248,13 +255,13 @@
     /* Make sure specified quantization table is present */
     qtblno = compptr->quant_tbl_no;
     if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
-	cinfo->quant_tbl_ptrs[qtblno] == NULL)
+        cinfo->quant_tbl_ptrs[qtblno] == NULL)
       ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
     /* OK, save away the quantization table */
     qtbl = (JQUANT_TBL *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(JQUANT_TBL));
-    MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], SIZEOF(JQUANT_TBL));
+                                  sizeof(JQUANT_TBL));
+    MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
     compptr->quant_table = qtbl;
   }
 }
@@ -313,31 +320,31 @@
   val = (*cinfo->marker->read_markers) (cinfo);
 
   switch (val) {
-  case JPEG_REACHED_SOS:	/* Found SOS */
-    if (inputctl->inheaders) {	/* 1st SOS */
+  case JPEG_REACHED_SOS:        /* Found SOS */
+    if (inputctl->inheaders) {  /* 1st SOS */
       initial_setup(cinfo);
       inputctl->inheaders = FALSE;
       /* Note: start_input_pass must be called by jdmaster.c
        * before any more input can be consumed.  jdapimin.c is
        * responsible for enforcing this sequencing.
        */
-    } else {			/* 2nd or later SOS marker */
+    } else {                    /* 2nd or later SOS marker */
       if (! inputctl->pub.has_multiple_scans)
-	ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
+        ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
       start_input_pass(cinfo);
     }
     break;
-  case JPEG_REACHED_EOI:	/* Found EOI */
+  case JPEG_REACHED_EOI:        /* Found EOI */
     inputctl->pub.eoi_reached = TRUE;
-    if (inputctl->inheaders) {	/* Tables-only datastream, apparently */
+    if (inputctl->inheaders) {  /* Tables-only datastream, apparently */
       if (cinfo->marker->saw_SOF)
-	ERREXIT(cinfo, JERR_SOF_NO_SOS);
+        ERREXIT(cinfo, JERR_SOF_NO_SOS);
     } else {
       /* Prevent infinite loop in coef ctlr's decompress_data routine
        * if user set output_scan_number larger than number of scans.
        */
       if (cinfo->output_scan_number > cinfo->input_scan_number)
-	cinfo->output_scan_number = cinfo->input_scan_number;
+        cinfo->output_scan_number = cinfo->input_scan_number;
     }
     break;
   case JPEG_SUSPENDED:
@@ -382,7 +389,7 @@
   /* Create subobject in permanent pool */
   inputctl = (my_inputctl_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				SIZEOF(my_input_controller));
+                                sizeof(my_input_controller));
   cinfo->inputctl = (struct jpeg_input_controller *) inputctl;
   /* Initialize method pointers */
   inputctl->pub.consume_input = consume_markers;
diff --git a/jdmainct.c b/jdmainct.c
index 6bb8c2b..ebb069b 100644
--- a/jdmainct.c
+++ b/jdmainct.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the main buffer controller for decompression.
  * The main buffer lies between the JPEG decompressor proper and the
@@ -15,6 +16,7 @@
  * supplies the equivalent of the main buffer in that case.
  */
 
+#include "jinclude.h"
 #include "jdmainct.h"
 
 
@@ -111,15 +113,15 @@
 
 /* Forward declarations */
 METHODDEF(void) process_data_simple_main
-	JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-	     JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail));
+        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
 METHODDEF(void) process_data_context_main
-	JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-	     JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail));
+        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
 #ifdef QUANT_2PASS_SUPPORTED
 METHODDEF(void) process_data_crank_post
-	JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-	     JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail));
+        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
 #endif
 
 
@@ -140,7 +142,7 @@
    */
   main_ptr->xbuffer[0] = (JSAMPIMAGE)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				cinfo->num_components * 2 * SIZEOF(JSAMPARRAY));
+                                cinfo->num_components * 2 * sizeof(JSAMPARRAY));
   main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -152,8 +154,8 @@
      */
     xbuf = (JSAMPARRAY)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW));
-    xbuf += rgroup;		/* want one row group at negative offsets */
+                                  2 * (rgroup * (M + 4)) * sizeof(JSAMPROW));
+    xbuf += rgroup;             /* want one row group at negative offsets */
     main_ptr->xbuffer[0][ci] = xbuf;
     xbuf += rgroup * (M + 4);
     main_ptr->xbuffer[1][ci] = xbuf;
@@ -255,14 +257,14 @@
     if (cinfo->upsample->need_context_rows) {
       main_ptr->pub.process_data = process_data_context_main;
       make_funny_pointers(cinfo); /* Create the xbuffer[] lists */
-      main_ptr->whichptr = 0;	/* Read first iMCU row into xbuffer[0] */
+      main_ptr->whichptr = 0;   /* Read first iMCU row into xbuffer[0] */
       main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
       main_ptr->iMCU_row_ctr = 0;
     } else {
       /* Simple case with no context needed */
       main_ptr->pub.process_data = process_data_simple_main;
     }
-    main_ptr->buffer_full = FALSE;	/* Mark buffer empty */
+    main_ptr->buffer_full = FALSE;      /* Mark buffer empty */
     main_ptr->rowgroup_ctr = 0;
     break;
 #ifdef QUANT_2PASS_SUPPORTED
@@ -285,8 +287,8 @@
 
 METHODDEF(void)
 process_data_simple_main (j_decompress_ptr cinfo,
-			  JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-			  JDIMENSION out_rows_avail)
+                          JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                          JDIMENSION out_rows_avail)
 {
   my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
   JDIMENSION rowgroups_avail;
@@ -294,8 +296,8 @@
   /* Read input data if we haven't filled the main buffer yet */
   if (! main_ptr->buffer_full) {
     if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
-      return;			/* suspension forced, can do nothing more */
-    main_ptr->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
+      return;                   /* suspension forced, can do nothing more */
+    main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
   }
 
   /* There are always min_DCT_scaled_size row groups in an iMCU row. */
@@ -307,8 +309,8 @@
 
   /* Feed the postprocessor */
   (*cinfo->post->post_process_data) (cinfo, main_ptr->buffer,
-				     &main_ptr->rowgroup_ctr, rowgroups_avail,
-				     output_buf, out_row_ctr, out_rows_avail);
+                                     &main_ptr->rowgroup_ctr, rowgroups_avail,
+                                     output_buf, out_row_ctr, out_rows_avail);
 
   /* Has postprocessor consumed all the data yet? If so, mark buffer empty */
   if (main_ptr->rowgroup_ctr >= rowgroups_avail) {
@@ -325,18 +327,18 @@
 
 METHODDEF(void)
 process_data_context_main (j_decompress_ptr cinfo,
-			   JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-			   JDIMENSION out_rows_avail)
+                           JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                           JDIMENSION out_rows_avail)
 {
   my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
 
   /* Read input data if we haven't filled the main buffer yet */
   if (! main_ptr->buffer_full) {
     if (! (*cinfo->coef->decompress_data) (cinfo,
-					   main_ptr->xbuffer[main_ptr->whichptr]))
-      return;			/* suspension forced, can do nothing more */
-    main_ptr->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
-    main_ptr->iMCU_row_ctr++;	/* count rows received */
+                                           main_ptr->xbuffer[main_ptr->whichptr]))
+      return;                   /* suspension forced, can do nothing more */
+    main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
+    main_ptr->iMCU_row_ctr++;   /* count rows received */
   }
 
   /* Postprocessor typically will not swallow all the input data it is handed
@@ -348,13 +350,13 @@
   case CTX_POSTPONED_ROW:
     /* Call postprocessor using previously set pointers for postponed row */
     (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-			&main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
-			output_buf, out_row_ctr, out_rows_avail);
+                        &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
+                        output_buf, out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
-      return;			/* Need to suspend */
+      return;                   /* Need to suspend */
     main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
     if (*out_row_ctr >= out_rows_avail)
-      return;			/* Postprocessor exactly filled output buf */
+      return;                   /* Postprocessor exactly filled output buf */
     /*FALLTHROUGH*/
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
@@ -370,15 +372,15 @@
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
     (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-			&main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
-			output_buf, out_row_ctr, out_rows_avail);
+                        &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
+                        output_buf, out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
-      return;			/* Need to suspend */
+      return;                   /* Need to suspend */
     /* After the first iMCU, change wraparound pointers to normal state */
     if (main_ptr->iMCU_row_ctr == 1)
       set_wraparound_pointers(cinfo);
     /* Prepare to load new iMCU row using other xbuffer list */
-    main_ptr->whichptr ^= 1;	/* 0=>1 or 1=>0 */
+    main_ptr->whichptr ^= 1;    /* 0=>1 or 1=>0 */
     main_ptr->buffer_full = FALSE;
     /* Still need to process last row group of this iMCU row, */
     /* which is saved at index M+1 of the other xbuffer */
@@ -399,12 +401,12 @@
 
 METHODDEF(void)
 process_data_crank_post (j_decompress_ptr cinfo,
-			 JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-			 JDIMENSION out_rows_avail)
+                         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                         JDIMENSION out_rows_avail)
 {
   (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL,
-				     (JDIMENSION *) NULL, (JDIMENSION) 0,
-				     output_buf, out_row_ctr, out_rows_avail);
+                                     (JDIMENSION *) NULL, (JDIMENSION) 0,
+                                     output_buf, out_row_ctr, out_rows_avail);
 }
 
 #endif /* QUANT_2PASS_SUPPORTED */
@@ -423,11 +425,11 @@
 
   main_ptr = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_main_controller));
+                                sizeof(my_main_controller));
   cinfo->main = (struct jpeg_d_main_controller *) main_ptr;
   main_ptr->pub.start_pass = start_pass_main;
 
-  if (need_full_buffer)		/* shouldn't happen */
+  if (need_full_buffer)         /* shouldn't happen */
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 
   /* Allocate the workspace.
@@ -447,8 +449,8 @@
     rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
       cinfo->_min_DCT_scaled_size; /* height of a row group of component */
     main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
-			((j_common_ptr) cinfo, JPOOL_IMAGE,
-			 compptr->width_in_blocks * compptr->_DCT_scaled_size,
-			 (JDIMENSION) (rgroup * ngroups));
+                        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                         compptr->width_in_blocks * compptr->_DCT_scaled_size,
+                         (JDIMENSION) (rgroup * ngroups));
   }
 }
diff --git a/jdmainct.h b/jdmainct.h
index 37ab27d..3090301 100644
--- a/jdmainct.h
+++ b/jdmainct.h
@@ -3,11 +3,11 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  */
 
 #define JPEG_INTERNALS
-#include "jinclude.h"
 #include "jpeglib.h"
 #include "jpegcomp.h"
 
@@ -34,7 +34,7 @@
   JDIMENSION iMCU_row_ctr;      /* counts iMCU rows to detect image top/bot */
 } my_main_controller;
 
-typedef my_main_controller * my_main_ptr;
+typedef my_main_controller *my_main_ptr;
 
 
 /* context_state values: */
diff --git a/jdmarker.c b/jdmarker.c
index c8cf9a4..e3b612c 100644
--- a/jdmarker.c
+++ b/jdmarker.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2012, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2012, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to decode JPEG datastream markers.
  * Most of the complexity arises from our desire to support input
@@ -19,29 +20,29 @@
 #include "jpeglib.h"
 
 
-typedef enum {			/* JPEG marker codes */
+typedef enum {                  /* JPEG marker codes */
   M_SOF0  = 0xc0,
   M_SOF1  = 0xc1,
   M_SOF2  = 0xc2,
   M_SOF3  = 0xc3,
-  
+
   M_SOF5  = 0xc5,
   M_SOF6  = 0xc6,
   M_SOF7  = 0xc7,
-  
+
   M_JPG   = 0xc8,
   M_SOF9  = 0xc9,
   M_SOF10 = 0xca,
   M_SOF11 = 0xcb,
-  
+
   M_SOF13 = 0xcd,
   M_SOF14 = 0xce,
   M_SOF15 = 0xcf,
-  
+
   M_DHT   = 0xc4,
-  
+
   M_DAC   = 0xcc,
-  
+
   M_RST0  = 0xd0,
   M_RST1  = 0xd1,
   M_RST2  = 0xd2,
@@ -50,7 +51,7 @@
   M_RST5  = 0xd5,
   M_RST6  = 0xd6,
   M_RST7  = 0xd7,
-  
+
   M_SOI   = 0xd8,
   M_EOI   = 0xd9,
   M_SOS   = 0xda,
@@ -59,7 +60,7 @@
   M_DRI   = 0xdd,
   M_DHP   = 0xde,
   M_EXP   = 0xdf,
-  
+
   M_APP0  = 0xe0,
   M_APP1  = 0xe1,
   M_APP2  = 0xe2,
@@ -76,13 +77,13 @@
   M_APP13 = 0xed,
   M_APP14 = 0xee,
   M_APP15 = 0xef,
-  
+
   M_JPG0  = 0xf0,
   M_JPG13 = 0xfd,
   M_COM   = 0xfe,
-  
+
   M_TEM   = 0x01,
-  
+
   M_ERROR = 0x100
 } JPEG_MARKER;
 
@@ -101,12 +102,12 @@
   unsigned int length_limit_APPn[16];
 
   /* Status of COM/APPn marker saving */
-  jpeg_saved_marker_ptr cur_marker;	/* NULL if not processing a marker */
-  unsigned int bytes_read;		/* data bytes read so far in marker */
+  jpeg_saved_marker_ptr cur_marker;     /* NULL if not processing a marker */
+  unsigned int bytes_read;              /* data bytes read so far in marker */
   /* Note: cur_marker is not linked into marker_list until it's all read. */
 } my_marker_reader;
 
-typedef my_marker_reader * my_marker_ptr;
+typedef my_marker_reader *my_marker_ptr;
 
 
 /*
@@ -119,49 +120,49 @@
 
 /* Declare and initialize local copies of input pointer/count */
 #define INPUT_VARS(cinfo)  \
-	struct jpeg_source_mgr * datasrc = (cinfo)->src;  \
-	const JOCTET * next_input_byte = datasrc->next_input_byte;  \
-	size_t bytes_in_buffer = datasrc->bytes_in_buffer
+        struct jpeg_source_mgr *datasrc = (cinfo)->src;  \
+        const JOCTET *next_input_byte = datasrc->next_input_byte;  \
+        size_t bytes_in_buffer = datasrc->bytes_in_buffer
 
 /* Unload the local copies --- do this only at a restart boundary */
 #define INPUT_SYNC(cinfo)  \
-	( datasrc->next_input_byte = next_input_byte,  \
-	  datasrc->bytes_in_buffer = bytes_in_buffer )
+        ( datasrc->next_input_byte = next_input_byte,  \
+          datasrc->bytes_in_buffer = bytes_in_buffer )
 
 /* Reload the local copies --- used only in MAKE_BYTE_AVAIL */
 #define INPUT_RELOAD(cinfo)  \
-	( next_input_byte = datasrc->next_input_byte,  \
-	  bytes_in_buffer = datasrc->bytes_in_buffer )
+        ( next_input_byte = datasrc->next_input_byte,  \
+          bytes_in_buffer = datasrc->bytes_in_buffer )
 
 /* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available.
  * Note we do *not* do INPUT_SYNC before calling fill_input_buffer,
  * but we must reload the local copies after a successful fill.
  */
 #define MAKE_BYTE_AVAIL(cinfo,action)  \
-	if (bytes_in_buffer == 0) {  \
-	  if (! (*datasrc->fill_input_buffer) (cinfo))  \
-	    { action; }  \
-	  INPUT_RELOAD(cinfo);  \
-	}
+        if (bytes_in_buffer == 0) {  \
+          if (! (*datasrc->fill_input_buffer) (cinfo))  \
+            { action; }  \
+          INPUT_RELOAD(cinfo);  \
+        }
 
 /* Read a byte into variable V.
  * If must suspend, take the specified action (typically "return FALSE").
  */
 #define INPUT_BYTE(cinfo,V,action)  \
-	MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
-		  bytes_in_buffer--; \
-		  V = GETJOCTET(*next_input_byte++); )
+        MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
+                  bytes_in_buffer--; \
+                  V = GETJOCTET(*next_input_byte++); )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
- * V should be declared unsigned int or perhaps INT32.
+ * V should be declared unsigned int or perhaps JLONG.
  */
 #define INPUT_2BYTES(cinfo,V,action)  \
-	MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
-		  bytes_in_buffer--; \
-		  V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \
-		  MAKE_BYTE_AVAIL(cinfo,action); \
-		  bytes_in_buffer--; \
-		  V += GETJOCTET(*next_input_byte++); )
+        MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
+                  bytes_in_buffer--; \
+                  V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \
+                  MAKE_BYTE_AVAIL(cinfo,action); \
+                  bytes_in_buffer--; \
+                  V += GETJOCTET(*next_input_byte++); )
 
 
 /*
@@ -200,7 +201,7 @@
 /* Process an SOI marker */
 {
   int i;
-  
+
   TRACEMS(cinfo, 1, JTRC_SOI);
 
   if (cinfo->marker->saw_SOI)
@@ -239,9 +240,9 @@
 get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
 /* Process a SOFn marker */
 {
-  INT32 length;
+  JLONG length;
   int c, ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   INPUT_VARS(cinfo);
 
   cinfo->progressive_mode = is_prog;
@@ -257,8 +258,8 @@
   length -= 8;
 
   TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker,
-	   (int) cinfo->image_width, (int) cinfo->image_height,
-	   cinfo->num_components);
+           (int) cinfo->image_width, (int) cinfo->image_height,
+           cinfo->num_components);
 
   if (cinfo->marker->saw_SOF)
     ERREXIT(cinfo, JERR_SOF_DUPLICATE);
@@ -273,11 +274,11 @@
   if (length != (cinfo->num_components * 3))
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
-  if (cinfo->comp_info == NULL)	/* do only once, even if suspend */
+  if (cinfo->comp_info == NULL) /* do only once, even if suspend */
     cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small)
-			((j_common_ptr) cinfo, JPOOL_IMAGE,
-			 cinfo->num_components * SIZEOF(jpeg_component_info));
-  
+                        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                         cinfo->num_components * sizeof(jpeg_component_info));
+
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     compptr->component_index = ci;
@@ -288,8 +289,8 @@
     INPUT_BYTE(cinfo, compptr->quant_tbl_no, return FALSE);
 
     TRACEMS4(cinfo, 1, JTRC_SOF_COMPONENT,
-	     compptr->component_id, compptr->h_samp_factor,
-	     compptr->v_samp_factor, compptr->quant_tbl_no);
+             compptr->component_id, compptr->h_samp_factor,
+             compptr->v_samp_factor, compptr->quant_tbl_no);
   }
 
   cinfo->marker->saw_SOF = TRUE;
@@ -303,9 +304,9 @@
 get_sos (j_decompress_ptr cinfo)
 /* Process a SOS marker */
 {
-  INT32 length;
+  JLONG length;
   int i, ci, n, c, cc, pi;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   INPUT_VARS(cinfo);
 
   if (! cinfo->marker->saw_SOF)
@@ -330,12 +331,12 @@
   for (i = 0; i < n; i++) {
     INPUT_BYTE(cinfo, cc, return FALSE);
     INPUT_BYTE(cinfo, c, return FALSE);
-    
+
     for (ci = 0, compptr = cinfo->comp_info;
-	 ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN;
-	 ci++, compptr++) {
+         ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN;
+         ci++, compptr++) {
       if (cc == compptr->component_id && !cinfo->cur_comp_info[ci])
-	goto id_found;
+        goto id_found;
     }
 
     ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc);
@@ -345,9 +346,9 @@
     cinfo->cur_comp_info[i] = compptr;
     compptr->dc_tbl_no = (c >> 4) & 15;
     compptr->ac_tbl_no = (c     ) & 15;
-    
+
     TRACEMS3(cinfo, 1, JTRC_SOS_COMPONENT, cc,
-	     compptr->dc_tbl_no, compptr->ac_tbl_no);
+             compptr->dc_tbl_no, compptr->ac_tbl_no);
 
     /* This CSi (cc) should differ from the previous CSi */
     for (pi = 0; pi < i; pi++) {
@@ -367,7 +368,7 @@
   cinfo->Al = (c     ) & 15;
 
   TRACEMS4(cinfo, 1, JTRC_SOS_PARAMS, cinfo->Ss, cinfo->Se,
-	   cinfo->Ah, cinfo->Al);
+           cinfo->Ah, cinfo->Al);
 
   /* Prepare to scan data & restart markers */
   cinfo->marker->next_restart_num = 0;
@@ -386,13 +387,13 @@
 get_dac (j_decompress_ptr cinfo)
 /* Process a DAC marker */
 {
-  INT32 length;
+  JLONG length;
   int index, val;
   INPUT_VARS(cinfo);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
   length -= 2;
-  
+
   while (length > 0) {
     INPUT_BYTE(cinfo, index, return FALSE);
     INPUT_BYTE(cinfo, val, return FALSE);
@@ -406,11 +407,11 @@
 
     if (index >= NUM_ARITH_TBLS) { /* define AC table */
       cinfo->arith_ac_K[index-NUM_ARITH_TBLS] = (UINT8) val;
-    } else {			/* define DC table */
+    } else {                    /* define DC table */
       cinfo->arith_dc_L[index] = (UINT8) (val & 0x0F);
       cinfo->arith_dc_U[index] = (UINT8) (val >> 4);
       if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index])
-	ERREXIT1(cinfo, JERR_DAC_VALUE, val);
+        ERREXIT1(cinfo, JERR_DAC_VALUE, val);
     }
   }
 
@@ -432,7 +433,7 @@
 get_dht (j_decompress_ptr cinfo)
 /* Process a DHT marker */
 {
-  INT32 length;
+  JLONG length;
   UINT8 bits[17];
   UINT8 huffval[256];
   int i, index, count;
@@ -441,12 +442,12 @@
 
   INPUT_2BYTES(cinfo, length, return FALSE);
   length -= 2;
-  
+
   while (length > 16) {
     INPUT_BYTE(cinfo, index, return FALSE);
 
     TRACEMS1(cinfo, 1, JTRC_DHT, index);
-      
+
     bits[0] = 0;
     count = 0;
     for (i = 1; i <= 16; i++) {
@@ -457,31 +458,31 @@
     length -= 1 + 16;
 
     TRACEMS8(cinfo, 2, JTRC_HUFFBITS,
-	     bits[1], bits[2], bits[3], bits[4],
-	     bits[5], bits[6], bits[7], bits[8]);
+             bits[1], bits[2], bits[3], bits[4],
+             bits[5], bits[6], bits[7], bits[8]);
     TRACEMS8(cinfo, 2, JTRC_HUFFBITS,
-	     bits[9], bits[10], bits[11], bits[12],
-	     bits[13], bits[14], bits[15], bits[16]);
+             bits[9], bits[10], bits[11], bits[12],
+             bits[13], bits[14], bits[15], bits[16]);
 
     /* Here we just do minimal validation of the counts to avoid walking
      * off the end of our table space.  jdhuff.c will check more carefully.
      */
-    if (count > 256 || ((INT32) count) > length)
+    if (count > 256 || ((JLONG) count) > length)
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
 
     for (i = 0; i < count; i++)
       INPUT_BYTE(cinfo, huffval[i], return FALSE);
 
-    MEMZERO(&huffval[count], (256 - count) * SIZEOF(UINT8));
+    MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8));
 
     length -= count;
 
-    if (index & 0x10) {		/* AC table definition */
+    if (index & 0x10) {         /* AC table definition */
       index -= 0x10;
       if (index < 0 || index >= NUM_HUFF_TBLS)
         ERREXIT1(cinfo, JERR_DHT_INDEX, index);
       htblptr = &cinfo->ac_huff_tbl_ptrs[index];
-    } else {			/* DC table definition */
+    } else {                    /* DC table definition */
       if (index < 0 || index >= NUM_HUFF_TBLS)
         ERREXIT1(cinfo, JERR_DHT_INDEX, index);
       htblptr = &cinfo->dc_huff_tbl_ptrs[index];
@@ -489,9 +490,9 @@
 
     if (*htblptr == NULL)
       *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-  
-    MEMCOPY((*htblptr)->bits, bits, SIZEOF((*htblptr)->bits));
-    MEMCOPY((*htblptr)->huffval, huffval, SIZEOF((*htblptr)->huffval));
+
+    MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+    MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
   }
 
   if (length != 0)
@@ -506,7 +507,7 @@
 get_dqt (j_decompress_ptr cinfo)
 /* Process a DQT marker */
 {
-  INT32 length;
+  JLONG length;
   int n, i, prec;
   unsigned int tmp;
   JQUANT_TBL *quant_ptr;
@@ -524,27 +525,27 @@
 
     if (n >= NUM_QUANT_TBLS)
       ERREXIT1(cinfo, JERR_DQT_INDEX, n);
-      
+
     if (cinfo->quant_tbl_ptrs[n] == NULL)
       cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
     quant_ptr = cinfo->quant_tbl_ptrs[n];
 
     for (i = 0; i < DCTSIZE2; i++) {
       if (prec)
-	INPUT_2BYTES(cinfo, tmp, return FALSE);
+        INPUT_2BYTES(cinfo, tmp, return FALSE);
       else
-	INPUT_BYTE(cinfo, tmp, return FALSE);
+        INPUT_BYTE(cinfo, tmp, return FALSE);
       /* We convert the zigzag-order table to natural array order. */
       quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
     }
 
     if (cinfo->err->trace_level >= 2) {
       for (i = 0; i < DCTSIZE2; i += 8) {
-	TRACEMS8(cinfo, 2, JTRC_QUANTVALS,
-		 quant_ptr->quantval[i],   quant_ptr->quantval[i+1],
-		 quant_ptr->quantval[i+2], quant_ptr->quantval[i+3],
-		 quant_ptr->quantval[i+4], quant_ptr->quantval[i+5],
-		 quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]);
+        TRACEMS8(cinfo, 2, JTRC_QUANTVALS,
+                 quant_ptr->quantval[i],   quant_ptr->quantval[i+1],
+                 quant_ptr->quantval[i+2], quant_ptr->quantval[i+3],
+                 quant_ptr->quantval[i+4], quant_ptr->quantval[i+5],
+                 quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]);
       }
     }
 
@@ -564,12 +565,12 @@
 get_dri (j_decompress_ptr cinfo)
 /* Process a DRI marker */
 {
-  INT32 length;
+  JLONG length;
   unsigned int tmp;
   INPUT_VARS(cinfo);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
-  
+
   if (length != 4)
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
@@ -591,20 +592,20 @@
  * JFIF and Adobe markers, respectively.
  */
 
-#define APP0_DATA_LEN	14	/* Length of interesting data in APP0 */
-#define APP14_DATA_LEN	12	/* Length of interesting data in APP14 */
-#define APPN_DATA_LEN	14	/* Must be the largest of the above!! */
+#define APP0_DATA_LEN   14      /* Length of interesting data in APP0 */
+#define APP14_DATA_LEN  12      /* Length of interesting data in APP14 */
+#define APPN_DATA_LEN   14      /* Must be the largest of the above!! */
 
 
 LOCAL(void)
-examine_app0 (j_decompress_ptr cinfo, JOCTET FAR * data,
-	      unsigned int datalen, INT32 remaining)
+examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
+              unsigned int datalen, JLONG remaining)
 /* Examine first few bytes from an APP0.
  * Take appropriate action if it is a JFIF marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
  */
 {
-  INT32 totallen = (INT32) datalen + remaining;
+  JLONG totallen = (JLONG) datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
       GETJOCTET(data[0]) == 0x4A &&
@@ -627,18 +628,18 @@
      */
     if (cinfo->JFIF_major_version != 1)
       WARNMS2(cinfo, JWRN_JFIF_MAJOR,
-	      cinfo->JFIF_major_version, cinfo->JFIF_minor_version);
+              cinfo->JFIF_major_version, cinfo->JFIF_minor_version);
     /* Generate trace messages */
     TRACEMS5(cinfo, 1, JTRC_JFIF,
-	     cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
-	     cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
+             cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
+             cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
     /* Validate thumbnail dimensions and issue appropriate messages */
     if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
       TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
-	       GETJOCTET(data[12]), GETJOCTET(data[13]));
+               GETJOCTET(data[12]), GETJOCTET(data[13]));
     totallen -= APP0_DATA_LEN;
     if (totallen !=
-	((INT32)GETJOCTET(data[12]) * (INT32)GETJOCTET(data[13]) * (INT32) 3))
+        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG) 3))
       TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen);
   } else if (datalen >= 6 &&
       GETJOCTET(data[0]) == 0x4A &&
@@ -662,7 +663,7 @@
       break;
     default:
       TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
-	       GETJOCTET(data[5]), (int) totallen);
+               GETJOCTET(data[5]), (int) totallen);
       break;
     }
   } else {
@@ -673,8 +674,8 @@
 
 
 LOCAL(void)
-examine_app14 (j_decompress_ptr cinfo, JOCTET FAR * data,
-	       unsigned int datalen, INT32 remaining)
+examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
+               unsigned int datalen, JLONG remaining)
 /* Examine first few bytes from an APP14.
  * Take appropriate action if it is an Adobe marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
@@ -707,7 +708,7 @@
 get_interesting_appn (j_decompress_ptr cinfo)
 /* Process an APP0 or APP14 marker without saving it */
 {
-  INT32 length;
+  JLONG length;
   JOCTET b[APPN_DATA_LEN];
   unsigned int i, numtoread;
   INPUT_VARS(cinfo);
@@ -729,10 +730,10 @@
   /* process it */
   switch (cinfo->unread_marker) {
   case M_APP0:
-    examine_app0(cinfo, (JOCTET FAR *) b, numtoread, length);
+    examine_app0(cinfo, (JOCTET *) b, numtoread, length);
     break;
   case M_APP14:
-    examine_app14(cinfo, (JOCTET FAR *) b, numtoread, length);
+    examine_app14(cinfo, (JOCTET *) b, numtoread, length);
     break;
   default:
     /* can't get here unless jpeg_save_markers chooses wrong processor */
@@ -758,33 +759,33 @@
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
   jpeg_saved_marker_ptr cur_marker = marker->cur_marker;
   unsigned int bytes_read, data_length;
-  JOCTET FAR * data;
-  INT32 length = 0;
+  JOCTET *data;
+  JLONG length = 0;
   INPUT_VARS(cinfo);
 
   if (cur_marker == NULL) {
     /* begin reading a marker */
     INPUT_2BYTES(cinfo, length, return FALSE);
     length -= 2;
-    if (length >= 0) {		/* watch out for bogus length word */
+    if (length >= 0) {          /* watch out for bogus length word */
       /* figure out how much we want to save */
       unsigned int limit;
       if (cinfo->unread_marker == (int) M_COM)
-	limit = marker->length_limit_COM;
+        limit = marker->length_limit_COM;
       else
-	limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0];
+        limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0];
       if ((unsigned int) length < limit)
-	limit = (unsigned int) length;
+        limit = (unsigned int) length;
       /* allocate and initialize the marker item */
       cur_marker = (jpeg_saved_marker_ptr)
-	(*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				    SIZEOF(struct jpeg_marker_struct) + limit);
+        (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                    sizeof(struct jpeg_marker_struct) + limit);
       cur_marker->next = NULL;
       cur_marker->marker = (UINT8) cinfo->unread_marker;
       cur_marker->original_length = (unsigned int) length;
       cur_marker->data_length = limit;
       /* data area is just beyond the jpeg_marker_struct */
-      data = cur_marker->data = (JOCTET FAR *) (cur_marker + 1);
+      data = cur_marker->data = (JOCTET *) (cur_marker + 1);
       marker->cur_marker = cur_marker;
       marker->bytes_read = 0;
       bytes_read = 0;
@@ -802,7 +803,7 @@
   }
 
   while (bytes_read < data_length) {
-    INPUT_SYNC(cinfo);		/* move the restart point to here */
+    INPUT_SYNC(cinfo);          /* move the restart point to here */
     marker->bytes_read = bytes_read;
     /* If there's not at least one byte in buffer, suspend */
     MAKE_BYTE_AVAIL(cinfo, return FALSE);
@@ -815,14 +816,14 @@
   }
 
   /* Done reading what we want to read */
-  if (cur_marker != NULL) {	/* will be NULL if bogus length word */
+  if (cur_marker != NULL) {     /* will be NULL if bogus length word */
     /* Add new marker to end of list */
     if (cinfo->marker_list == NULL) {
       cinfo->marker_list = cur_marker;
     } else {
       jpeg_saved_marker_ptr prev = cinfo->marker_list;
       while (prev->next != NULL)
-	prev = prev->next;
+        prev = prev->next;
       prev->next = cur_marker;
     }
     /* Reset pointer & calc remaining data length */
@@ -842,12 +843,12 @@
     break;
   default:
     TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker,
-	     (int) (data_length + length));
+             (int) (data_length + length));
     break;
   }
 
   /* skip any remaining data -- could be lots */
-  INPUT_SYNC(cinfo);		/* do before skip_input_data */
+  INPUT_SYNC(cinfo);            /* do before skip_input_data */
   if (length > 0)
     (*cinfo->src->skip_input_data) (cinfo, (long) length);
 
@@ -861,15 +862,15 @@
 skip_variable (j_decompress_ptr cinfo)
 /* Skip over an unknown or uninteresting variable-length marker */
 {
-  INT32 length;
+  JLONG length;
   INPUT_VARS(cinfo);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
   length -= 2;
-  
+
   TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int) length);
 
-  INPUT_SYNC(cinfo);		/* do before skip_input_data */
+  INPUT_SYNC(cinfo);            /* do before skip_input_data */
   if (length > 0)
     (*cinfo->src->skip_input_data) (cinfo, (long) length);
 
@@ -913,7 +914,7 @@
       INPUT_BYTE(cinfo, c, return FALSE);
     } while (c == 0xFF);
     if (c != 0)
-      break;			/* found a valid marker, exit loop */
+      break;                    /* found a valid marker, exit loop */
     /* Reach here if we found a stuffed-zero data sequence (FF/00).
      * Discard it and loop back to try again.
      */
@@ -922,7 +923,7 @@
   }
 
   if (cinfo->marker->discarded_bytes != 0) {
-    TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
+    WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
     cinfo->marker->discarded_bytes = 0;
   }
 
@@ -956,143 +957,6 @@
   return TRUE;
 }
 
-#ifdef MOTION_JPEG_SUPPORTED
-
-/* The default Huffman tables used by motion JPEG frames. When a motion JPEG
- * frame does not have DHT tables, we should use the huffman tables suggested by
- * the JPEG standard. Each of these tables represents a member of the JHUFF_TBLS
- * struct so we can just copy it to the according JHUFF_TBLS member.
- */
-/* DC table 0 */
-LOCAL(const unsigned char) mjpg_dc0_bits[] = {
-  0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01,
-  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-LOCAL(const unsigned char) mjpg_dc0_huffval[] = {
-  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-  0x08, 0x09, 0x0A, 0x0B
-};
-
-/* DC table 1 */
-LOCAL(const unsigned char) mjpg_dc1_bits[] = {
-  0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-  0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-LOCAL(const unsigned char) mjpg_dc1_huffval[] = {
-  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-  0x08, 0x09, 0x0A, 0x0B
-};
-  
-/* AC table 0 */
-LOCAL(const unsigned char) mjpg_ac0_bits[] = {
-  0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03,
-  0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7D
-};
-
-LOCAL(const unsigned char) mjpg_ac0_huffval[] = {
-  0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
-  0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
-  0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08,
-  0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0,
-  0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
-  0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28,
-  0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
-  0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
-  0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
-  0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-  0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-  0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
-  0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-  0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
-  0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
-  0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5,
-  0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4,
-  0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2,
-  0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
-  0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
-  0xF9, 0xFA
-};
-
-/* AC table 1 */
-LOCAL(const unsigned char) mjpg_ac1_bits[] = {
-  0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04,
-  0x07, 0x05, 0x04, 0x04, 0x00, 0x01, 0x02, 0x77
-};
-
-LOCAL(const unsigned char) mjpg_ac1_huffval[] = {
-  0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
-  0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
-  0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
-  0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0,
-  0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34,
-  0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26,
-  0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38,
-  0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
-  0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
-  0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
-  0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-  0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-  0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96,
-  0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
-  0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4,
-  0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3,
-  0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2,
-  0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
-  0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
-  0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
-  0xF9, 0xFA
-};
-
-/* Loads the default Huffman tables used by motion JPEG frames. This function
- * just copies the huffman tables suggested in the JPEG standard when we have
- * not load them.
- */
-LOCAL(void)
-mjpg_load_huff_tables (j_decompress_ptr cinfo)
-{
-  JHUFF_TBL *htblptr;
-
-  if (! cinfo->dc_huff_tbl_ptrs[0]) {
-    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
-    MEMCOPY(&htblptr->bits[1], mjpg_dc0_bits, SIZEOF(mjpg_dc0_bits));
-    MEMCOPY(&htblptr->huffval[0], mjpg_dc0_huffval, SIZEOF(mjpg_dc0_huffval));
-    cinfo->dc_huff_tbl_ptrs[0] = htblptr;
-  }
-
-  if (! cinfo->dc_huff_tbl_ptrs[1]) {
-    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
-    MEMCOPY(&htblptr->bits[1], mjpg_dc1_bits, SIZEOF(mjpg_dc1_bits));
-    MEMCOPY(&htblptr->huffval[0], mjpg_dc1_huffval, SIZEOF(mjpg_dc1_huffval));
-    cinfo->dc_huff_tbl_ptrs[1] = htblptr;
-  }
-
-  if (! cinfo->ac_huff_tbl_ptrs[0]) {
-    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
-    MEMCOPY(&htblptr->bits[1], mjpg_ac0_bits, SIZEOF(mjpg_ac0_bits));
-    MEMCOPY(&htblptr->huffval[0], mjpg_ac0_huffval, SIZEOF(mjpg_ac0_huffval));
-    cinfo->ac_huff_tbl_ptrs[0] = htblptr;
-  }
-
-  if (! cinfo->ac_huff_tbl_ptrs[1]) {
-    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
-    MEMCOPY(&htblptr->bits[1], mjpg_ac1_bits, SIZEOF(mjpg_ac1_bits));
-    MEMCOPY(&htblptr->huffval[0], mjpg_ac1_huffval, SIZEOF(mjpg_ac1_huffval));
-    cinfo->ac_huff_tbl_ptrs[1] = htblptr;
-  }
-}
-
-#else
-
-#define mjpg_load_huff_tables(cinfo)
-
-#endif /* MOTION_JPEG_SUPPORTED */
-
 
 /*
  * Read markers until SOS or EOI.
@@ -1110,11 +974,11 @@
     /* NB: first_marker() enforces the requirement that SOI appear first. */
     if (cinfo->unread_marker == 0) {
       if (! cinfo->marker->saw_SOI) {
-	if (! first_marker(cinfo))
-	  return JPEG_SUSPENDED;
+        if (! first_marker(cinfo))
+          return JPEG_SUSPENDED;
       } else {
-	if (! next_marker(cinfo))
-	  return JPEG_SUSPENDED;
+        if (! next_marker(cinfo))
+          return JPEG_SUSPENDED;
       }
     }
     /* At this point cinfo->unread_marker contains the marker code and the
@@ -1124,75 +988,74 @@
     switch (cinfo->unread_marker) {
     case M_SOI:
       if (! get_soi(cinfo))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
 
-    case M_SOF0:		/* Baseline */
-    case M_SOF1:		/* Extended sequential, Huffman */
+    case M_SOF0:                /* Baseline */
+    case M_SOF1:                /* Extended sequential, Huffman */
       if (! get_sof(cinfo, FALSE, FALSE))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
 
-    case M_SOF2:		/* Progressive, Huffman */
+    case M_SOF2:                /* Progressive, Huffman */
       if (! get_sof(cinfo, TRUE, FALSE))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
 
-    case M_SOF9:		/* Extended sequential, arithmetic */
+    case M_SOF9:                /* Extended sequential, arithmetic */
       if (! get_sof(cinfo, FALSE, TRUE))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
 
-    case M_SOF10:		/* Progressive, arithmetic */
+    case M_SOF10:               /* Progressive, arithmetic */
       if (! get_sof(cinfo, TRUE, TRUE))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
 
     /* Currently unsupported SOFn types */
-    case M_SOF3:		/* Lossless, Huffman */
-    case M_SOF5:		/* Differential sequential, Huffman */
-    case M_SOF6:		/* Differential progressive, Huffman */
-    case M_SOF7:		/* Differential lossless, Huffman */
-    case M_JPG:			/* Reserved for JPEG extensions */
-    case M_SOF11:		/* Lossless, arithmetic */
-    case M_SOF13:		/* Differential sequential, arithmetic */
-    case M_SOF14:		/* Differential progressive, arithmetic */
-    case M_SOF15:		/* Differential lossless, arithmetic */
+    case M_SOF3:                /* Lossless, Huffman */
+    case M_SOF5:                /* Differential sequential, Huffman */
+    case M_SOF6:                /* Differential progressive, Huffman */
+    case M_SOF7:                /* Differential lossless, Huffman */
+    case M_JPG:                 /* Reserved for JPEG extensions */
+    case M_SOF11:               /* Lossless, arithmetic */
+    case M_SOF13:               /* Differential sequential, arithmetic */
+    case M_SOF14:               /* Differential progressive, arithmetic */
+    case M_SOF15:               /* Differential lossless, arithmetic */
       ERREXIT1(cinfo, JERR_SOF_UNSUPPORTED, cinfo->unread_marker);
       break;
 
     case M_SOS:
-      mjpg_load_huff_tables(cinfo);
       if (! get_sos(cinfo))
-	return JPEG_SUSPENDED;
-      cinfo->unread_marker = 0;	/* processed the marker */
+        return JPEG_SUSPENDED;
+      cinfo->unread_marker = 0; /* processed the marker */
       return JPEG_REACHED_SOS;
-    
+
     case M_EOI:
       TRACEMS(cinfo, 1, JTRC_EOI);
-      cinfo->unread_marker = 0;	/* processed the marker */
+      cinfo->unread_marker = 0; /* processed the marker */
       return JPEG_REACHED_EOI;
-      
+
     case M_DAC:
       if (! get_dac(cinfo))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
-      
+
     case M_DHT:
       if (! get_dht(cinfo))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
-      
+
     case M_DQT:
       if (! get_dqt(cinfo))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
-      
+
     case M_DRI:
       if (! get_dri(cinfo))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
-      
+
     case M_APP0:
     case M_APP1:
     case M_APP2:
@@ -1210,16 +1073,16 @@
     case M_APP14:
     case M_APP15:
       if (! (*((my_marker_ptr) cinfo->marker)->process_APPn[
-		cinfo->unread_marker - (int) M_APP0]) (cinfo))
-	return JPEG_SUSPENDED;
-      break;
-      
-    case M_COM:
-      if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo))
-	return JPEG_SUSPENDED;
+                cinfo->unread_marker - (int) M_APP0]) (cinfo))
+        return JPEG_SUSPENDED;
       break;
 
-    case M_RST0:		/* these are all parameterless */
+    case M_COM:
+      if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo))
+        return JPEG_SUSPENDED;
+      break;
+
+    case M_RST0:                /* these are all parameterless */
     case M_RST1:
     case M_RST2:
     case M_RST3:
@@ -1231,12 +1094,12 @@
       TRACEMS1(cinfo, 1, JTRC_PARMLESS_MARKER, cinfo->unread_marker);
       break;
 
-    case M_DNL:			/* Ignore DNL ... perhaps the wrong thing */
+    case M_DNL:                 /* Ignore DNL ... perhaps the wrong thing */
       if (! skip_variable(cinfo))
-	return JPEG_SUSPENDED;
+        return JPEG_SUSPENDED;
       break;
 
-    default:			/* must be DHP, EXP, JPGn, or RESn */
+    default:                    /* must be DHP, EXP, JPGn, or RESn */
       /* For now, we treat the reserved markers as fatal errors since they are
        * likely to be used to signal incompatible JPEG Part 3 extensions.
        * Once the JPEG 3 version-number marker is well defined, this code
@@ -1282,7 +1145,7 @@
     /* Uh-oh, the restart markers have been messed up. */
     /* Let the data source manager determine how to resync. */
     if (! (*cinfo->src->resync_to_restart) (cinfo,
-					    cinfo->marker->next_restart_num))
+                                            cinfo->marker->next_restart_num))
       return FALSE;
   }
 
@@ -1347,25 +1210,25 @@
 {
   int marker = cinfo->unread_marker;
   int action = 1;
-  
+
   /* Always put up a warning. */
   WARNMS2(cinfo, JWRN_MUST_RESYNC, marker, desired);
-  
+
   /* Outer loop handles repeated decision after scanning forward. */
   for (;;) {
     if (marker < (int) M_SOF0)
-      action = 2;		/* invalid marker */
+      action = 2;               /* invalid marker */
     else if (marker < (int) M_RST0 || marker > (int) M_RST7)
-      action = 3;		/* valid non-restart marker */
+      action = 3;               /* valid non-restart marker */
     else {
       if (marker == ((int) M_RST0 + ((desired+1) & 7)) ||
-	  marker == ((int) M_RST0 + ((desired+2) & 7)))
-	action = 3;		/* one of the next two expected restarts */
+          marker == ((int) M_RST0 + ((desired+2) & 7)))
+        action = 3;             /* one of the next two expected restarts */
       else if (marker == ((int) M_RST0 + ((desired-1) & 7)) ||
-	       marker == ((int) M_RST0 + ((desired-2) & 7)))
-	action = 2;		/* a prior restart, so advance */
+               marker == ((int) M_RST0 + ((desired-2) & 7)))
+        action = 2;             /* a prior restart, so advance */
       else
-	action = 1;		/* desired restart or too far away */
+        action = 1;             /* desired restart or too far away */
     }
     TRACEMS2(cinfo, 4, JTRC_RECOVERY_ACTION, marker, action);
     switch (action) {
@@ -1376,7 +1239,7 @@
     case 2:
       /* Scan to the next marker, and repeat the decision loop. */
       if (! next_marker(cinfo))
-	return FALSE;
+        return FALSE;
       marker = cinfo->unread_marker;
       break;
     case 3:
@@ -1397,10 +1260,10 @@
 {
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
 
-  cinfo->comp_info = NULL;		/* until allocated by get_sof */
-  cinfo->input_scan_number = 0;		/* no SOS seen yet */
-  cinfo->unread_marker = 0;		/* no pending marker */
-  marker->pub.saw_SOI = FALSE;		/* set internal state too */
+  cinfo->comp_info = NULL;              /* until allocated by get_sof */
+  cinfo->input_scan_number = 0;         /* no SOS seen yet */
+  cinfo->unread_marker = 0;             /* no pending marker */
+  marker->pub.saw_SOI = FALSE;          /* set internal state too */
   marker->pub.saw_SOF = FALSE;
   marker->pub.discarded_bytes = 0;
   marker->cur_marker = NULL;
@@ -1421,7 +1284,7 @@
   /* Create subobject in permanent pool */
   marker = (my_marker_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				SIZEOF(my_marker_reader));
+                                sizeof(my_marker_reader));
   cinfo->marker = (struct jpeg_marker_reader *) marker;
   /* Initialize public method pointers */
   marker->pub.reset_marker_reader = reset_marker_reader;
@@ -1452,7 +1315,7 @@
 
 GLOBAL(void)
 jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
-		   unsigned int length_limit)
+                   unsigned int length_limit)
 {
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
   long maxlength;
@@ -1461,7 +1324,7 @@
   /* Length limit mustn't be larger than what we can allocate
    * (should only be a concern in a 16-bit environment).
    */
-  maxlength = cinfo->mem->max_alloc_chunk - SIZEOF(struct jpeg_marker_struct);
+  maxlength = cinfo->mem->max_alloc_chunk - sizeof(struct jpeg_marker_struct);
   if (((long) length_limit) > maxlength)
     length_limit = (unsigned int) maxlength;
 
@@ -1501,7 +1364,7 @@
 
 GLOBAL(void)
 jpeg_set_marker_processor (j_decompress_ptr cinfo, int marker_code,
-			   jpeg_marker_parser_method routine)
+                           jpeg_marker_parser_method routine)
 {
   my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
 
diff --git a/jdmaster.c b/jdmaster.c
index f0dd15f..7908849 100644
--- a/jdmaster.c
+++ b/jdmaster.c
@@ -5,10 +5,11 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
-
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains master control logic for the JPEG decompressor.
  * These routines are concerned with selecting the modules to be executed
@@ -20,25 +21,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jpegcomp.h"
-
-
-/* Private state */
-
-typedef struct {
-  struct jpeg_decomp_master pub; /* public fields */
-
-  int pass_number;		/* # of passes completed */
-
-  boolean using_merged_upsample; /* TRUE if using merged upsample/cconvert */
-
-  /* Saved references to initialized quantizer modules,
-   * in case we need to switch modes.
-   */
-  struct jpeg_color_quantizer * quantizer_1pass;
-  struct jpeg_color_quantizer * quantizer_2pass;
-} my_decomp_master;
-
-typedef my_decomp_master * my_master_ptr;
+#include "jdmaster.h"
 
 
 /*
@@ -87,7 +70,7 @@
       cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
     return FALSE;
   /* ??? also need to test for upsample-time rescaling, when & if supported */
-  return TRUE;			/* by golly, it'll work... */
+  return TRUE;                  /* by golly, it'll work... */
 #else
   return FALSE;
 #endif
@@ -299,10 +282,10 @@
        ci++, compptr++) {
     int ssize = cinfo->_min_DCT_scaled_size;
     while (ssize < DCTSIZE &&
-	   ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) %
-	    (compptr->h_samp_factor * ssize * 2) == 0) &&
-	   ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) %
-	    (compptr->v_samp_factor * ssize * 2) == 0)) {
+           ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) %
+            (compptr->h_samp_factor * ssize * 2) == 0) &&
+           ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) %
+            (compptr->v_samp_factor * ssize * 2) == 0)) {
       ssize = ssize * 2;
     }
 #if JPEG_LIB_VERSION >= 70
@@ -320,12 +303,12 @@
     /* Size in samples, after IDCT scaling */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width *
-		    (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+                    (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
+                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
     compptr->downsampled_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height *
-		    (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+                    (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
+                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
   }
 
 #else /* !IDCT_SCALING_SUPPORTED */
@@ -366,12 +349,12 @@
   case JCS_YCCK:
     cinfo->out_color_components = 4;
     break;
-  default:			/* else must be same colorspace as in file */
+  default:                      /* else must be same colorspace as in file */
     cinfo->out_color_components = cinfo->num_components;
     break;
   }
   cinfo->output_components = (cinfo->quantize_colors ? 1 :
-			      cinfo->out_color_components);
+                              cinfo->out_color_components);
 
   /* See if upsampler will want to emit more than one row at a time */
   if (use_merged_upsample(cinfo))
@@ -388,20 +371,20 @@
  * processes are inner loops and need to be as fast as possible.  On most
  * machines, particularly CPUs with pipelines or instruction prefetch,
  * a (subscript-check-less) C table lookup
- *		x = sample_range_limit[x];
+ *              x = sample_range_limit[x];
  * is faster than explicit tests
- *		if (x < 0)  x = 0;
- *		else if (x > MAXJSAMPLE)  x = MAXJSAMPLE;
+ *              if (x < 0)  x = 0;
+ *              else if (x > MAXJSAMPLE)  x = MAXJSAMPLE;
  * These processes all use a common table prepared by the routine below.
  *
  * For most steps we can mathematically guarantee that the initial value
  * of x is within MAXJSAMPLE+1 of the legal range, so a table running from
  * -(MAXJSAMPLE+1) to 2*MAXJSAMPLE+1 is sufficient.  But for the initial
- * limiting step (just after the IDCT), a wildly out-of-range value is 
+ * limiting step (just after the IDCT), a wildly out-of-range value is
  * possible if the input data is corrupt.  To avoid any chance of indexing
  * off the end of memory and getting a bad-pointer trap, we perform the
  * post-IDCT limiting thus:
- *		x = range_limit[x & MASK];
+ *              x = range_limit[x & MASK];
  * where MASK is 2 bits wider than legal sample data, ie 10 bits for 8-bit
  * samples.  Under normal circumstances this is more than enough range and
  * a correct output will be generated; with bogus input data the mask will
@@ -419,37 +402,34 @@
  * We can save some space by overlapping the start of the post-IDCT table
  * with the simpler range limiting table.  The post-IDCT table begins at
  * sample_range_limit + CENTERJSAMPLE.
- *
- * Note that the table is allocated in near data space on PCs; it's small
- * enough and used often enough to justify this.
  */
 
 LOCAL(void)
 prepare_range_limit_table (j_decompress_ptr cinfo)
 /* Allocate and fill in the sample_range_limit table */
 {
-  JSAMPLE * table;
+  JSAMPLE *table;
   int i;
 
   table = (JSAMPLE *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-		(5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * SIZEOF(JSAMPLE));
-  table += (MAXJSAMPLE+1);	/* allow negative subscripts of simple table */
+                (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
+  table += (MAXJSAMPLE+1);      /* allow negative subscripts of simple table */
   cinfo->sample_range_limit = table;
   /* First segment of "simple" table: limit[x] = 0 for x < 0 */
-  MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * SIZEOF(JSAMPLE));
+  MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * sizeof(JSAMPLE));
   /* Main part of "simple" table: limit[x] = x */
   for (i = 0; i <= MAXJSAMPLE; i++)
     table[i] = (JSAMPLE) i;
-  table += CENTERJSAMPLE;	/* Point to where post-IDCT table starts */
+  table += CENTERJSAMPLE;       /* Point to where post-IDCT table starts */
   /* End of simple table, rest of first half of post-IDCT table */
   for (i = CENTERJSAMPLE; i < 2*(MAXJSAMPLE+1); i++)
     table[i] = MAXJSAMPLE;
   /* Second half of post-IDCT table */
   MEMZERO(table + (2 * (MAXJSAMPLE+1)),
-	  (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * SIZEOF(JSAMPLE));
+          (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
   MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE),
-	  cinfo->sample_range_limit, CENTERJSAMPLE * SIZEOF(JSAMPLE));
+          cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
 }
 
 
@@ -582,6 +562,12 @@
   /* Initialize input side of decompressor to consume first scan. */
   (*cinfo->inputctl->start_input_pass) (cinfo);
 
+  /* Set the first and last iMCU columns to decompress from single-scan images.
+   * By default, decompress all of the iMCU columns.
+   */
+  cinfo->master->first_iMCU_col = 0;
+  cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* If jpeg_start_decompress will read the whole file, initialize
    * progress monitoring appropriately.  The input step is counted
@@ -637,24 +623,24 @@
     if (cinfo->quantize_colors && cinfo->colormap == NULL) {
       /* Select new quantization method */
       if (cinfo->two_pass_quantize && cinfo->enable_2pass_quant) {
-	cinfo->cquantize = master->quantizer_2pass;
-	master->pub.is_dummy_pass = TRUE;
+        cinfo->cquantize = master->quantizer_2pass;
+        master->pub.is_dummy_pass = TRUE;
       } else if (cinfo->enable_1pass_quant) {
-	cinfo->cquantize = master->quantizer_1pass;
+        cinfo->cquantize = master->quantizer_1pass;
       } else {
-	ERREXIT(cinfo, JERR_MODE_CHANGE);
+        ERREXIT(cinfo, JERR_MODE_CHANGE);
       }
     }
     (*cinfo->idct->start_pass) (cinfo);
     (*cinfo->coef->start_output_pass) (cinfo);
     if (! cinfo->raw_data_out) {
       if (! master->using_merged_upsample)
-	(*cinfo->cconvert->start_pass) (cinfo);
+        (*cinfo->cconvert->start_pass) (cinfo);
       (*cinfo->upsample->start_pass) (cinfo);
       if (cinfo->quantize_colors)
-	(*cinfo->cquantize->start_pass) (cinfo, master->pub.is_dummy_pass);
+        (*cinfo->cquantize->start_pass) (cinfo, master->pub.is_dummy_pass);
       (*cinfo->post->start_pass) (cinfo,
-	    (master->pub.is_dummy_pass ? JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
+            (master->pub.is_dummy_pass ? JBUF_SAVE_AND_PASS : JBUF_PASS_THRU));
       (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU);
     }
   }
@@ -663,7 +649,7 @@
   if (cinfo->progress != NULL) {
     cinfo->progress->completed_passes = master->pass_number;
     cinfo->progress->total_passes = master->pass_number +
-				    (master->pub.is_dummy_pass ? 2 : 1);
+                                    (master->pub.is_dummy_pass ? 2 : 1);
     /* In buffered-image mode, we assume one more output pass if EOI not
      * yet reached, but no more passes if EOI has been reached.
      */
@@ -726,16 +712,13 @@
 GLOBAL(void)
 jinit_master_decompress (j_decompress_ptr cinfo)
 {
-  my_master_ptr master;
+  my_master_ptr master = (my_master_ptr) cinfo->master;
 
-  master = (my_master_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(my_decomp_master));
-  cinfo->master = (struct jpeg_decomp_master *) master;
   master->pub.prepare_for_output_pass = prepare_for_output_pass;
   master->pub.finish_output_pass = finish_output_pass;
 
   master->pub.is_dummy_pass = FALSE;
+  master->pub.jinit_upsampler_no_alloc = FALSE;
 
   master_selection(cinfo);
 }
diff --git a/jdmaster.h b/jdmaster.h
new file mode 100644
index 0000000..76897e2
--- /dev/null
+++ b/jdmaster.h
@@ -0,0 +1,28 @@
+/*
+ * jdmaster.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1995, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the master control structure for the JPEG decompressor.
+ */
+
+/* Private state.  Code casts cinfo->master to my_master_ptr (see
+ * jinit_master_decompress), so "pub" must remain the first member. */
+typedef struct {
+  struct jpeg_decomp_master pub; /* public fields */
+
+  int pass_number;              /* # of passes completed */
+
+  boolean using_merged_upsample; /* TRUE if using merged upsample/cconvert */
+
+  /* Saved references to initialized quantizer modules,
+   * in case we need to switch modes.
+   */
+  struct jpeg_color_quantizer *quantizer_1pass;
+  struct jpeg_color_quantizer *quantizer_2pass;
+} my_decomp_master;
+
+typedef my_decomp_master *my_master_ptr;
diff --git a/jdmerge.c b/jdmerge.c
index 760db2d..6276dd0 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -3,11 +3,12 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, 2014 D. R. Commander.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains code for merged upsampling/color conversion.
  *
@@ -18,19 +19,19 @@
  * (ie, box filtering), we can save some work in color conversion by
  * calculating all the output pixels corresponding to a pair of chroma
  * samples at one time.  In the conversion equations
- *	R = Y           + K1 * Cr
- *	G = Y + K2 * Cb + K3 * Cr
- *	B = Y + K4 * Cb
+ *      R = Y           + K1 * Cr
+ *      G = Y + K2 * Cb + K3 * Cr
+ *      B = Y + K4 * Cb
  * only the Y term varies among the group of pixels corresponding to a pair
  * of chroma samples, so the rest of the terms can be calculated just once.
  * At typical sampling ratios, this eliminates half or three-quarters of the
  * multiplications needed for color conversion.
  *
  * This file currently provides implementations for the following cases:
- *	YCbCr => RGB color conversion only.
- *	Sampling ratios of 2h1v or 2h2v.
- *	No scaling needed at upsample time.
- *	Corner-aligned (non-CCIR601) sampling alignment.
+ *      YCbCr => RGB color conversion only.
+ *      Sampling ratios of 2h1v or 2h2v.
+ *      No scaling needed at upsample time.
+ *      Corner-aligned (non-CCIR601) sampling alignment.
  * Other special cases could be added, but in most applications these are
  * the only common cases.  (For uncommon cases we fall back on the more
  * general code in jdsample.c and jdcolor.c.)
@@ -40,7 +41,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jsimd.h"
-#include "config.h"
+#include "jconfigint.h"
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
@@ -48,18 +49,17 @@
 /* Private subobject */
 
 typedef struct {
-  struct jpeg_upsampler pub;	/* public fields */
+  struct jpeg_upsampler pub;    /* public fields */
 
   /* Pointer to routine to do actual upsampling/conversion of one row group */
-  JMETHOD(void, upmethod, (j_decompress_ptr cinfo,
-			   JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-			   JSAMPARRAY output_buf));
+  void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 
   /* Private state for YCC->RGB conversion */
-  int * Cr_r_tab;		/* => table for Cr to R conversion */
-  int * Cb_b_tab;		/* => table for Cb to B conversion */
-  INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
-  INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
+  int *Cr_r_tab;                /* => table for Cr to R conversion */
+  int *Cb_b_tab;                /* => table for Cb to B conversion */
+  JLONG *Cr_g_tab;              /* => table for Cr to G conversion */
+  JLONG *Cb_g_tab;              /* => table for Cb to G conversion */
 
   /* For 2:1 vertical sampling, we produce two output rows at a time.
    * We need a "spare" row buffer to hold the second output row if the
@@ -67,17 +67,17 @@
    * to discard the dummy last row if the image height is odd.
    */
   JSAMPROW spare_row;
-  boolean spare_full;		/* T if spare buffer is occupied */
+  boolean spare_full;           /* T if spare buffer is occupied */
 
-  JDIMENSION out_row_width;	/* samples per output row */
-  JDIMENSION rows_to_go;	/* counts rows remaining in image */
+  JDIMENSION out_row_width;     /* samples per output row */
+  JDIMENSION rows_to_go;        /* counts rows remaining in image */
 } my_upsampler;
 
-typedef my_upsampler * my_upsample_ptr;
+typedef my_upsampler *my_upsample_ptr;
 
-#define SCALEBITS	16	/* speediest right-shift on some machines */
-#define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
-#define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
+#define SCALEBITS       16      /* speediest right-shift on some machines */
+#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
+#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
 
 
 /* Include inline routines for colorspace extensions */
@@ -191,31 +191,31 @@
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   int i;
-  INT32 x;
+  JLONG x;
   SHIFT_TEMPS
 
   upsample->Cr_r_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(int));
+                                (MAXJSAMPLE+1) * sizeof(int));
   upsample->Cb_b_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(int));
-  upsample->Cr_g_tab = (INT32 *)
+                                (MAXJSAMPLE+1) * sizeof(int));
+  upsample->Cr_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(INT32));
-  upsample->Cb_g_tab = (INT32 *)
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
+  upsample->Cb_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(MAXJSAMPLE+1) * SIZEOF(INT32));
+                                (MAXJSAMPLE+1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
     /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
     /* Cr=>R value is nearest int to 1.40200 * x */
     upsample->Cr_r_tab[i] = (int)
-		    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
+                    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
     /* Cb=>B value is nearest int to 1.77200 * x */
     upsample->Cb_b_tab[i] = (int)
-		    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
+                    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
     /* Cr=>G value is scaled-up -0.71414 * x */
     upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x;
     /* Cb=>G value is scaled-up -0.34414 * x */
@@ -249,15 +249,15 @@
 
 METHODDEF(void)
 merged_2v_upsample (j_decompress_ptr cinfo,
-		    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-		    JDIMENSION in_row_groups_avail,
-		    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-		    JDIMENSION out_rows_avail)
+                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
+                    JDIMENSION in_row_groups_avail,
+                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                    JDIMENSION out_rows_avail)
 /* 2:1 vertical sampling case: may need a spare row. */
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   JSAMPROW work_ptrs[2];
-  JDIMENSION num_rows;		/* number of rows returned to caller */
+  JDIMENSION num_rows;          /* number of rows returned to caller */
 
   if (upsample->spare_full) {
     /* If we have a spare row saved from a previous cycle, just return it. */
@@ -265,7 +265,7 @@
     if (cinfo->out_color_space == JCS_RGB565)
       size = cinfo->output_width * 2;
     jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
-		      1, size);
+                      1, size);
     num_rows = 1;
     upsample->spare_full = FALSE;
   } else {
@@ -301,17 +301,17 @@
 
 METHODDEF(void)
 merged_1v_upsample (j_decompress_ptr cinfo,
-		    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-		    JDIMENSION in_row_groups_avail,
-		    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-		    JDIMENSION out_rows_avail)
+                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
+                    JDIMENSION in_row_groups_avail,
+                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                    JDIMENSION out_rows_avail)
 /* 1:1 vertical sampling case: much easier, never need a spare row. */
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
 
   /* Just do the upsampling. */
   (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
-			 output_buf + *out_row_ctr);
+                         output_buf + *out_row_ctr);
   /* Adjust counts */
   (*out_row_ctr)++;
   (*in_row_group_ctr)++;
@@ -334,8 +334,8 @@
 
 METHODDEF(void)
 h2v1_merged_upsample (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
+                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+                      JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
     case JCS_EXT_RGB:
@@ -380,8 +380,8 @@
 
 METHODDEF(void)
 h2v2_merged_upsample (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
+                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+                      JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
     case JCS_EXT_RGB:
@@ -436,12 +436,12 @@
 #define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
 
 #define WRITE_TWO_PIXELS_LE(addr, pixels) {  \
-  ((INT16*)(addr))[0] = (pixels);  \
-  ((INT16*)(addr))[1] = (pixels) >> 16;  \
+  ((INT16*)(addr))[0] = (INT16)(pixels);  \
+  ((INT16*)(addr))[1] = (INT16)((pixels) >> 16);  \
 }
 #define WRITE_TWO_PIXELS_BE(addr, pixels) {  \
-  ((INT16*)(addr))[1] = (pixels);  \
-  ((INT16*)(addr))[0] = (pixels) >> 16;  \
+  ((INT16*)(addr))[1] = (INT16)(pixels);  \
+  ((INT16*)(addr))[0] = (INT16)((pixels) >> 16);  \
 }
 
 #define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
@@ -456,8 +456,8 @@
  */
 
 #define DITHER_MASK       0x3
-#define DITHER_ROTATE(x)  (((x) << 24) | (((x) >> 8) & 0x00FFFFFF))
-static const INT32 dither_matrix[4] = {
+#define DITHER_ROTATE(x)  ((((x) & 0xFF) << 24) | (((x) >> 8) & 0x00FFFFFF))
+static const JLONG dither_matrix[4] = {
   0x0008020A,
   0x0C040E06,
   0x030B0109,
@@ -520,7 +520,7 @@
   else
     h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
                                 output_buf);
-}
+ }
 
 
 METHODDEF(void)
@@ -580,7 +580,7 @@
 
   upsample = (my_upsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_upsampler));
+                                sizeof(my_upsampler));
   cinfo->upsample = (struct jpeg_upsampler *) upsample;
   upsample->pub.start_pass = start_pass_merged_upsample;
   upsample->pub.need_context_rows = FALSE;
@@ -603,7 +603,7 @@
     /* Allocate a spare row buffer */
     upsample->spare_row = (JSAMPROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-		(size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
+                (size_t) (upsample->out_row_width * sizeof(JSAMPLE)));
   } else {
     upsample->pub.upsample = merged_1v_upsample;
     if (jsimd_can_h2v1_merged_upsample())
diff --git a/jdmrg565.c b/jdmrg565.c
index 0a10bcc..18287b3 100644
--- a/jdmrg565.c
+++ b/jdmrg565.c
@@ -5,8 +5,9 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains code for merged upsampling/color conversion.
  */
@@ -29,10 +30,10 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -78,7 +79,7 @@
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = rgb;
+    *(INT16*)outptr = (INT16)rgb;
    }
  }
 
@@ -100,11 +101,11 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -152,7 +153,7 @@
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = rgb;
+    *(INT16*)outptr = (INT16)rgb;
   }
 }
 
@@ -174,10 +175,10 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr00 = input_buf[0][in_row_group_ctr * 2];
@@ -241,14 +242,14 @@
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = rgb;
+    *(INT16*)outptr0 = (INT16)rgb;
 
     y  = GETJSAMPLE(*inptr01);
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = rgb;
+    *(INT16*)outptr1 = (INT16)rgb;
   }
 }
 
@@ -270,12 +271,12 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  INT32 d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
-  INT32 d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
+  JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
+  JLONG d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
   unsigned int r, g, b;
-  INT32 rgb;
+  JLONG rgb;
   SHIFT_TEMPS
 
   inptr00 = input_buf[0][in_row_group_ctr*2];
@@ -343,13 +344,13 @@
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = rgb;
+    *(INT16*)outptr0 = (INT16)rgb;
 
     y  = GETJSAMPLE(*inptr01);
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = rgb;
+    *(INT16*)outptr1 = (INT16)rgb;
   }
 }
diff --git a/jdmrgext.c b/jdmrgext.c
index 1f0a550..9d7d2af 100644
--- a/jdmrgext.c
+++ b/jdmrgext.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2011, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains code for merged upsampling/color conversion.
  */
@@ -35,8 +36,8 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -108,8 +109,8 @@
   register JSAMPLE * range_limit = cinfo->sample_range_limit;
   int * Crrtab = upsample->Cr_r_tab;
   int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
+  JLONG * Crgtab = upsample->Cr_g_tab;
+  JLONG * Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
   inptr00 = input_buf[0][in_row_group_ctr*2];
diff --git a/jdphuff.c b/jdphuff.c
index fa97aab..42a7068 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -1,9 +1,12 @@
 /*
  * jdphuff.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains Huffman entropy decoding routines for progressive JPEG.
  *
@@ -17,7 +20,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdhuff.h"		/* Declarations shared with jdhuff.c */
+#include "jdhuff.h"             /* Declarations shared with jdhuff.c */
 
 
 #ifdef D_PROGRESSIVE_SUPPORTED
@@ -30,8 +33,8 @@
  */
 
 typedef struct {
-  unsigned int EOBRUN;			/* remaining EOBs in EOBRUN */
-  int last_dc_val[MAX_COMPS_IN_SCAN];	/* last DC coef for each component */
+  unsigned int EOBRUN;                  /* remaining EOBs in EOBRUN */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
 /* This macro is to work around compilers with missing or broken
@@ -44,11 +47,11 @@
 #else
 #if MAX_COMPS_IN_SCAN == 4
 #define ASSIGN_STATE(dest,src)  \
-	((dest).EOBRUN = (src).EOBRUN, \
-	 (dest).last_dc_val[0] = (src).last_dc_val[0], \
-	 (dest).last_dc_val[1] = (src).last_dc_val[1], \
-	 (dest).last_dc_val[2] = (src).last_dc_val[2], \
-	 (dest).last_dc_val[3] = (src).last_dc_val[3])
+        ((dest).EOBRUN = (src).EOBRUN, \
+         (dest).last_dc_val[0] = (src).last_dc_val[0], \
+         (dest).last_dc_val[1] = (src).last_dc_val[1], \
+         (dest).last_dc_val[2] = (src).last_dc_val[2], \
+         (dest).last_dc_val[3] = (src).last_dc_val[3])
 #endif
 #endif
 
@@ -59,29 +62,29 @@
   /* These fields are loaded into local variables at start of each MCU.
    * In case of suspension, we exit WITHOUT updating them.
    */
-  bitread_perm_state bitstate;	/* Bit buffer at start of MCU */
-  savable_state saved;		/* Other state at start of MCU */
+  bitread_perm_state bitstate;  /* Bit buffer at start of MCU */
+  savable_state saved;          /* Other state at start of MCU */
 
   /* These fields are NOT loaded into local working state. */
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  unsigned int restarts_to_go;  /* MCUs left in this restart interval */
 
   /* Pointers to derived tables (these workspaces have image lifespan) */
-  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+  d_derived_tbl *derived_tbls[NUM_HUFF_TBLS];
 
-  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
+  d_derived_tbl *ac_derived_tbl; /* active table during an AC scan */
 } phuff_entropy_decoder;
 
-typedef phuff_entropy_decoder * phuff_entropy_ptr;
+typedef phuff_entropy_decoder *phuff_entropy_ptr;
 
 /* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first JPP((j_decompress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_first JPP((j_decompress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_DC_refine JPP((j_decompress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_refine JPP((j_decompress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
+METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo,
+                                        JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_first (j_decompress_ptr cinfo,
+                                        JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_DC_refine (j_decompress_ptr cinfo,
+                                         JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
+                                         JBLOCKROW *MCU_data);
 
 
 /*
@@ -94,8 +97,9 @@
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   boolean is_DC_band, bad;
   int ci, coefi, tbl;
+  d_derived_tbl **pdtbl;
   int *coef_bit_ptr;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
 
   is_DC_band = (cinfo->Ss == 0);
 
@@ -117,7 +121,7 @@
     if (cinfo->Al != cinfo->Ah-1)
       bad = TRUE;
   }
-  if (cinfo->Al > 13)		/* need not check for < 0 */
+  if (cinfo->Al > 13)           /* need not check for < 0 */
     bad = TRUE;
   /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
    * but the spec doesn't say so, and we try to be liberal about what we
@@ -127,7 +131,7 @@
    */
   if (bad)
     ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
-	     cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+             cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
   /* Update progression status, and verify that scan order is legal.
    * Note that inter-scan inconsistencies are treated as warnings
    * not fatal errors ... not clear if this is right way to behave.
@@ -140,7 +144,7 @@
     for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
       int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
       if (cinfo->Ah != expected)
-	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+        WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
       coef_bit_ptr[coefi] = cinfo->Al;
     }
   }
@@ -164,15 +168,15 @@
      * We may build same derived table more than once, but it's not expensive.
      */
     if (is_DC_band) {
-      if (cinfo->Ah == 0) {	/* DC refinement needs no table */
-	tbl = compptr->dc_tbl_no;
-	jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
-				& entropy->derived_tbls[tbl]);
+      if (cinfo->Ah == 0) {     /* DC refinement needs no table */
+        tbl = compptr->dc_tbl_no;
+        pdtbl = entropy->derived_tbls + tbl;
+        jpeg_make_d_derived_tbl(cinfo, TRUE, tbl, pdtbl);
       }
     } else {
       tbl = compptr->ac_tbl_no;
-      jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
-			      & entropy->derived_tbls[tbl]);
+      pdtbl = entropy->derived_tbls + tbl;
+      jpeg_make_d_derived_tbl(cinfo, FALSE, tbl, pdtbl);
       /* remember the single active table */
       entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
     }
@@ -201,7 +205,8 @@
 #define AVOID_TABLES
 #ifdef AVOID_TABLES
 
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
+#define NEG_1 ((unsigned)-1)
+#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((NEG_1)<<(s)) + 1) : (x))
 
 #else
 
@@ -264,7 +269,7 @@
 /*
  * Huffman MCU decoding.
  * Each of these routines decodes and returns one MCU's worth of
- * Huffman-compressed coefficients. 
+ * Huffman-compressed coefficients.
  * The coefficients are reordered from zigzag order into natural array order,
  * but are not dequantized.
  *
@@ -285,7 +290,7 @@
 
 METHODDEF(boolean)
 decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   int Al = cinfo->Al;
   register int s, r;
@@ -293,14 +298,14 @@
   JBLOCKROW block;
   BITREAD_STATE_VARS;
   savable_state state;
-  d_derived_tbl * tbl;
-  jpeg_component_info * compptr;
+  d_derived_tbl *tbl;
+  jpeg_component_info *compptr;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
       if (! process_restart(cinfo))
-	return FALSE;
+        return FALSE;
   }
 
   /* If we've run out of data, just leave the MCU set to zeroes.
@@ -325,16 +330,16 @@
       /* Section F.2.2.1: decode the DC coefficient difference */
       HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
       if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
+        CHECK_BIT_BUFFER(br_state, s, return FALSE);
+        r = GET_BITS(s);
+        s = HUFF_EXTEND(r, s);
       }
 
       /* Convert DC difference to actual value, update last_dc_val */
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
-      (*block)[0] = (JCOEF) (s << Al);
+      (*block)[0] = (JCOEF) LEFT_SHIFT(s, Al);
     }
 
     /* Completed MCU, so update state */
@@ -356,7 +361,7 @@
 
 METHODDEF(boolean)
 decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   int Se = cinfo->Se;
   int Al = cinfo->Al;
@@ -364,13 +369,13 @@
   unsigned int EOBRUN;
   JBLOCKROW block;
   BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
+  d_derived_tbl *tbl;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
       if (! process_restart(cinfo))
-	return FALSE;
+        return FALSE;
   }
 
   /* If we've run out of data, just leave the MCU set to zeroes.
@@ -381,49 +386,49 @@
     /* Load up working state.
      * We can avoid loading/saving bitread state if in an EOB run.
      */
-    EOBRUN = entropy->saved.EOBRUN;	/* only part of saved state we need */
+    EOBRUN = entropy->saved.EOBRUN;     /* only part of saved state we need */
 
     /* There is always only one block per MCU */
 
-    if (EOBRUN > 0)		/* if it's a band of zeroes... */
-      EOBRUN--;			/* ...process it now (we do nothing) */
+    if (EOBRUN > 0)             /* if it's a band of zeroes... */
+      EOBRUN--;                 /* ...process it now (we do nothing) */
     else {
       BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
       block = MCU_data[0];
       tbl = entropy->ac_derived_tbl;
 
       for (k = cinfo->Ss; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  k += r;
-	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	  r = GET_BITS(s);
-	  s = HUFF_EXTEND(r, s);
-	  /* Scale and output coefficient in natural (dezigzagged) order */
-	  (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al);
-	} else {
-	  if (r == 15) {	/* ZRL */
-	    k += 15;		/* skip 15 zeroes in band */
-	  } else {		/* EOBr, run length is 2^r + appended bits */
-	    EOBRUN = 1 << r;
-	    if (r) {		/* EOBr, r > 0 */
-	      CHECK_BIT_BUFFER(br_state, r, return FALSE);
-	      r = GET_BITS(r);
-	      EOBRUN += r;
-	    }
-	    EOBRUN--;		/* this band is processed at this moment */
-	    break;		/* force end-of-band */
-	  }
-	}
+        HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
+        r = s >> 4;
+        s &= 15;
+        if (s) {
+          k += r;
+          CHECK_BIT_BUFFER(br_state, s, return FALSE);
+          r = GET_BITS(s);
+          s = HUFF_EXTEND(r, s);
+          /* Scale and output coefficient in natural (dezigzagged) order */
+          (*block)[jpeg_natural_order[k]] = (JCOEF) LEFT_SHIFT(s, Al);
+        } else {
+          if (r == 15) {        /* ZRL */
+            k += 15;            /* skip 15 zeroes in band */
+          } else {              /* EOBr, run length is 2^r + appended bits */
+            EOBRUN = 1 << r;
+            if (r) {            /* EOBr, r > 0 */
+              CHECK_BIT_BUFFER(br_state, r, return FALSE);
+              r = GET_BITS(r);
+              EOBRUN += r;
+            }
+            EOBRUN--;           /* this band is processed at this moment */
+            break;              /* force end-of-band */
+          }
+        }
       }
 
       BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
     }
 
     /* Completed MCU, so update state */
-    entropy->saved.EOBRUN = EOBRUN;	/* only part of saved state we need */
+    entropy->saved.EOBRUN = EOBRUN;     /* only part of saved state we need */
   }
 
   /* Account for restart interval (no-op if not using restarts) */
@@ -441,9 +446,9 @@
 
 METHODDEF(boolean)
 decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
+  int p1 = 1 << cinfo->Al;      /* 1 in the bit position being coded */
   int blkn;
   JBLOCKROW block;
   BITREAD_STATE_VARS;
@@ -452,7 +457,7 @@
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
       if (! process_restart(cinfo))
-	return FALSE;
+        return FALSE;
   }
 
   /* Not worth the cycles to check insufficient_data here,
@@ -490,17 +495,17 @@
 
 METHODDEF(boolean)
 decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   int Se = cinfo->Se;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+  int p1 = 1 << cinfo->Al;        /* 1 in the bit position being coded */
+  int m1 = (NEG_1) << cinfo->Al;  /* -1 in the bit position being coded */
   register int s, k, r;
   unsigned int EOBRUN;
   JBLOCKROW block;
   JCOEFPTR thiscoef;
   BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
+  d_derived_tbl *tbl;
   int num_newnz;
   int newnz_pos[DCTSIZE2];
 
@@ -508,7 +513,7 @@
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
       if (! process_restart(cinfo))
-	return FALSE;
+        return FALSE;
   }
 
   /* If we've run out of data, don't modify the MCU.
@@ -536,58 +541,58 @@
 
     if (EOBRUN == 0) {
       for (; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  if (s != 1)		/* size of new coef should always be 1 */
-	    WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1))
-	    s = p1;		/* newly nonzero coef is positive */
-	  else
-	    s = m1;		/* newly nonzero coef is negative */
-	} else {
-	  if (r != 15) {
-	    EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
-	    if (r) {
-	      CHECK_BIT_BUFFER(br_state, r, goto undoit);
-	      r = GET_BITS(r);
-	      EOBRUN += r;
-	    }
-	    break;		/* rest of block is handled by EOB logic */
-	  }
-	  /* note s = 0 for processing ZRL */
-	}
-	/* Advance over already-nonzero coefs and r still-zero coefs,
-	 * appending correction bits to the nonzeroes.  A correction bit is 1
-	 * if the absolute value of the coefficient must be increased.
-	 */
-	do {
-	  thiscoef = *block + jpeg_natural_order[k];
-	  if (*thiscoef != 0) {
-	    CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	    if (GET_BITS(1)) {
-	      if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
-		if (*thiscoef >= 0)
-		  *thiscoef += p1;
-		else
-		  *thiscoef += m1;
-	      }
-	    }
-	  } else {
-	    if (--r < 0)
-	      break;		/* reached target zero coefficient */
-	  }
-	  k++;
-	} while (k <= Se);
-	if (s) {
-	  int pos = jpeg_natural_order[k];
-	  /* Output newly nonzero coefficient */
-	  (*block)[pos] = (JCOEF) s;
-	  /* Remember its position in case we have to suspend */
-	  newnz_pos[num_newnz++] = pos;
-	}
+        HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
+        r = s >> 4;
+        s &= 15;
+        if (s) {
+          if (s != 1)           /* size of new coef should always be 1 */
+            WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+          CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+          if (GET_BITS(1))
+            s = p1;             /* newly nonzero coef is positive */
+          else
+            s = m1;             /* newly nonzero coef is negative */
+        } else {
+          if (r != 15) {
+            EOBRUN = 1 << r;    /* EOBr, run length is 2^r + appended bits */
+            if (r) {
+              CHECK_BIT_BUFFER(br_state, r, goto undoit);
+              r = GET_BITS(r);
+              EOBRUN += r;
+            }
+            break;              /* rest of block is handled by EOB logic */
+          }
+          /* note s = 0 for processing ZRL */
+        }
+        /* Advance over already-nonzero coefs and r still-zero coefs,
+         * appending correction bits to the nonzeroes.  A correction bit is 1
+         * if the absolute value of the coefficient must be increased.
+         */
+        do {
+          thiscoef = *block + jpeg_natural_order[k];
+          if (*thiscoef != 0) {
+            CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+            if (GET_BITS(1)) {
+              if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
+                if (*thiscoef >= 0)
+                  *thiscoef += p1;
+                else
+                  *thiscoef += m1;
+              }
+            }
+          } else {
+            if (--r < 0)
+              break;            /* reached target zero coefficient */
+          }
+          k++;
+        } while (k <= Se);
+        if (s) {
+          int pos = jpeg_natural_order[k];
+          /* Output newly nonzero coefficient */
+          (*block)[pos] = (JCOEF) s;
+          /* Remember its position in case we have to suspend */
+          newnz_pos[num_newnz++] = pos;
+        }
       }
     }
 
@@ -598,18 +603,18 @@
        * if the absolute value of the coefficient must be increased.
        */
       for (; k <= Se; k++) {
-	thiscoef = *block + jpeg_natural_order[k];
-	if (*thiscoef != 0) {
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1)) {
-	    if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
-	      if (*thiscoef >= 0)
-		*thiscoef += p1;
-	      else
-		*thiscoef += m1;
-	    }
-	  }
-	}
+        thiscoef = *block + jpeg_natural_order[k];
+        if (*thiscoef != 0) {
+          CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+          if (GET_BITS(1)) {
+            if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
+              if (*thiscoef >= 0)
+                *thiscoef += p1;
+              else
+                *thiscoef += m1;
+            }
+          }
+        }
       }
       /* Count one block completed in EOB run */
       EOBRUN--;
@@ -647,7 +652,7 @@
 
   entropy = (phuff_entropy_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(phuff_entropy_decoder));
+                                sizeof(phuff_entropy_decoder));
   cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
   entropy->pub.start_pass = start_pass_phuff_decoder;
 
@@ -659,9 +664,9 @@
   /* Create progression status table */
   cinfo->coef_bits = (int (*)[DCTSIZE2])
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				cinfo->num_components*DCTSIZE2*SIZEOF(int));
+                                cinfo->num_components*DCTSIZE2*sizeof(int));
   coef_bit_ptr = & cinfo->coef_bits[0][0];
-  for (ci = 0; ci < cinfo->num_components; ci++) 
+  for (ci = 0; ci < cinfo->num_components; ci++)
     for (i = 0; i < DCTSIZE2; i++)
       *coef_bit_ptr++ = -1;
 }
diff --git a/jdpostct.c b/jdpostct.c
index 571563d..601fc2a 100644
--- a/jdpostct.c
+++ b/jdpostct.c
@@ -1,9 +1,12 @@
 /*
  * jdpostct.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the decompression postprocessing controller.
  * This controller manages the upsampling, color conversion, and color
@@ -31,37 +34,34 @@
    * For two-pass color quantization, we need a full-image buffer;
    * for one-pass operation, a strip buffer is sufficient.
    */
-  jvirt_sarray_ptr whole_image;	/* virtual array, or NULL if one-pass */
-  JSAMPARRAY buffer;		/* strip buffer, or current strip of virtual */
-  JDIMENSION strip_height;	/* buffer size in rows */
+  jvirt_sarray_ptr whole_image; /* virtual array, or NULL if one-pass */
+  JSAMPARRAY buffer;            /* strip buffer, or current strip of virtual */
+  JDIMENSION strip_height;      /* buffer size in rows */
   /* for two-pass mode only: */
-  JDIMENSION starting_row;	/* row # of first row in current strip */
-  JDIMENSION next_row;		/* index of next row to fill/empty in strip */
+  JDIMENSION starting_row;      /* row # of first row in current strip */
+  JDIMENSION next_row;          /* index of next row to fill/empty in strip */
 } my_post_controller;
 
-typedef my_post_controller * my_post_ptr;
+typedef my_post_controller *my_post_ptr;
 
 
 /* Forward declarations */
 METHODDEF(void) post_process_1pass
-	JPP((j_decompress_ptr cinfo,
-	     JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-	     JDIMENSION in_row_groups_avail,
-	     JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-	     JDIMENSION out_rows_avail));
+        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+         JDIMENSION out_rows_avail);
 #ifdef QUANT_2PASS_SUPPORTED
 METHODDEF(void) post_process_prepass
-	JPP((j_decompress_ptr cinfo,
-	     JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-	     JDIMENSION in_row_groups_avail,
-	     JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-	     JDIMENSION out_rows_avail));
+        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+         JDIMENSION out_rows_avail);
 METHODDEF(void) post_process_2pass
-	JPP((j_decompress_ptr cinfo,
-	     JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-	     JDIMENSION in_row_groups_avail,
-	     JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-	     JDIMENSION out_rows_avail));
+        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+         JDIMENSION out_rows_avail);
 #endif
 
 
@@ -84,9 +84,9 @@
        * allocate a strip buffer.  Use the virtual-array buffer as workspace.
        */
       if (post->buffer == NULL) {
-	post->buffer = (*cinfo->mem->access_virt_sarray)
-	  ((j_common_ptr) cinfo, post->whole_image,
-	   (JDIMENSION) 0, post->strip_height, TRUE);
+        post->buffer = (*cinfo->mem->access_virt_sarray)
+          ((j_common_ptr) cinfo, post->whole_image,
+           (JDIMENSION) 0, post->strip_height, TRUE);
       }
     } else {
       /* For single-pass processing without color quantization,
@@ -124,10 +124,10 @@
 
 METHODDEF(void)
 post_process_1pass (j_decompress_ptr cinfo,
-		    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-		    JDIMENSION in_row_groups_avail,
-		    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-		    JDIMENSION out_rows_avail)
+                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
+                    JDIMENSION in_row_groups_avail,
+                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                    JDIMENSION out_rows_avail)
 {
   my_post_ptr post = (my_post_ptr) cinfo->post;
   JDIMENSION num_rows, max_rows;
@@ -139,11 +139,11 @@
     max_rows = post->strip_height;
   num_rows = 0;
   (*cinfo->upsample->upsample) (cinfo,
-		input_buf, in_row_group_ctr, in_row_groups_avail,
-		post->buffer, &num_rows, max_rows);
+                input_buf, in_row_group_ctr, in_row_groups_avail,
+                post->buffer, &num_rows, max_rows);
   /* Quantize and emit data. */
   (*cinfo->cquantize->color_quantize) (cinfo,
-		post->buffer, output_buf + *out_row_ctr, (int) num_rows);
+                post->buffer, output_buf + *out_row_ctr, (int) num_rows);
   *out_row_ctr += num_rows;
 }
 
@@ -156,10 +156,10 @@
 
 METHODDEF(void)
 post_process_prepass (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-		      JDIMENSION in_row_groups_avail,
-		      JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-		      JDIMENSION out_rows_avail)
+                      JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
+                      JDIMENSION in_row_groups_avail,
+                      JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                      JDIMENSION out_rows_avail)
 {
   my_post_ptr post = (my_post_ptr) cinfo->post;
   JDIMENSION old_next_row, num_rows;
@@ -167,22 +167,22 @@
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
     post->buffer = (*cinfo->mem->access_virt_sarray)
-	((j_common_ptr) cinfo, post->whole_image,
-	 post->starting_row, post->strip_height, TRUE);
+        ((j_common_ptr) cinfo, post->whole_image,
+         post->starting_row, post->strip_height, TRUE);
   }
 
   /* Upsample some data (up to a strip height's worth). */
   old_next_row = post->next_row;
   (*cinfo->upsample->upsample) (cinfo,
-		input_buf, in_row_group_ctr, in_row_groups_avail,
-		post->buffer, &post->next_row, post->strip_height);
+                input_buf, in_row_group_ctr, in_row_groups_avail,
+                post->buffer, &post->next_row, post->strip_height);
 
   /* Allow quantizer to scan new data.  No data is emitted, */
   /* but we advance out_row_ctr so outer loop can tell when we're done. */
   if (post->next_row > old_next_row) {
     num_rows = post->next_row - old_next_row;
     (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row,
-					 (JSAMPARRAY) NULL, (int) num_rows);
+                                         (JSAMPARRAY) NULL, (int) num_rows);
     *out_row_ctr += num_rows;
   }
 
@@ -200,10 +200,10 @@
 
 METHODDEF(void)
 post_process_2pass (j_decompress_ptr cinfo,
-		    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-		    JDIMENSION in_row_groups_avail,
-		    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-		    JDIMENSION out_rows_avail)
+                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
+                    JDIMENSION in_row_groups_avail,
+                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                    JDIMENSION out_rows_avail)
 {
   my_post_ptr post = (my_post_ptr) cinfo->post;
   JDIMENSION num_rows, max_rows;
@@ -211,8 +211,8 @@
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
     post->buffer = (*cinfo->mem->access_virt_sarray)
-	((j_common_ptr) cinfo, post->whole_image,
-	 post->starting_row, post->strip_height, FALSE);
+        ((j_common_ptr) cinfo, post->whole_image,
+         post->starting_row, post->strip_height, FALSE);
   }
 
   /* Determine number of rows to emit. */
@@ -227,8 +227,8 @@
 
   /* Quantize and emit data. */
   (*cinfo->cquantize->color_quantize) (cinfo,
-		post->buffer + post->next_row, output_buf + *out_row_ctr,
-		(int) num_rows);
+                post->buffer + post->next_row, output_buf + *out_row_ctr,
+                (int) num_rows);
   *out_row_ctr += num_rows;
 
   /* Advance if we filled the strip. */
@@ -253,11 +253,11 @@
 
   post = (my_post_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_post_controller));
+                                sizeof(my_post_controller));
   cinfo->post = (struct jpeg_d_post_controller *) post;
   post->pub.start_pass = start_pass_dpost;
-  post->whole_image = NULL;	/* flag for no virtual arrays */
-  post->buffer = NULL;		/* flag for no strip buffer */
+  post->whole_image = NULL;     /* flag for no virtual arrays */
+  post->buffer = NULL;          /* flag for no strip buffer */
 
   /* Create the quantization buffer, if needed */
   if (cinfo->quantize_colors) {
@@ -271,20 +271,20 @@
       /* We round up the number of rows to a multiple of the strip height. */
 #ifdef QUANT_2PASS_SUPPORTED
       post->whole_image = (*cinfo->mem->request_virt_sarray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-	 cinfo->output_width * cinfo->out_color_components,
-	 (JDIMENSION) jround_up((long) cinfo->output_height,
-				(long) post->strip_height),
-	 post->strip_height);
+        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+         cinfo->output_width * cinfo->out_color_components,
+         (JDIMENSION) jround_up((long) cinfo->output_height,
+                                (long) post->strip_height),
+         post->strip_height);
 #else
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 #endif /* QUANT_2PASS_SUPPORTED */
     } else {
       /* One-pass color quantization: just make a strip buffer. */
       post->buffer = (*cinfo->mem->alloc_sarray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 cinfo->output_width * cinfo->out_color_components,
-	 post->strip_height);
+        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+         cinfo->output_width * cinfo->out_color_components,
+         post->strip_height);
     }
   }
 }
diff --git a/jdsample.c b/jdsample.c
index 92d6b8a..39b3725 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -5,8 +5,11 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2014, MIPS Technologies, Inc., California
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains upsampling routines.
  *
@@ -21,6 +24,7 @@
  *   Pub. by IEEE Computer Society Press, Los Alamitos, CA. ISBN 0-8186-8944-7.
  */
 
+#include "jinclude.h"
 #include "jdsample.h"
 #include "jsimd.h"
 #include "jpegcomp.h"
@@ -53,26 +57,26 @@
 
 METHODDEF(void)
 sep_upsample (j_decompress_ptr cinfo,
-	      JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-	      JDIMENSION in_row_groups_avail,
-	      JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-	      JDIMENSION out_rows_avail)
+              JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
+              JDIMENSION in_row_groups_avail,
+              JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+              JDIMENSION out_rows_avail)
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   JDIMENSION num_rows;
 
   /* Fill the conversion buffer, if it's empty */
   if (upsample->next_row_out >= cinfo->max_v_samp_factor) {
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-	 ci++, compptr++) {
+         ci++, compptr++) {
       /* Invoke per-component upsample method.  Notice we pass a POINTER
        * to color_buf[ci], so that fullsize_upsample can change it.
        */
       (*upsample->methods[ci]) (cinfo, compptr,
-	input_buf[ci] + (*in_row_group_ctr * upsample->rowgroup_height[ci]),
-	upsample->color_buf + ci);
+        input_buf[ci] + (*in_row_group_ctr * upsample->rowgroup_height[ci]),
+        upsample->color_buf + ci);
     }
     upsample->next_row_out = 0;
   }
@@ -84,7 +88,7 @@
   /* Not more than the distance to the end of the image.  Need this test
    * in case the image height is not a multiple of max_v_samp_factor:
    */
-  if (num_rows > upsample->rows_to_go) 
+  if (num_rows > upsample->rows_to_go)
     num_rows = upsample->rows_to_go;
   /* And not more than what the client can accept: */
   out_rows_avail -= *out_row_ctr;
@@ -92,9 +96,9 @@
     num_rows = out_rows_avail;
 
   (*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf,
-				     (JDIMENSION) upsample->next_row_out,
-				     output_buf + *out_row_ctr,
-				     (int) num_rows);
+                                     (JDIMENSION) upsample->next_row_out,
+                                     output_buf + *out_row_ctr,
+                                     (int) num_rows);
 
   /* Adjust counts */
   *out_row_ctr += num_rows;
@@ -120,8 +124,8 @@
  */
 
 METHODDEF(void)
-fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		   JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = input_data;
 }
@@ -133,10 +137,10 @@
  */
 
 METHODDEF(void)
-noop_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
-  *output_data_ptr = NULL;	/* safety check */
+  *output_data_ptr = NULL;      /* safety check */
 }
 
 
@@ -152,8 +156,8 @@
  */
 
 METHODDEF(void)
-int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	      JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   JSAMPARRAY output_data = *output_data_ptr;
@@ -174,15 +178,15 @@
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;	/* don't need GETJSAMPLE() here */
+      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
       for (h = h_expand; h > 0; h--) {
-	*outptr++ = invalue;
+        *outptr++ = invalue;
       }
     }
     /* Generate any additional output rows by duplicating the first one */
     if (v_expand > 1) {
       jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
-			v_expand-1, cinfo->output_width);
+                        v_expand-1, cinfo->output_width);
     }
     inrow++;
     outrow += v_expand;
@@ -196,8 +200,8 @@
  */
 
 METHODDEF(void)
-h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -210,7 +214,7 @@
     outptr = output_data[inrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;	/* don't need GETJSAMPLE() here */
+      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -224,8 +228,8 @@
  */
 
 METHODDEF(void)
-h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -239,12 +243,12 @@
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;	/* don't need GETJSAMPLE() here */
+      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
     jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
-		      1, cinfo->output_width);
+                      1, cinfo->output_width);
     inrow++;
     outrow += 2;
   }
@@ -267,8 +271,8 @@
  */
 
 METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -308,15 +312,15 @@
  */
 
 METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr0, inptr1, outptr;
 #if BITS_IN_JSAMPLE == 8
   register int thiscolsum, lastcolsum, nextcolsum;
 #else
-  register INT32 thiscolsum, lastcolsum, nextcolsum;
+  register JLONG thiscolsum, lastcolsum, nextcolsum;
 #endif
   register JDIMENSION colctr;
   int inrow, outrow, v;
@@ -326,10 +330,10 @@
     for (v = 0; v < 2; v++) {
       /* inptr0 points to nearest input row, inptr1 points to next nearest */
       inptr0 = input_data[inrow];
-      if (v == 0)		/* next nearest is row above */
-	inptr1 = input_data[inrow-1];
-      else			/* next nearest is row below */
-	inptr1 = input_data[inrow+1];
+      if (v == 0)               /* next nearest is row above */
+        inptr1 = input_data[inrow-1];
+      else                      /* next nearest is row below */
+        inptr1 = input_data[inrow+1];
       outptr = output_data[outrow++];
 
       /* Special case for first column */
@@ -340,12 +344,12 @@
       lastcolsum = thiscolsum; thiscolsum = nextcolsum;
 
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
-	/* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
-	/* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-	nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-	*outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-	*outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-	lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+        /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
+        /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
+        nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
+        *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
+        lastcolsum = thiscolsum; thiscolsum = nextcolsum;
       }
 
       /* Special case for last column */
@@ -366,19 +370,22 @@
 {
   my_upsample_ptr upsample;
   int ci;
-  jpeg_component_info * compptr;
+  jpeg_component_info *compptr;
   boolean need_buffer, do_fancy;
   int h_in_group, v_in_group, h_out_group, v_out_group;
 
-  upsample = (my_upsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_upsampler));
-  cinfo->upsample = (struct jpeg_upsampler *) upsample;
-  upsample->pub.start_pass = start_pass_upsample;
-  upsample->pub.upsample = sep_upsample;
-  upsample->pub.need_context_rows = FALSE; /* until we find out differently */
+  if (!cinfo->master->jinit_upsampler_no_alloc) {
+    upsample = (my_upsample_ptr)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                  sizeof(my_upsampler));
+    cinfo->upsample = (struct jpeg_upsampler *) upsample;
+    upsample->pub.start_pass = start_pass_upsample;
+    upsample->pub.upsample = sep_upsample;
+    upsample->pub.need_context_rows = FALSE; /* until we find out differently */
+  } else
+    upsample = (my_upsample_ptr) cinfo->upsample;
 
-  if (cinfo->CCIR601_sampling)	/* this isn't supported */
+  if (cinfo->CCIR601_sampling)  /* this isn't supported */
     ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
 
   /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
@@ -395,9 +402,9 @@
      * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
      */
     h_in_group = (compptr->h_samp_factor * compptr->_DCT_scaled_size) /
-		 cinfo->_min_DCT_scaled_size;
+                 cinfo->_min_DCT_scaled_size;
     v_in_group = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
-		 cinfo->_min_DCT_scaled_size;
+                 cinfo->_min_DCT_scaled_size;
     h_out_group = cinfo->max_h_samp_factor;
     v_out_group = cinfo->max_v_samp_factor;
     upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
@@ -411,48 +418,53 @@
       upsample->methods[ci] = fullsize_upsample;
       need_buffer = FALSE;
     } else if (h_in_group * 2 == h_out_group &&
-	       v_in_group == v_out_group) {
+               v_in_group == v_out_group) {
       /* Special cases for 2h1v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
-	if (jsimd_can_h2v1_fancy_upsample())
-	  upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
-	else
-	  upsample->methods[ci] = h2v1_fancy_upsample;
+        if (jsimd_can_h2v1_fancy_upsample())
+          upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
+        else
+          upsample->methods[ci] = h2v1_fancy_upsample;
       } else {
-	if (jsimd_can_h2v1_upsample())
-	  upsample->methods[ci] = jsimd_h2v1_upsample;
-	else
-	  upsample->methods[ci] = h2v1_upsample;
+        if (jsimd_can_h2v1_upsample())
+          upsample->methods[ci] = jsimd_h2v1_upsample;
+        else
+          upsample->methods[ci] = h2v1_upsample;
       }
     } else if (h_in_group * 2 == h_out_group &&
-	       v_in_group * 2 == v_out_group) {
+               v_in_group * 2 == v_out_group) {
       /* Special cases for 2h2v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
-	if (jsimd_can_h2v2_fancy_upsample())
-	  upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
-	else
-	  upsample->methods[ci] = h2v2_fancy_upsample;
-	upsample->pub.need_context_rows = TRUE;
+        if (jsimd_can_h2v2_fancy_upsample())
+          upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
+        else
+          upsample->methods[ci] = h2v2_fancy_upsample;
+        upsample->pub.need_context_rows = TRUE;
       } else {
-	if (jsimd_can_h2v2_upsample())
-	  upsample->methods[ci] = jsimd_h2v2_upsample;
-	else
-	  upsample->methods[ci] = h2v2_upsample;
+        if (jsimd_can_h2v2_upsample())
+          upsample->methods[ci] = jsimd_h2v2_upsample;
+        else
+          upsample->methods[ci] = h2v2_upsample;
       }
     } else if ((h_out_group % h_in_group) == 0 &&
-	       (v_out_group % v_in_group) == 0) {
+               (v_out_group % v_in_group) == 0) {
       /* Generic integral-factors upsampling method */
-      upsample->methods[ci] = int_upsample;
+#if defined(__mips__)
+      if (jsimd_can_int_upsample())
+        upsample->methods[ci] = jsimd_int_upsample;
+      else
+#endif
+        upsample->methods[ci] = int_upsample;
       upsample->h_expand[ci] = (UINT8) (h_out_group / h_in_group);
       upsample->v_expand[ci] = (UINT8) (v_out_group / v_in_group);
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
-    if (need_buffer) {
+    if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
       upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
-	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 (JDIMENSION) jround_up((long) cinfo->output_width,
-				(long) cinfo->max_h_samp_factor),
-	 (JDIMENSION) cinfo->max_v_samp_factor);
+        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+         (JDIMENSION) jround_up((long) cinfo->output_width,
+                                (long) cinfo->max_h_samp_factor),
+         (JDIMENSION) cinfo->max_v_samp_factor);
     }
   }
 }
diff --git a/jdsample.h b/jdsample.h
index 5226f26..a6bf08a 100644
--- a/jdsample.h
+++ b/jdsample.h
@@ -3,19 +3,19 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  */
 
 #define JPEG_INTERNALS
-#include "jinclude.h"
 #include "jpeglib.h"
 
 
 /* Pointer to routine to upsample a single component */
 typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
-                               jpeg_component_info * compptr,
+                               jpeg_component_info *compptr,
                                JSAMPARRAY input_data,
-                               JSAMPARRAY * output_data_ptr);
+                               JSAMPARRAY *output_data_ptr);
 
 /* Private subobject */
 
@@ -47,4 +47,4 @@
   UINT8 v_expand[MAX_COMPONENTS];
 } my_upsampler;
 
-typedef my_upsampler * my_upsample_ptr;
+typedef my_upsampler *my_upsample_ptr;
diff --git a/jdtrans.c b/jdtrans.c
index f0cd0ae..cfc85dd 100644
--- a/jdtrans.c
+++ b/jdtrans.c
@@ -1,9 +1,12 @@
 /*
  * jdtrans.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains library routines for transcoding decompression,
  * that is, reading raw DCT coefficient arrays from an input JPEG file.
@@ -16,7 +19,7 @@
 
 
 /* Forward declarations */
-LOCAL(void) transdecode_master_selection JPP((j_decompress_ptr cinfo));
+LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
 
 
 /*
@@ -55,20 +58,20 @@
       int retcode;
       /* Call progress monitor hook if present */
       if (cinfo->progress != NULL)
-	(*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
       /* Absorb some more input */
       retcode = (*cinfo->inputctl->consume_input) (cinfo);
       if (retcode == JPEG_SUSPENDED)
-	return NULL;
+        return NULL;
       if (retcode == JPEG_REACHED_EOI)
-	break;
+        break;
       /* Advance progress counter if appropriate */
       if (cinfo->progress != NULL &&
-	  (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
-	if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
-	  /* startup underestimated number of scans; ratchet up one scan */
-	  cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
-	}
+          (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
+        if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
+          /* startup underestimated number of scans; ratchet up one scan */
+          cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+        }
       }
     }
     /* Set state so that jpeg_finish_decompress does the right thing */
@@ -84,7 +87,7 @@
   }
   /* Oops, improper usage */
   ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
-  return NULL;			/* keep compiler happy */
+  return NULL;                  /* keep compiler happy */
 }
 
 
diff --git a/jerror.c b/jerror.c
index 3da7be8..c31acd9 100644
--- a/jerror.c
+++ b/jerror.c
@@ -1,9 +1,12 @@
 /*
  * jerror.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains simple error-reporting and trace-message routines.
  * These are suitable for Unix-like systems and others where writing to
@@ -28,7 +31,7 @@
 #include <windows.h>
 #endif
 
-#ifndef EXIT_FAILURE		/* define exit() codes if not provided */
+#ifndef EXIT_FAILURE            /* define exit() codes if not provided */
 #define EXIT_FAILURE  1
 #endif
 
@@ -41,11 +44,7 @@
  * want to refer to it directly.
  */
 
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_std_message_table	jMsgTable
-#endif
-
-#define JMESSAGE(code,string)	string ,
+#define JMESSAGE(code,string)   string ,
 
 const char * const jpeg_std_message_table[] = {
 #include "jerror.h"
@@ -105,7 +104,7 @@
 #ifdef USE_WINDOWS_MESSAGEBOX
   /* Display it in a message dialog box */
   MessageBox(GetActiveWindow(), buffer, "JPEG Library Error",
-	     MB_OK | MB_ICONERROR);
+             MB_OK | MB_ICONERROR);
 #else
   /* Send it to stderr, adding a newline */
   fprintf(stderr, "%s\n", buffer);
@@ -127,7 +126,7 @@
 METHODDEF(void)
 emit_message (j_common_ptr cinfo, int msg_level)
 {
-  struct jpeg_error_mgr * err = cinfo->err;
+  struct jpeg_error_mgr *err = cinfo->err;
 
   if (msg_level < 0) {
     /* It's a warning message.  Since corrupt files may generate many warnings,
@@ -154,12 +153,12 @@
  */
 
 METHODDEF(void)
-format_message (j_common_ptr cinfo, char * buffer)
+format_message (j_common_ptr cinfo, char *buffer)
 {
-  struct jpeg_error_mgr * err = cinfo->err;
+  struct jpeg_error_mgr *err = cinfo->err;
   int msg_code = err->msg_code;
-  const char * msgtext = NULL;
-  const char * msgptr;
+  const char *msgtext = NULL;
+  const char *msgptr;
   char ch;
   boolean isstring;
 
@@ -167,8 +166,8 @@
   if (msg_code > 0 && msg_code <= err->last_jpeg_message) {
     msgtext = err->jpeg_message_table[msg_code];
   } else if (err->addon_message_table != NULL &&
-	     msg_code >= err->first_addon_message &&
-	     msg_code <= err->last_addon_message) {
+             msg_code >= err->first_addon_message &&
+             msg_code <= err->last_addon_message) {
     msgtext = err->addon_message_table[msg_code - err->first_addon_message];
   }
 
@@ -193,10 +192,10 @@
     sprintf(buffer, msgtext, err->msg_parm.s);
   else
     sprintf(buffer, msgtext,
-	    err->msg_parm.i[0], err->msg_parm.i[1],
-	    err->msg_parm.i[2], err->msg_parm.i[3],
-	    err->msg_parm.i[4], err->msg_parm.i[5],
-	    err->msg_parm.i[6], err->msg_parm.i[7]);
+            err->msg_parm.i[0], err->msg_parm.i[1],
+            err->msg_parm.i[2], err->msg_parm.i[3],
+            err->msg_parm.i[4], err->msg_parm.i[5],
+            err->msg_parm.i[6], err->msg_parm.i[7]);
 }
 
 
@@ -213,22 +212,22 @@
 {
   cinfo->err->num_warnings = 0;
   /* trace_level is not reset since it is an application-supplied parameter */
-  cinfo->err->msg_code = 0;	/* may be useful as a flag for "no error" */
+  cinfo->err->msg_code = 0;     /* may be useful as a flag for "no error" */
 }
 
 
 /*
  * Fill in the standard error-handling methods in a jpeg_error_mgr object.
  * Typical call is:
- *	struct jpeg_compress_struct cinfo;
- *	struct jpeg_error_mgr err;
+ *      struct jpeg_compress_struct cinfo;
+ *      struct jpeg_error_mgr err;
  *
- *	cinfo.err = jpeg_std_error(&err);
+ *      cinfo.err = jpeg_std_error(&err);
  * after which the application may override some of the methods.
  */
 
 GLOBAL(struct jpeg_error_mgr *)
-jpeg_std_error (struct jpeg_error_mgr * err)
+jpeg_std_error (struct jpeg_error_mgr *err)
 {
   err->error_exit = error_exit;
   err->emit_message = emit_message;
@@ -236,16 +235,16 @@
   err->format_message = format_message;
   err->reset_error_mgr = reset_error_mgr;
 
-  err->trace_level = 0;		/* default = no tracing */
-  err->num_warnings = 0;	/* no warnings emitted yet */
-  err->msg_code = 0;		/* may be useful as a flag for "no error" */
+  err->trace_level = 0;         /* default = no tracing */
+  err->num_warnings = 0;        /* no warnings emitted yet */
+  err->msg_code = 0;            /* may be useful as a flag for "no error" */
 
   /* Initialize message table pointers */
   err->jpeg_message_table = jpeg_std_message_table;
   err->last_jpeg_message = (int) JMSG_LASTMSGCODE - 1;
 
   err->addon_message_table = NULL;
-  err->first_addon_message = 0;	/* for safety */
+  err->first_addon_message = 0; /* for safety */
   err->last_addon_message = 0;
 
   return err;
diff --git a/jerror.h b/jerror.h
index 275086e..11a07cb 100644
--- a/jerror.h
+++ b/jerror.h
@@ -1,10 +1,13 @@
 /*
  * jerror.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2014, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file defines the error and message codes for the JPEG library.
  * Edit this file to add new codes, or to translate the message strings to
@@ -33,7 +36,7 @@
 
 typedef enum {
 
-#define JMESSAGE(code,string)	code ,
+#define JMESSAGE(code,string)   code ,
 
 #endif /* JMAKE_ENUM_LIST */
 
@@ -42,7 +45,7 @@
 /* For maintenance convenience, list is alphabetical by message code name */
 #if JPEG_LIB_VERSION < 70
 JMESSAGE(JERR_ARITH_NOTIMPL,
-	 "Sorry, arithmetic coding is not implemented")
+         "Sorry, arithmetic coding is not implemented")
 #endif
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
@@ -55,26 +58,26 @@
 JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
 #if JPEG_LIB_VERSION >= 70
 JMESSAGE(JERR_BAD_DROP_SAMPLING,
-	 "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
 #endif
 JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
 JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
 JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
 JMESSAGE(JERR_BAD_LENGTH, "Bogus marker length")
 JMESSAGE(JERR_BAD_LIB_VERSION,
-	 "Wrong JPEG library version: library is %d, caller expects %d")
+         "Wrong JPEG library version: library is %d, caller expects %d")
 JMESSAGE(JERR_BAD_MCU_SIZE, "Sampling factors too large for interleaved scan")
 JMESSAGE(JERR_BAD_POOL_ID, "Invalid memory pool code %d")
 JMESSAGE(JERR_BAD_PRECISION, "Unsupported JPEG data precision %d")
 JMESSAGE(JERR_BAD_PROGRESSION,
-	 "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d")
+         "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d")
 JMESSAGE(JERR_BAD_PROG_SCRIPT,
-	 "Invalid progressive parameters at scan script entry %d")
+         "Invalid progressive parameters at scan script entry %d")
 JMESSAGE(JERR_BAD_SAMPLING, "Bogus sampling factors")
 JMESSAGE(JERR_BAD_SCAN_SCRIPT, "Invalid scan script at entry %d")
 JMESSAGE(JERR_BAD_STATE, "Improper call to JPEG library in state %d")
 JMESSAGE(JERR_BAD_STRUCT_SIZE,
-	 "JPEG parameter struct mismatch: library thinks size is %u, caller expects %u")
+         "JPEG parameter struct mismatch: library thinks size is %u, caller expects %u")
 JMESSAGE(JERR_BAD_VIRTUAL_ACCESS, "Bogus virtual array access")
 JMESSAGE(JERR_BUFFER_SIZE, "Buffer passed to JPEG library is too small")
 JMESSAGE(JERR_CANT_SUSPEND, "Suspension not allowed here")
@@ -98,7 +101,7 @@
 JMESSAGE(JERR_INPUT_EMPTY, "Empty input file")
 JMESSAGE(JERR_INPUT_EOF, "Premature end of input file")
 JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
-	 "Cannot transcode due to multiple use of quantization table %d")
+         "Cannot transcode due to multiple use of quantization table %d")
 JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
 JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
@@ -113,7 +116,7 @@
 JMESSAGE(JERR_NO_SOI, "Not a JPEG file: starts with 0x%02x 0x%02x")
 JMESSAGE(JERR_OUT_OF_MEMORY, "Insufficient memory (case %d)")
 JMESSAGE(JERR_QUANT_COMPONENTS,
-	 "Cannot quantize more than %d color components")
+         "Cannot quantize more than %d color components")
 JMESSAGE(JERR_QUANT_FEW_COLORS, "Cannot quantize to fewer than %d colors")
 JMESSAGE(JERR_QUANT_MANY_COLORS, "Cannot quantize to more than %d colors")
 JMESSAGE(JERR_SOF_DUPLICATE, "Invalid JPEG file structure: two SOF markers")
@@ -125,19 +128,19 @@
 JMESSAGE(JERR_TFILE_READ, "Read failed on temporary file")
 JMESSAGE(JERR_TFILE_SEEK, "Seek failed on temporary file")
 JMESSAGE(JERR_TFILE_WRITE,
-	 "Write failed on temporary file --- out of disk space?")
+         "Write failed on temporary file --- out of disk space?")
 JMESSAGE(JERR_TOO_LITTLE_DATA, "Application transferred too few scanlines")
 JMESSAGE(JERR_UNKNOWN_MARKER, "Unsupported marker type 0x%02x")
 JMESSAGE(JERR_VIRTUAL_BUG, "Virtual array controller messed up")
 JMESSAGE(JERR_WIDTH_OVERFLOW, "Image too wide for this implementation")
 JMESSAGE(JERR_XMS_READ, "Read from XMS failed")
 JMESSAGE(JERR_XMS_WRITE, "Write to XMS failed")
-JMESSAGE(JMSG_COPYRIGHT, JCOPYRIGHT)
+JMESSAGE(JMSG_COPYRIGHT, JCOPYRIGHT_SHORT)
 JMESSAGE(JMSG_VERSION, JVERSION)
 JMESSAGE(JTRC_16BIT_TABLES,
-	 "Caution: quantization tables are too coarse for baseline JPEG")
+         "Caution: quantization tables are too coarse for baseline JPEG")
 JMESSAGE(JTRC_ADOBE,
-	 "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d")
+         "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d")
 JMESSAGE(JTRC_APP0, "Unknown APP0 marker (not JFIF), length %u")
 JMESSAGE(JTRC_APP14, "Unknown APP14 marker (not Adobe), length %u")
 JMESSAGE(JTRC_DAC, "Define Arithmetic Table 0x%02x: 0x%02x")
@@ -150,9 +153,9 @@
 JMESSAGE(JTRC_HUFFBITS, "        %3d %3d %3d %3d %3d %3d %3d %3d")
 JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d  %d")
 JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
-	 "Warning: thumbnail image size does not match data length %u")
+         "Warning: thumbnail image size does not match data length %u")
 JMESSAGE(JTRC_JFIF_EXTENSION,
-	 "JFIF extension marker: type 0x%02x, length %u")
+         "JFIF extension marker: type 0x%02x, length %u")
 JMESSAGE(JTRC_JFIF_THUMBNAIL, "    with %d x %d thumbnail image")
 JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
 JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
@@ -163,7 +166,7 @@
 JMESSAGE(JTRC_RECOVERY_ACTION, "At marker 0x%02x, recovery action %d")
 JMESSAGE(JTRC_RST, "RST%d")
 JMESSAGE(JTRC_SMOOTH_NOTIMPL,
-	 "Smoothing not supported with nonstandard sampling ratios")
+         "Smoothing not supported with nonstandard sampling ratios")
 JMESSAGE(JTRC_SOF, "Start Of Frame 0x%02x: width=%u, height=%u, components=%d")
 JMESSAGE(JTRC_SOF_COMPONENT, "    Component %d: %dhx%dv q=%d")
 JMESSAGE(JTRC_SOI, "Start of Image")
@@ -173,13 +176,13 @@
 JMESSAGE(JTRC_TFILE_CLOSE, "Closed temporary file %s")
 JMESSAGE(JTRC_TFILE_OPEN, "Opened temporary file %s")
 JMESSAGE(JTRC_THUMB_JPEG,
-	 "JFIF extension marker: JPEG-compressed thumbnail image, length %u")
+         "JFIF extension marker: JPEG-compressed thumbnail image, length %u")
 JMESSAGE(JTRC_THUMB_PALETTE,
-	 "JFIF extension marker: palette thumbnail image, length %u")
+         "JFIF extension marker: palette thumbnail image, length %u")
 JMESSAGE(JTRC_THUMB_RGB,
-	 "JFIF extension marker: RGB thumbnail image, length %u")
+         "JFIF extension marker: RGB thumbnail image, length %u")
 JMESSAGE(JTRC_UNKNOWN_IDS,
-	 "Unrecognized component IDs %d %d %d, assuming YCbCr")
+         "Unrecognized component IDs %d %d %d, assuming YCbCr")
 JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
 JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
 JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
@@ -187,15 +190,15 @@
 JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 #endif
 JMESSAGE(JWRN_BOGUS_PROGRESSION,
-	 "Inconsistent progression sequence for component %d coefficient %d")
+         "Inconsistent progression sequence for component %d coefficient %d")
 JMESSAGE(JWRN_EXTRANEOUS_DATA,
-	 "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x")
+         "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x")
 JMESSAGE(JWRN_HIT_MARKER, "Corrupt JPEG data: premature end of data segment")
 JMESSAGE(JWRN_HUFF_BAD_CODE, "Corrupt JPEG data: bad Huffman code")
 JMESSAGE(JWRN_JFIF_MAJOR, "Warning: unknown JFIF revision number %d.%02d")
 JMESSAGE(JWRN_JPEG_EOF, "Premature end of JPEG file")
 JMESSAGE(JWRN_MUST_RESYNC,
-	 "Corrupt JPEG data: found marker 0x%02x instead of RST%d")
+         "Corrupt JPEG data: found marker 0x%02x instead of RST%d")
 JMESSAGE(JWRN_NOT_SEQUENTIAL, "Invalid SOS parameters for sequential JPEG")
 JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
 #if JPEG_LIB_VERSION < 70
@@ -255,7 +258,7 @@
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
    (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
 
-#define MAKESTMT(stuff)		do { stuff } while (0)
+#define MAKESTMT(stuff)         do { stuff } while (0)
 
 /* Nonfatal errors (we can keep going, but the data is probably corrupt) */
 #define WARNMS(cinfo,code)  \
@@ -286,26 +289,26 @@
    (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
 #define TRACEMS3(cinfo,lvl,code,p1,p2,p3)  \
   MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
-	   (cinfo)->err->msg_code = (code); \
-	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
 #define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4)  \
   MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
-	   (cinfo)->err->msg_code = (code); \
-	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
 #define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5)  \
   MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
-	   _mp[4] = (p5); \
-	   (cinfo)->err->msg_code = (code); \
-	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+           _mp[4] = (p5); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
 #define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8)  \
   MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
-	   _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
-	   (cinfo)->err->msg_code = (code); \
-	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+           _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
 #define TRACEMSS(cinfo,lvl,code,str)  \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
diff --git a/jfdctflt.c b/jfdctflt.c
index 79d7a00..b3da3eb 100644
--- a/jfdctflt.c
+++ b/jfdctflt.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a floating-point implementation of the
  * forward DCT (Discrete Cosine Transform).
@@ -20,8 +21,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -37,7 +38,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 
 #ifdef DCT_FLOAT_SUPPORTED
 
@@ -56,7 +57,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT * data)
+jpeg_fdct_float (FAST_FLOAT *data)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -76,24 +77,24 @@
     tmp5 = dataptr[2] - dataptr[5];
     tmp3 = dataptr[3] + dataptr[4];
     tmp4 = dataptr[3] - dataptr[4];
-    
+
     /* Even part */
-    
-    tmp10 = tmp0 + tmp3;	/* phase 2 */
+
+    tmp10 = tmp0 + tmp3;        /* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[0] = tmp10 + tmp11; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
-    
+
     z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
-    dataptr[2] = tmp13 + z1;	/* phase 5 */
+    dataptr[2] = tmp13 + z1;    /* phase 5 */
     dataptr[6] = tmp13 - z1;
-    
+
     /* Odd part */
 
-    tmp10 = tmp4 + tmp5;	/* phase 2 */
+    tmp10 = tmp4 + tmp5;        /* phase 2 */
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp6 + tmp7;
 
@@ -103,15 +104,15 @@
     z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
     z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
 
-    z11 = tmp7 + z3;		/* phase 5 */
+    z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
-    dataptr[5] = z13 + z2;	/* phase 6 */
+    dataptr[5] = z13 + z2;      /* phase 6 */
     dataptr[3] = z13 - z2;
     dataptr[1] = z11 + z4;
     dataptr[7] = z11 - z4;
 
-    dataptr += DCTSIZE;		/* advance pointer to next row */
+    dataptr += DCTSIZE;         /* advance pointer to next row */
   }
 
   /* Pass 2: process columns. */
@@ -126,24 +127,24 @@
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
+
     /* Even part */
-    
-    tmp10 = tmp0 + tmp3;	/* phase 2 */
+
+    tmp10 = tmp0 + tmp3;        /* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
     dataptr[DCTSIZE*4] = tmp10 - tmp11;
-    
+
     z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
     dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
     dataptr[DCTSIZE*6] = tmp13 - z1;
-    
+
     /* Odd part */
 
-    tmp10 = tmp4 + tmp5;	/* phase 2 */
+    tmp10 = tmp4 + tmp5;        /* phase 2 */
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp6 + tmp7;
 
@@ -153,7 +154,7 @@
     z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
     z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
 
-    z11 = tmp7 + z3;		/* phase 5 */
+    z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
     dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
@@ -161,7 +162,7 @@
     dataptr[DCTSIZE*1] = z11 + z4;
     dataptr[DCTSIZE*7] = z11 - z4;
 
-    dataptr++;			/* advance pointer to next column */
+    dataptr++;                  /* advance pointer to next column */
   }
 }
 
diff --git a/jfdctfst.c b/jfdctfst.c
index ccb378a..82b2515 100644
--- a/jfdctfst.c
+++ b/jfdctfst.c
@@ -1,9 +1,12 @@
 /*
  * jfdctfst.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a fast, not so accurate integer implementation of the
  * forward DCT (Discrete Cosine Transform).
@@ -15,8 +18,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -33,7 +36,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 
 #ifdef DCT_IFAST_SUPPORTED
 
@@ -76,10 +79,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_0_382683433  ((INT32)   98)		/* FIX(0.382683433) */
-#define FIX_0_541196100  ((INT32)  139)		/* FIX(0.541196100) */
-#define FIX_0_707106781  ((INT32)  181)		/* FIX(0.707106781) */
-#define FIX_1_306562965  ((INT32)  334)		/* FIX(1.306562965) */
+#define FIX_0_382683433  ((JLONG)   98)         /* FIX(0.382683433) */
+#define FIX_0_541196100  ((JLONG)  139)         /* FIX(0.541196100) */
+#define FIX_0_707106781  ((JLONG)  181)         /* FIX(0.707106781) */
+#define FIX_1_306562965  ((JLONG)  334)         /* FIX(1.306562965) */
 #else
 #define FIX_0_382683433  FIX(0.382683433)
 #define FIX_0_541196100  FIX(0.541196100)
@@ -99,7 +102,7 @@
 #endif
 
 
-/* Multiply a DCTELEM variable by an INT32 constant, and immediately
+/* Multiply a DCTELEM variable by a JLONG constant, and immediately
  * descale to yield a DCTELEM result.
  */
 
@@ -111,7 +114,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM * data)
+jpeg_fdct_ifast (DCTELEM *data)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -132,24 +135,24 @@
     tmp5 = dataptr[2] - dataptr[5];
     tmp3 = dataptr[3] + dataptr[4];
     tmp4 = dataptr[3] - dataptr[4];
-    
+
     /* Even part */
-    
-    tmp10 = tmp0 + tmp3;	/* phase 2 */
+
+    tmp10 = tmp0 + tmp3;        /* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[0] = tmp10 + tmp11; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
-    dataptr[2] = tmp13 + z1;	/* phase 5 */
+    dataptr[2] = tmp13 + z1;    /* phase 5 */
     dataptr[6] = tmp13 - z1;
-    
+
     /* Odd part */
 
-    tmp10 = tmp4 + tmp5;	/* phase 2 */
+    tmp10 = tmp4 + tmp5;        /* phase 2 */
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp6 + tmp7;
 
@@ -159,15 +162,15 @@
     z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
     z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
 
-    z11 = tmp7 + z3;		/* phase 5 */
+    z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
-    dataptr[5] = z13 + z2;	/* phase 6 */
+    dataptr[5] = z13 + z2;      /* phase 6 */
     dataptr[3] = z13 - z2;
     dataptr[1] = z11 + z4;
     dataptr[7] = z11 - z4;
 
-    dataptr += DCTSIZE;		/* advance pointer to next row */
+    dataptr += DCTSIZE;         /* advance pointer to next row */
   }
 
   /* Pass 2: process columns. */
@@ -182,24 +185,24 @@
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
+
     /* Even part */
-    
-    tmp10 = tmp0 + tmp3;	/* phase 2 */
+
+    tmp10 = tmp0 + tmp3;        /* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
     dataptr[DCTSIZE*4] = tmp10 - tmp11;
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
     dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
     dataptr[DCTSIZE*6] = tmp13 - z1;
-    
+
     /* Odd part */
 
-    tmp10 = tmp4 + tmp5;	/* phase 2 */
+    tmp10 = tmp4 + tmp5;        /* phase 2 */
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp6 + tmp7;
 
@@ -209,7 +212,7 @@
     z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
     z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
 
-    z11 = tmp7 + z3;		/* phase 5 */
+    z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
     dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
@@ -217,7 +220,7 @@
     dataptr[DCTSIZE*1] = z11 + z4;
     dataptr[DCTSIZE*7] = z11 - z4;
 
-    dataptr++;			/* advance pointer to next column */
+    dataptr++;                  /* advance pointer to next column */
   }
 }
 
diff --git a/jfdctint.c b/jfdctint.c
index 0a78b64..73e0b59 100644
--- a/jfdctint.c
+++ b/jfdctint.c
@@ -1,9 +1,12 @@
 /*
  * jfdctint.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a slow-but-accurate integer implementation of the
  * forward DCT (Discrete Cosine Transform).
@@ -26,7 +29,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 
 #ifdef DCT_ISLOW_SUPPORTED
 
@@ -67,7 +70,7 @@
  * they are represented to better-than-integral precision.  These outputs
  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  * with the recommended scaling.  (For 12-bit sample data, the intermediate
- * array is INT32 anyway.)
+ * array is JLONG anyway.)
  *
  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
@@ -79,7 +82,7 @@
 #define PASS1_BITS  2
 #else
 #define CONST_BITS  13
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
+#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
 #endif
 
 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
@@ -90,18 +93,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
-#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
-#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
-#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
-#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
-#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
-#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
-#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
-#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
-#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
-#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
-#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -118,7 +121,7 @@
 #endif
 
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * For 8-bit samples with the recommended scaling, all the variable
  * and constant values involved are no more than 16 bits wide, so a
  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
@@ -137,11 +140,11 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_islow (DCTELEM * data)
+jpeg_fdct_islow (DCTELEM *data)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  JLONG tmp10, tmp11, tmp12, tmp13;
+  JLONG z1, z2, z3, z4, z5;
   DCTELEM *dataptr;
   int ctr;
   SHIFT_TEMPS
@@ -160,36 +163,36 @@
     tmp5 = dataptr[2] - dataptr[5];
     tmp3 = dataptr[3] + dataptr[4];
     tmp4 = dataptr[3] - dataptr[4];
-    
+
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
      */
-    
+
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
-    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
-    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
-    
+
+    dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+    dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
     dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-				   CONST_BITS-PASS1_BITS);
+                                   CONST_BITS-PASS1_BITS);
     dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-				   CONST_BITS-PASS1_BITS);
-    
+                                   CONST_BITS-PASS1_BITS);
+
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
      * cK represents cos(K*pi/16).
      * i0..i3 in the paper are tmp4..tmp7 here.
      */
-    
+
     z1 = tmp4 + tmp7;
     z2 = tmp5 + tmp6;
     z3 = tmp4 + tmp6;
     z4 = tmp5 + tmp7;
     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+
     tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
@@ -198,16 +201,16 @@
     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
+
     z3 += z5;
     z4 += z5;
-    
+
     dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
     dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
     dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
     dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
-    
-    dataptr += DCTSIZE;		/* advance pointer to next row */
+
+    dataptr += DCTSIZE;         /* advance pointer to next row */
   }
 
   /* Pass 2: process columns.
@@ -225,36 +228,36 @@
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
+
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
      */
-    
+
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-					   CONST_BITS+PASS1_BITS);
+                                           CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-					   CONST_BITS+PASS1_BITS);
-    
+                                           CONST_BITS+PASS1_BITS);
+
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
      * cK represents cos(K*pi/16).
      * i0..i3 in the paper are tmp4..tmp7 here.
      */
-    
+
     z1 = tmp4 + tmp7;
     z2 = tmp5 + tmp6;
     z3 = tmp4 + tmp6;
     z4 = tmp5 + tmp7;
     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+
     tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
@@ -263,20 +266,20 @@
     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
+
     z3 += z5;
     z4 += z5;
-    
+
     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
-					   CONST_BITS+PASS1_BITS);
+                                           CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
-					   CONST_BITS+PASS1_BITS);
+                                           CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
-					   CONST_BITS+PASS1_BITS);
+                                           CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
-					   CONST_BITS+PASS1_BITS);
-    
-    dataptr++;			/* advance pointer to next column */
+                                           CONST_BITS+PASS1_BITS);
+
+    dataptr++;                  /* advance pointer to next column */
   }
 }
 
diff --git a/jidctflt.c b/jidctflt.c
index 0188ce3..68c521e 100644
--- a/jidctflt.c
+++ b/jidctflt.c
@@ -1,9 +1,13 @@
 /*
  * jidctflt.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * Modified 2010 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2014, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a floating-point implementation of the
  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
@@ -22,8 +26,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -39,7 +43,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 
 #ifdef DCT_FLOAT_SUPPORTED
 
@@ -65,21 +69,21 @@
  */
 
 GLOBAL(void)
-jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
   FAST_FLOAT z5, z10, z11, z12, z13;
   JCOEFPTR inptr;
-  FLOAT_MULT_TYPE * quantptr;
-  FAST_FLOAT * wsptr;
+  FLOAT_MULT_TYPE *quantptr;
+  FAST_FLOAT *wsptr;
   JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  JSAMPLE *range_limit = cinfo->sample_range_limit;
   int ctr;
   FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
-  SHIFT_TEMPS
+  #define _0_125 ((FLOAT_MULT_TYPE)0.125)
 
   /* Pass 1: process columns from input, store into work array. */
 
@@ -95,14 +99,15 @@
      * With typical images and quantization tables, half or more of the
      * column DCT calculations can be simplified this way.
      */
-    
+
     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-	inptr[DCTSIZE*7] == 0) {
+        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+        inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero */
-      FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-      
+      FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0],
+                                    quantptr[DCTSIZE*0] * _0_125);
+
       wsptr[DCTSIZE*0] = dcval;
       wsptr[DCTSIZE*1] = dcval;
       wsptr[DCTSIZE*2] = dcval;
@@ -111,53 +116,53 @@
       wsptr[DCTSIZE*5] = dcval;
       wsptr[DCTSIZE*6] = dcval;
       wsptr[DCTSIZE*7] = dcval;
-      
-      inptr++;			/* advance pointers to next column */
+
+      inptr++;                  /* advance pointers to next column */
       quantptr++;
       wsptr++;
       continue;
     }
-    
+
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0] * _0_125);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2] * _0_125);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4] * _0_125);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6] * _0_125);
 
-    tmp10 = tmp0 + tmp2;	/* phase 3 */
+    tmp10 = tmp0 + tmp2;        /* phase 3 */
     tmp11 = tmp0 - tmp2;
 
-    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
+    tmp13 = tmp1 + tmp3;        /* phases 5-3 */
     tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */
 
-    tmp0 = tmp10 + tmp13;	/* phase 2 */
+    tmp0 = tmp10 + tmp13;       /* phase 2 */
     tmp3 = tmp10 - tmp13;
     tmp1 = tmp11 + tmp12;
     tmp2 = tmp11 - tmp12;
-    
+
     /* Odd part */
 
-    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1] * _0_125);
+    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3] * _0_125);
+    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5] * _0_125);
+    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7] * _0_125);
 
-    z13 = tmp6 + tmp5;		/* phase 6 */
+    z13 = tmp6 + tmp5;          /* phase 6 */
     z10 = tmp6 - tmp5;
     z11 = tmp4 + tmp7;
     z12 = tmp4 - tmp7;
 
-    tmp7 = z11 + z13;		/* phase 5 */
+    tmp7 = z11 + z13;           /* phase 5 */
     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
 
     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
 
-    tmp6 = tmp12 - tmp7;	/* phase 2 */
+    tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
+    tmp4 = tmp10 - tmp5;
 
     wsptr[DCTSIZE*0] = tmp0 + tmp7;
     wsptr[DCTSIZE*7] = tmp0 - tmp7;
@@ -165,16 +170,15 @@
     wsptr[DCTSIZE*6] = tmp1 - tmp6;
     wsptr[DCTSIZE*2] = tmp2 + tmp5;
     wsptr[DCTSIZE*5] = tmp2 - tmp5;
-    wsptr[DCTSIZE*4] = tmp3 + tmp4;
-    wsptr[DCTSIZE*3] = tmp3 - tmp4;
+    wsptr[DCTSIZE*3] = tmp3 + tmp4;
+    wsptr[DCTSIZE*4] = tmp3 - tmp4;
 
-    inptr++;			/* advance pointers to next column */
+    inptr++;                    /* advance pointers to next column */
     quantptr++;
     wsptr++;
   }
-  
+
   /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3. */
 
   wsptr = workspace;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
@@ -184,11 +188,13 @@
      * the simplification applies less often (typically 5% to 10% of the time).
      * And testing floats for zero is relatively expensive, so we don't bother.
      */
-    
+
     /* Even part */
 
-    tmp10 = wsptr[0] + wsptr[4];
-    tmp11 = wsptr[0] - wsptr[4];
+    /* Apply signed->unsigned and prepare float->int conversion */
+    z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+    tmp10 = z5 + wsptr[4];
+    tmp11 = z5 - wsptr[4];
 
     tmp13 = wsptr[2] + wsptr[6];
     tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
@@ -209,33 +215,25 @@
     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
 
     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;
     tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
+    tmp4 = tmp10 - tmp5;
 
-    /* Final output stage: scale down by a factor of 8 and range-limit */
+    /* Final output stage: float->int conversion and range-limit */
 
-    outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)
-			    & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)
-			    & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)
-			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)
-			    & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)
-			    & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)
-			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
+    outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
+    outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
+    outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
+    outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
+    outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
+    outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
+    outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
+    outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];
+
+    wsptr += DCTSIZE;           /* advance pointer to next row */
   }
 }
 
diff --git a/jidctfst.c b/jidctfst.c
index dba4216..10db739 100644
--- a/jidctfst.c
+++ b/jidctfst.c
@@ -1,9 +1,12 @@
 /*
  * jidctfst.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a fast, not so accurate integer implementation of the
  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
@@ -17,8 +20,8 @@
  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
- * JPEG textbook (see REFERENCES section in file README).  The following code
- * is based directly on figure 4-8 in P&M.
+ * JPEG textbook (see REFERENCES section in file README.ijg).  The following
+ * code is based directly on figure 4-8 in P&M.
  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  * possible to arrange the computation so that many of the multiplies are
  * simple scalings of the final outputs.  These multiplies can then be
@@ -35,7 +38,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 
 #ifdef DCT_IFAST_SUPPORTED
 
@@ -78,7 +81,7 @@
 #define PASS1_BITS  2
 #else
 #define CONST_BITS  8
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
+#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
 #endif
 
 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
@@ -89,10 +92,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_1_082392200  ((INT32)  277)		/* FIX(1.082392200) */
-#define FIX_1_414213562  ((INT32)  362)		/* FIX(1.414213562) */
-#define FIX_1_847759065  ((INT32)  473)		/* FIX(1.847759065) */
-#define FIX_2_613125930  ((INT32)  669)		/* FIX(2.613125930) */
+#define FIX_1_082392200  ((JLONG)  277)         /* FIX(1.082392200) */
+#define FIX_1_414213562  ((JLONG)  362)         /* FIX(1.414213562) */
+#define FIX_1_847759065  ((JLONG)  473)         /* FIX(1.847759065) */
+#define FIX_2_613125930  ((JLONG)  669)         /* FIX(2.613125930) */
 #else
 #define FIX_1_082392200  FIX(1.082392200)
 #define FIX_1_414213562  FIX(1.414213562)
@@ -112,7 +115,7 @@
 #endif
 
 
-/* Multiply a DCTELEM variable by an INT32 constant, and immediately
+/* Multiply a DCTELEM variable by a JLONG constant, and immediately
  * descale to yield a DCTELEM result.
  */
 
@@ -122,27 +125,27 @@
 /* Dequantize a coefficient by multiplying it by the multiplier-table
  * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
  * multiplication will do.  For 12-bit data, the multiplier table is
- * declared INT32, so a 32-bit multiply will be used.
+ * declared JLONG, so a 32-bit multiply will be used.
  */
 
 #if BITS_IN_JSAMPLE == 8
 #define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
 #else
 #define DEQUANTIZE(coef,quantval)  \
-	DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
+        DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
 #endif
 
 
 /* Like DESCALE, but applies to a DCTELEM and produces an int.
- * We assume that int right shift is unsigned if INT32 right shift is.
+ * We assume that int right shift is unsigned if JLONG right shift is.
  */
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define ISHIFT_TEMPS	DCTELEM ishift_temp;
+#define ISHIFT_TEMPS    DCTELEM ishift_temp;
 #if BITS_IN_JSAMPLE == 8
-#define DCTELEMBITS  16		/* DCTELEM may be 16 or 32 bits */
+#define DCTELEMBITS  16         /* DCTELEM may be 16 or 32 bits */
 #else
-#define DCTELEMBITS  32		/* DCTELEM must be 32 bits */
+#define DCTELEMBITS  32         /* DCTELEM must be 32 bits */
 #endif
 #define IRIGHT_SHIFT(x,shft)  \
     ((ishift_temp = (x)) < 0 ? \
@@ -150,7 +153,7 @@
      (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
 #endif
 
 #ifdef USE_ACCURATE_ROUNDING
@@ -165,22 +168,22 @@
  */
 
 GLOBAL(void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
   DCTELEM z5, z10, z11, z12, z13;
   JCOEFPTR inptr;
-  IFAST_MULT_TYPE * quantptr;
-  int * wsptr;
+  IFAST_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE2];	/* buffers data between passes */
-  SHIFT_TEMPS			/* for DESCALE */
-  ISHIFT_TEMPS			/* for IDESCALE */
+  int workspace[DCTSIZE2];      /* buffers data between passes */
+  SHIFT_TEMPS                   /* for DESCALE */
+  ISHIFT_TEMPS                  /* for IDESCALE */
 
   /* Pass 1: process columns from input, store into work array. */
 
@@ -196,11 +199,11 @@
      * With typical images and quantization tables, half or more of the
      * column DCT calculations can be simplified this way.
      */
-    
+
     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-	inptr[DCTSIZE*7] == 0) {
+        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+        inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero */
       int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 
@@ -212,13 +215,13 @@
       wsptr[DCTSIZE*5] = dcval;
       wsptr[DCTSIZE*6] = dcval;
       wsptr[DCTSIZE*7] = dcval;
-      
-      inptr++;			/* advance pointers to next column */
+
+      inptr++;                  /* advance pointers to next column */
       quantptr++;
       wsptr++;
       continue;
     }
-    
+
     /* Even part */
 
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
@@ -226,17 +229,17 @@
     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
     tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 
-    tmp10 = tmp0 + tmp2;	/* phase 3 */
+    tmp10 = tmp0 + tmp2;        /* phase 3 */
     tmp11 = tmp0 - tmp2;
 
-    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
+    tmp13 = tmp1 + tmp3;        /* phases 5-3 */
     tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
 
-    tmp0 = tmp10 + tmp13;	/* phase 2 */
+    tmp0 = tmp10 + tmp13;       /* phase 2 */
     tmp3 = tmp10 - tmp13;
     tmp1 = tmp11 + tmp12;
     tmp2 = tmp11 - tmp12;
-    
+
     /* Odd part */
 
     tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
@@ -244,19 +247,19 @@
     tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
     tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
 
-    z13 = tmp6 + tmp5;		/* phase 6 */
+    z13 = tmp6 + tmp5;          /* phase 6 */
     z10 = tmp6 - tmp5;
     z11 = tmp4 + tmp7;
     z12 = tmp4 - tmp7;
 
-    tmp7 = z11 + z13;		/* phase 5 */
+    tmp7 = z11 + z13;           /* phase 5 */
     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 
     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
 
-    tmp6 = tmp12 - tmp7;	/* phase 2 */
+    tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
     tmp4 = tmp10 + tmp5;
 
@@ -269,11 +272,11 @@
     wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
     wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
 
-    inptr++;			/* advance pointers to next column */
+    inptr++;                    /* advance pointers to next column */
     quantptr++;
     wsptr++;
   }
-  
+
   /* Pass 2: process rows from work array, store into output array. */
   /* Note that we must descale the results by a factor of 8 == 2**3, */
   /* and also undo the PASS1_BITS scaling. */
@@ -288,14 +291,14 @@
      * test takes more time than it's worth.  In that case this section
      * may be commented out.
      */
-    
+
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
-	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
+        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
       JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
+                                  & RANGE_MASK];
+
       outptr[0] = dcval;
       outptr[1] = dcval;
       outptr[2] = dcval;
@@ -305,11 +308,11 @@
       outptr[6] = dcval;
       outptr[7] = dcval;
 
-      wsptr += DCTSIZE;		/* advance pointer to next row */
+      wsptr += DCTSIZE;         /* advance pointer to next row */
       continue;
     }
 #endif
-    
+
     /* Even part */
 
     tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
@@ -317,7 +320,7 @@
 
     tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
     tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
-	    - tmp13;
+            - tmp13;
 
     tmp0 = tmp10 + tmp13;
     tmp3 = tmp10 - tmp13;
@@ -331,37 +334,37 @@
     z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
     z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
 
-    tmp7 = z11 + z13;		/* phase 5 */
+    tmp7 = z11 + z13;           /* phase 5 */
     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 
     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
 
-    tmp6 = tmp12 - tmp7;	/* phase 2 */
+    tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
     tmp4 = tmp10 + tmp5;
 
     /* Final output stage: scale down by a factor of 8 and range-limit */
 
     outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
     outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
     outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
     outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
     outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
     outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
     outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
     outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-			    & RANGE_MASK];
+                            & RANGE_MASK];
 
-    wsptr += DCTSIZE;		/* advance pointer to next row */
+    wsptr += DCTSIZE;           /* advance pointer to next row */
   }
 }
 
diff --git a/jidctint.c b/jidctint.c
index 77d8121..a2d03fc 100644
--- a/jidctint.c
+++ b/jidctint.c
@@ -1,10 +1,13 @@
 /*
  * jidctint.c
  *
+ * This file was part of the Independent JPEG Group's software.
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modification developed 2002-2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a slow-but-accurate integer implementation of the
  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
@@ -50,7 +53,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 
 #ifdef DCT_ISLOW_SUPPORTED
 
@@ -89,7 +92,7 @@
  * they are represented to better-than-integral precision.  These outputs
  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  * with the recommended scaling.  (To scale up 12-bit sample data further, an
- * intermediate INT32 array would be needed.)
+ * intermediate JLONG array would be needed.)
  *
  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
@@ -101,7 +104,7 @@
 #define PASS1_BITS  2
 #else
 #define CONST_BITS  13
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
+#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
 #endif
 
 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
@@ -112,18 +115,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
-#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
-#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
-#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
-#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
-#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
-#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
-#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
-#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
-#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
-#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
-#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -140,7 +143,7 @@
 #endif
 
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * For 8-bit samples with the recommended scaling, all the variable
  * and constant values involved are no more than 16 bits wide, so a
  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
@@ -167,20 +170,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3;
-  INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  JLONG tmp0, tmp1, tmp2, tmp3;
+  JLONG tmp10, tmp11, tmp12, tmp13;
+  JLONG z1, z2, z3, z4, z5;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE2];	/* buffers data between passes */
+  int workspace[DCTSIZE2];      /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -199,14 +202,15 @@
      * With typical images and quantization tables, half or more of the
      * column DCT calculations can be simplified this way.
      */
-    
+
     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-	inptr[DCTSIZE*7] == 0) {
+        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+        inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
+                             PASS1_BITS);
+
       wsptr[DCTSIZE*0] = dcval;
       wsptr[DCTSIZE*1] = dcval;
       wsptr[DCTSIZE*2] = dcval;
@@ -215,49 +219,49 @@
       wsptr[DCTSIZE*5] = dcval;
       wsptr[DCTSIZE*6] = dcval;
       wsptr[DCTSIZE*7] = dcval;
-      
-      inptr++;			/* advance pointers to next column */
+
+      inptr++;                  /* advance pointers to next column */
       quantptr++;
       wsptr++;
       continue;
     }
-    
+
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
-    
+
     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-    
+
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
+
     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 
-    tmp0 = (z2 + z3) << CONST_BITS;
-    tmp1 = (z2 - z3) << CONST_BITS;
-    
+    tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
+    tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
+
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
-    
+
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    
+
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
     z3 = tmp0 + tmp2;
     z4 = tmp1 + tmp3;
     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+
     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
@@ -266,17 +270,17 @@
     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
+
     z3 += z5;
     z4 += z5;
-    
+
     tmp0 += z1 + z3;
     tmp1 += z2 + z4;
     tmp2 += z2 + z3;
     tmp3 += z1 + z4;
-    
+
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
+
     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
@@ -285,12 +289,12 @@
     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
-    
-    inptr++;			/* advance pointers to next column */
+
+    inptr++;                    /* advance pointers to next column */
     quantptr++;
     wsptr++;
   }
-  
+
   /* Pass 2: process rows from work array, store into output array. */
   /* Note that we must descale the results by a factor of 8 == 2**3, */
   /* and also undo the PASS1_BITS scaling. */
@@ -305,14 +309,14 @@
      * test takes more time than it's worth.  In that case this section
      * may be commented out.
      */
-    
+
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
-	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
+        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
+      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
+                                  & RANGE_MASK];
+
       outptr[0] = dcval;
       outptr[1] = dcval;
       outptr[2] = dcval;
@@ -322,44 +326,44 @@
       outptr[6] = dcval;
       outptr[7] = dcval;
 
-      wsptr += DCTSIZE;		/* advance pointer to next row */
+      wsptr += DCTSIZE;         /* advance pointer to next row */
       continue;
     }
 #endif
-    
+
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
-    
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[6];
-    
+
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[6];
+
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
-    tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
-    tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
-    
+
+    tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS);
+    tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS);
+
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
-    
-    tmp0 = (INT32) wsptr[7];
-    tmp1 = (INT32) wsptr[5];
-    tmp2 = (INT32) wsptr[3];
-    tmp3 = (INT32) wsptr[1];
-    
+
+    tmp0 = (JLONG) wsptr[7];
+    tmp1 = (JLONG) wsptr[5];
+    tmp2 = (JLONG) wsptr[3];
+    tmp3 = (JLONG) wsptr[1];
+
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
     z3 = tmp0 + tmp2;
     z4 = tmp1 + tmp3;
     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+
     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
@@ -368,43 +372,43 @@
     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
+
     z3 += z5;
     z4 += z5;
-    
+
     tmp0 += z1 + z3;
     tmp1 += z2 + z4;
     tmp2 += z2 + z3;
     tmp3 += z1 + z4;
-    
+
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
+
     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
-					  CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
+                                          CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
+
+    wsptr += DCTSIZE;           /* advance pointer to next row */
   }
 }
 
@@ -420,19 +424,19 @@
  */
 
 GLOBAL(void)
-jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3;
+  JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
+  JLONG z1, z2, z3;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[7*7];	/* buffers data between passes */
+  int workspace[7*7];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -444,7 +448,7 @@
     /* Even part */
 
     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp13 <<= CONST_BITS;
+    tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
     /* Add fudge factor here for final descale. */
     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
 
@@ -498,12 +502,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    tmp13 <<= CONST_BITS;
+    tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[4];
-    z3 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[4];
+    z3 = (JLONG) wsptr[6];
 
     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
@@ -517,9 +521,9 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
 
     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
@@ -534,28 +538,28 @@
     /* Final output stage */
 
     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
 
-    wsptr += 7;		/* advance pointer to next row */
+    wsptr += 7;         /* advance pointer to next row */
   }
 }
 
@@ -569,19 +573,19 @@
  */
 
 GLOBAL(void)
-jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
-  INT32 z1, z2, z3;
+  JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  JLONG z1, z2, z3;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[6*6];	/* buffers data between passes */
+  int workspace[6*6];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -593,7 +597,7 @@
     /* Even part */
 
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= CONST_BITS;
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
@@ -611,9 +615,9 @@
     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
-    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
-    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
-    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
+    tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
+    tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
+    tmp1 = LEFT_SHIFT(z1 - z2 - z3, PASS1_BITS);
 
     /* Final output stage */
 
@@ -634,49 +638,49 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    tmp0 <<= CONST_BITS;
-    tmp2 = (INT32) wsptr[4];
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+    tmp2 = (JLONG) wsptr[4];
     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
     tmp1 = tmp0 + tmp10;
     tmp11 = tmp0 - tmp10 - tmp10;
-    tmp10 = (INT32) wsptr[2];
+    tmp10 = (JLONG) wsptr[2];
     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
     tmp10 = tmp1 + tmp0;
     tmp12 = tmp1 - tmp0;
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
-    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
-    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
-    tmp1 = (z1 - z2 - z3) << CONST_BITS;
+    tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
+    tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
+    tmp1 = LEFT_SHIFT(z1 - z2 - z3, CONST_BITS);
 
     /* Final output stage */
 
     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
 
-    wsptr += 6;		/* advance pointer to next row */
+    wsptr += 6;         /* advance pointer to next row */
   }
 }
 
@@ -690,19 +694,19 @@
  */
 
 GLOBAL(void)
-jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
-  INT32 z1, z2, z3;
+  JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
+  JLONG z1, z2, z3;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[5*5];	/* buffers data between passes */
+  int workspace[5*5];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -714,7 +718,7 @@
     /* Even part */
 
     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp12 <<= CONST_BITS;
+    tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
     /* Add fudge factor here for final descale. */
     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
@@ -724,7 +728,7 @@
     z3 = tmp12 + z2;
     tmp10 = z3 + z1;
     tmp11 = z3 - z1;
-    tmp12 -= z2 << 2;
+    tmp12 -= LEFT_SHIFT(z2, 2);
 
     /* Odd part */
 
@@ -753,21 +757,21 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    tmp12 <<= CONST_BITS;
-    tmp0 = (INT32) wsptr[2];
-    tmp1 = (INT32) wsptr[4];
+    tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
+    tmp0 = (JLONG) wsptr[2];
+    tmp1 = (JLONG) wsptr[4];
     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
     z3 = tmp12 + z2;
     tmp10 = z3 + z1;
     tmp11 = z3 - z1;
-    tmp12 -= z2 << 2;
+    tmp12 -= LEFT_SHIFT(z2, 2);
 
     /* Odd part */
 
-    z2 = (INT32) wsptr[1];
-    z3 = (INT32) wsptr[3];
+    z2 = (JLONG) wsptr[1];
+    z3 = (JLONG) wsptr[3];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
@@ -776,22 +780,22 @@
     /* Final output stage */
 
     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
 
-    wsptr += 5;		/* advance pointer to next row */
+    wsptr += 5;         /* advance pointer to next row */
   }
 }
 
@@ -805,18 +809,18 @@
  */
 
 GLOBAL(void)
-jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp2, tmp10, tmp12;
+  JLONG tmp0, tmp2, tmp10, tmp12;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[3*3];	/* buffers data between passes */
+  int workspace[3*3];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -828,7 +832,7 @@
     /* Even part */
 
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= CONST_BITS;
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
@@ -857,31 +861,31 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    tmp0 <<= CONST_BITS;
-    tmp2 = (INT32) wsptr[2];
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
+    tmp2 = (JLONG) wsptr[2];
     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
     tmp10 = tmp0 + tmp12;
     tmp2 = tmp0 - tmp12 - tmp12;
 
     /* Odd part */
 
-    tmp12 = (INT32) wsptr[1];
+    tmp12 = (JLONG) wsptr[1];
     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 
     /* Final output stage */
 
     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
 
-    wsptr += 3;		/* advance pointer to next row */
+    wsptr += 3;         /* advance pointer to next row */
   }
 }
 
@@ -895,19 +899,19 @@
  */
 
 GLOBAL(void)
-jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*9];	/* buffers data between passes */
+  int workspace[8*9];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -919,7 +923,7 @@
     /* Even part */
 
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= CONST_BITS;
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
 
@@ -982,12 +986,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    tmp0 <<= CONST_BITS;
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[4];
-    z3 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[4];
+    z3 = (JLONG) wsptr[6];
 
     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
     tmp1 = tmp0 + tmp3;
@@ -1007,10 +1011,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
 
@@ -1025,34 +1029,34 @@
     /* Final output stage */
 
     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
@@ -1066,20 +1070,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
-  INT32 z1, z2, z3, z4, z5;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
+  JLONG z1, z2, z3, z4, z5;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*10];	/* buffers data between passes */
+  int workspace[8*10];  /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -1091,7 +1095,7 @@
     /* Even part */
 
     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z3 <<= CONST_BITS;
+    z3 = LEFT_SHIFT(z3, CONST_BITS);
     /* Add fudge factor here for final descale. */
     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
@@ -1100,8 +1104,8 @@
     tmp10 = z3 + z1;
     tmp11 = z3 - z2;
 
-    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
-			CONST_BITS-PASS1_BITS);
+    tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
+                        CONST_BITS-PASS1_BITS);  /* c0 = (c4-c8)*2 */
 
     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
@@ -1126,7 +1130,7 @@
     tmp13 = z2 - z4;
 
     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
-    z5 = z3 << CONST_BITS;
+    z5 = LEFT_SHIFT(z3, CONST_BITS);
 
     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
     z4 = z5 + tmp12;
@@ -1135,9 +1139,9 @@
     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
 
     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
-    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
+    z4 = z5 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
 
-    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
+    tmp12 = LEFT_SHIFT(z1 - tmp13 - z3, PASS1_BITS);
 
     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
@@ -1165,18 +1169,18 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    z3 <<= CONST_BITS;
-    z4 = (INT32) wsptr[4];
+    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = LEFT_SHIFT(z3, CONST_BITS);
+    z4 = (JLONG) wsptr[4];
     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
     tmp10 = z3 + z1;
     tmp11 = z3 - z2;
 
-    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
+    tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1);         /* c0 = (c4-c8)*2 */
 
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[6];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1189,11 +1193,11 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z3 <<= CONST_BITS;
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z3 = LEFT_SHIFT(z3, CONST_BITS);
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = z2 + z4;
     tmp13 = z2 - z4;
@@ -1207,9 +1211,9 @@
     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
 
     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
-    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
+    z4 = z3 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
 
-    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
+    tmp12 = LEFT_SHIFT(z1 - tmp13, CONST_BITS) - z3;
 
     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
@@ -1217,37 +1221,37 @@
     /* Final output stage */
 
     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-					      CONST_BITS+PASS1_BITS+3)
-			    & RANGE_MASK];
+                                              CONST_BITS+PASS1_BITS+3)
+                            & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
@@ -1261,20 +1265,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*11];	/* buffers data between passes */
+  int workspace[8*11];  /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -1286,7 +1290,7 @@
     /* Even part */
 
     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp10 <<= CONST_BITS;
+    tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
     /* Add fudge factor here for final descale. */
     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
 
@@ -1301,13 +1305,13 @@
     z4 -= z2;
     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
     tmp21 = tmp20 + tmp23 + tmp25 -
-	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
+            MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
     tmp24 += tmp25;
     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
-	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
+             MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
 
     /* Odd part */
@@ -1323,7 +1327,7 @@
     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
     tmp10 = tmp11 + tmp12 + tmp13 -
-	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
+            MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
@@ -1331,8 +1335,8 @@
     tmp11 += z1;
     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
-	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
-	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
+             MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
+             MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
 
     /* Final output stage */
 
@@ -1358,12 +1362,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    tmp10 <<= CONST_BITS;
+    tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[4];
-    z3 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[4];
+    z3 = (JLONG) wsptr[6];
 
     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
@@ -1372,21 +1376,21 @@
     z4 -= z2;
     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
     tmp21 = tmp20 + tmp23 + tmp25 -
-	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
+            MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
     tmp24 += tmp25;
     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
-	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
+             MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = z1 + z2;
     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1394,7 +1398,7 @@
     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
     tmp10 = tmp11 + tmp12 + tmp13 -
-	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
+            MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
@@ -1402,46 +1406,46 @@
     tmp11 += z1;
     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
-	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
-	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
+             MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
+             MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
 
     /* Final output stage */
 
     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
@@ -1455,20 +1459,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*12];	/* buffers data between passes */
+  int workspace[8*12];  /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -1480,7 +1484,7 @@
     /* Even part */
 
     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z3 <<= CONST_BITS;
+    z3 = LEFT_SHIFT(z3, CONST_BITS);
     /* Add fudge factor here for final descale. */
     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
 
@@ -1492,9 +1496,9 @@
 
     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
-    z1 <<= CONST_BITS;
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-    z2 <<= CONST_BITS;
+    z2 = LEFT_SHIFT(z2, CONST_BITS);
 
     tmp12 = z1 - z2;
 
@@ -1529,7 +1533,7 @@
     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
-	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
 
     z1 -= z4;
     z2 -= z3;
@@ -1562,20 +1566,20 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    z3 <<= CONST_BITS;
+    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = LEFT_SHIFT(z3, CONST_BITS);
 
-    z4 = (INT32) wsptr[4];
+    z4 = (JLONG) wsptr[4];
     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
 
-    z1 = (INT32) wsptr[2];
+    z1 = (JLONG) wsptr[2];
     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
-    z1 <<= CONST_BITS;
-    z2 = (INT32) wsptr[6];
-    z2 <<= CONST_BITS;
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
+    z2 = (JLONG) wsptr[6];
+    z2 = LEFT_SHIFT(z2, CONST_BITS);
 
     tmp12 = z1 - z2;
 
@@ -1594,10 +1598,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
@@ -1610,7 +1614,7 @@
     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
-	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
 
     z1 -= z4;
     z2 -= z3;
@@ -1621,43 +1625,43 @@
     /* Final output stage */
 
     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
@@ -1671,20 +1675,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*13];	/* buffers data between passes */
+  int workspace[8*13];  /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -1696,7 +1700,7 @@
     /* Even part */
 
     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z1 <<= CONST_BITS;
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 
@@ -1739,7 +1743,7 @@
     tmp15 = z1 + z4;
     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
     tmp10 = tmp11 + tmp12 + tmp13 -
-	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
+            MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
@@ -1751,11 +1755,11 @@
     tmp13 += tmp14;
     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
-	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
+            MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
     tmp14 += z1;
     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
-	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
+             MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
 
     /* Final output stage */
 
@@ -1783,12 +1787,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    z1 <<= CONST_BITS;
+    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[4];
-    z4 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[4];
+    z4 = (JLONG) wsptr[6];
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
@@ -1815,17 +1819,17 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
     tmp15 = z1 + z4;
     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
     tmp10 = tmp11 + tmp12 + tmp13 -
-	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
+            MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
@@ -1837,55 +1841,55 @@
     tmp13 += tmp14;
     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
-	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
+            MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
     tmp14 += z1;
     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
-	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
+             MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
 
     /* Final output stage */
 
     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
@@ -1899,20 +1903,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*14];	/* buffers data between passes */
+  int workspace[8*14];  /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -1924,7 +1928,7 @@
     /* Even part */
 
     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z1 <<= CONST_BITS;
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
@@ -1936,8 +1940,8 @@
     tmp11 = z1 + z3;
     tmp12 = z1 - z4;
 
-    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
-			CONST_BITS-PASS1_BITS);
+    tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
+                        CONST_BITS-PASS1_BITS);  /* c0 = (c4+c12-c8)*2 */
 
     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
@@ -1947,7 +1951,7 @@
     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
-	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
 
     tmp20 = tmp10 + tmp13;
     tmp26 = tmp10 - tmp13;
@@ -1962,7 +1966,7 @@
     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp13 = z4 << CONST_BITS;
+    tmp13 = LEFT_SHIFT(z4, CONST_BITS);
 
     tmp14 = z1 + z3;
     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
@@ -1981,7 +1985,7 @@
     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
 
-    tmp13 = (z1 - z3) << PASS1_BITS;
+    tmp13 = LEFT_SHIFT(z1 - z3, PASS1_BITS);
 
     /* Final output stage */
 
@@ -2010,9 +2014,9 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    z1 <<= CONST_BITS;
-    z4 = (INT32) wsptr[4];
+    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
+    z4 = (JLONG) wsptr[4];
     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
@@ -2021,17 +2025,17 @@
     tmp11 = z1 + z3;
     tmp12 = z1 - z4;
 
-    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
+    tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1);    /* c0 = (c4+c12-c8)*2 */
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[6];
 
     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
 
     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
-	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
 
     tmp20 = tmp10 + tmp13;
     tmp26 = tmp10 - tmp13;
@@ -2042,11 +2046,11 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
-    z4 <<= CONST_BITS;
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
+    z4 = LEFT_SHIFT(z4, CONST_BITS);
 
     tmp14 = z1 + z3;
     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
@@ -2064,54 +2068,54 @@
     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
 
-    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
+    tmp13 = LEFT_SHIFT(z1 - z3, CONST_BITS) + z4;
 
     /* Final output stage */
 
     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
@@ -2125,20 +2129,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*15];	/* buffers data between passes */
+  int workspace[8*15];  /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -2150,7 +2154,7 @@
     /* Even part */
 
     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z1 <<= CONST_BITS;
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 
@@ -2163,7 +2167,7 @@
 
     tmp12 = z1 - tmp10;
     tmp13 = z1 + tmp11;
-    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
+    z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
 
     z4 = z2 - z3;
     z3 += z2;
@@ -2242,19 +2246,19 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    z1 <<= CONST_BITS;
+    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[4];
-    z4 = (INT32) wsptr[6];
+    z2 = (JLONG) wsptr[2];
+    z3 = (JLONG) wsptr[4];
+    z4 = (JLONG) wsptr[6];
 
     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
 
     tmp12 = z1 - tmp10;
     tmp13 = z1 + tmp11;
-    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
+    z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
 
     z4 = z2 - z3;
     z3 += z2;
@@ -2282,11 +2286,11 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z4 = (INT32) wsptr[5];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z4 = (JLONG) wsptr[5];
     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
-    z4 = (INT32) wsptr[7];
+    z4 = (JLONG) wsptr[7];
 
     tmp13 = z2 - z4;
     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
@@ -2308,52 +2312,52 @@
     /* Final output stage */
 
     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
@@ -2367,20 +2371,20 @@
  */
 
 GLOBAL(void)
-jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block,
+                 JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
-  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*16];	/* buffers data between passes */
+  int workspace[8*16];  /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -2392,7 +2396,7 @@
     /* Even part */
 
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= CONST_BITS;
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
 
@@ -2441,9 +2445,9 @@
     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
     tmp0  = tmp1 + tmp2 + tmp3 -
-	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
     tmp13 = tmp10 + tmp11 + tmp12 -
-	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
@@ -2493,10 +2497,10 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
-    tmp0 <<= CONST_BITS;
+    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (INT32) wsptr[4];
+    z1 = (JLONG) wsptr[4];
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
 
@@ -2505,8 +2509,8 @@
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp0 - tmp2;
 
-    z1 = (INT32) wsptr[2];
-    z2 = (INT32) wsptr[6];
+    z1 = (JLONG) wsptr[2];
+    z2 = (JLONG) wsptr[6];
     z3 = z1 - z2;
     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
@@ -2527,10 +2531,10 @@
 
     /* Odd part */
 
-    z1 = (INT32) wsptr[1];
-    z2 = (INT32) wsptr[3];
-    z3 = (INT32) wsptr[5];
-    z4 = (INT32) wsptr[7];
+    z1 = (JLONG) wsptr[1];
+    z2 = (JLONG) wsptr[3];
+    z3 = (JLONG) wsptr[5];
+    z4 = (JLONG) wsptr[7];
 
     tmp11 = z1 + z3;
 
@@ -2541,9 +2545,9 @@
     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
     tmp0  = tmp1 + tmp2 + tmp3 -
-	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
     tmp13 = tmp10 + tmp11 + tmp12 -
-	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
@@ -2567,55 +2571,55 @@
     /* Final output stage */
 
     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
-					       CONST_BITS+PASS1_BITS+3)
-			     & RANGE_MASK];
+                                               CONST_BITS+PASS1_BITS+3)
+                             & RANGE_MASK];
 
-    wsptr += 8;		/* advance pointer to next row */
+    wsptr += 8;         /* advance pointer to next row */
   }
 }
 
diff --git a/jidctred.c b/jidctred.c
index 421f3c7..2d5b546 100644
--- a/jidctred.c
+++ b/jidctred.c
@@ -1,9 +1,12 @@
 /*
  * jidctred.c
  *
+ * This file was part of the Independent JPEG Group's software.
  * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains inverse-DCT routines that produce reduced-size output:
  * either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
@@ -23,7 +26,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jdct.h"               /* Private declarations for DCT subsystem */
 
 #ifdef IDCT_SCALING_SUPPORTED
 
@@ -44,7 +47,7 @@
 #define PASS1_BITS  2
 #else
 #define CONST_BITS  13
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
+#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
 #endif
 
 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
@@ -55,20 +58,20 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_211164243  ((INT32)  1730)	/* FIX(0.211164243) */
-#define FIX_0_509795579  ((INT32)  4176)	/* FIX(0.509795579) */
-#define FIX_0_601344887  ((INT32)  4926)	/* FIX(0.601344887) */
-#define FIX_0_720959822  ((INT32)  5906)	/* FIX(0.720959822) */
-#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
-#define FIX_0_850430095  ((INT32)  6967)	/* FIX(0.850430095) */
-#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
-#define FIX_1_061594337  ((INT32)  8697)	/* FIX(1.061594337) */
-#define FIX_1_272758580  ((INT32)  10426)	/* FIX(1.272758580) */
-#define FIX_1_451774981  ((INT32)  11893)	/* FIX(1.451774981) */
-#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
-#define FIX_2_172734803  ((INT32)  17799)	/* FIX(2.172734803) */
-#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
-#define FIX_3_624509785  ((INT32)  29692)	/* FIX(3.624509785) */
+#define FIX_0_211164243  ((JLONG)  1730)        /* FIX(0.211164243) */
+#define FIX_0_509795579  ((JLONG)  4176)        /* FIX(0.509795579) */
+#define FIX_0_601344887  ((JLONG)  4926)        /* FIX(0.601344887) */
+#define FIX_0_720959822  ((JLONG)  5906)        /* FIX(0.720959822) */
+#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
+#define FIX_0_850430095  ((JLONG)  6967)        /* FIX(0.850430095) */
+#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
+#define FIX_1_061594337  ((JLONG)  8697)        /* FIX(1.061594337) */
+#define FIX_1_272758580  ((JLONG)  10426)       /* FIX(1.272758580) */
+#define FIX_1_451774981  ((JLONG)  11893)       /* FIX(1.451774981) */
+#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
+#define FIX_2_172734803  ((JLONG)  17799)       /* FIX(2.172734803) */
+#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
+#define FIX_3_624509785  ((JLONG)  29692)       /* FIX(3.624509785) */
 #else
 #define FIX_0_211164243  FIX(0.211164243)
 #define FIX_0_509795579  FIX(0.509795579)
@@ -87,7 +90,7 @@
 #endif
 
 
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * For 8-bit samples with the recommended scaling, all the variable
  * and constant values involved are no more than 16 bits wide, so a
  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
@@ -115,19 +118,19 @@
  */
 
 GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp2, tmp10, tmp12;
-  INT32 z1, z2, z3, z4;
+  JLONG tmp0, tmp2, tmp10, tmp12;
+  JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE*4];	/* buffers data between passes */
+  int workspace[DCTSIZE*4];     /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -140,57 +143,58 @@
     if (ctr == DCTSIZE-4)
       continue;
     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
-	inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
+        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
+        inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero; we need not examine term 4 for 4x4 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
+                             PASS1_BITS);
+
       wsptr[DCTSIZE*0] = dcval;
       wsptr[DCTSIZE*1] = dcval;
       wsptr[DCTSIZE*2] = dcval;
       wsptr[DCTSIZE*3] = dcval;
-      
+
       continue;
     }
-    
+
     /* Even part */
-    
+
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= (CONST_BITS+1);
-    
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS+1);
+
     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 
     tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
-    
+
     tmp10 = tmp0 + tmp2;
     tmp12 = tmp0 - tmp2;
-    
+
     /* Odd part */
-    
+
     z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
     z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
     z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    
+
     tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-	 + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-	 + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-	 + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-    
+         + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
+         + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
+         + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+
     tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-	 + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-	 + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-	 + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+         + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
+         + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
+         + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
 
     /* Final output stage */
-    
+
     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
     wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
     wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
   }
-  
+
   /* Pass 2: process 4 rows from work array, store into output array. */
 
   wsptr = workspace;
@@ -200,64 +204,64 @@
 
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
-	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
+        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
+      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
+                                  & RANGE_MASK];
+
       outptr[0] = dcval;
       outptr[1] = dcval;
       outptr[2] = dcval;
       outptr[3] = dcval;
-      
-      wsptr += DCTSIZE;		/* advance pointer to next row */
+
+      wsptr += DCTSIZE;         /* advance pointer to next row */
       continue;
     }
 #endif
-    
+
     /* Even part */
-    
-    tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1);
-    
-    tmp2 = MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
-	 + MULTIPLY((INT32) wsptr[6], - FIX_0_765366865);
-    
+
+    tmp0 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+1);
+
+    tmp2 = MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
+         + MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865);
+
     tmp10 = tmp0 + tmp2;
     tmp12 = tmp0 - tmp2;
-    
+
     /* Odd part */
-    
-    z1 = (INT32) wsptr[7];
-    z2 = (INT32) wsptr[5];
-    z3 = (INT32) wsptr[3];
-    z4 = (INT32) wsptr[1];
-    
+
+    z1 = (JLONG) wsptr[7];
+    z2 = (JLONG) wsptr[5];
+    z3 = (JLONG) wsptr[3];
+    z4 = (JLONG) wsptr[1];
+
     tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-	 + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-	 + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-	 + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-    
+         + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
+         + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
+         + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+
     tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-	 + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-	 + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-	 + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+         + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
+         + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
+         + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
 
     /* Final output stage */
-    
+
     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3+1)
+                            & RANGE_MASK];
     outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3+1)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3+1)
+                            & RANGE_MASK];
     outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
+                                          CONST_BITS+PASS1_BITS+3+1)
+                            & RANGE_MASK];
+
+    wsptr += DCTSIZE;           /* advance pointer to next row */
   }
 }
 
@@ -268,18 +272,18 @@
  */
 
 GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
-  INT32 tmp0, tmp10, z1;
+  JLONG tmp0, tmp10, z1;
   JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  ISLOW_MULT_TYPE *quantptr;
+  int *wsptr;
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE*2];	/* buffers data between passes */
+  int workspace[DCTSIZE*2];     /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
@@ -292,21 +296,22 @@
     if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
       continue;
     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
-	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
+        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
+                             PASS1_BITS);
+
       wsptr[DCTSIZE*0] = dcval;
       wsptr[DCTSIZE*1] = dcval;
-      
+
       continue;
     }
-    
+
     /* Even part */
-    
+
     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp10 = z1 << (CONST_BITS+2);
-    
+    tmp10 = LEFT_SHIFT(z1, CONST_BITS+2);
+
     /* Odd part */
 
     z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
@@ -319,11 +324,11 @@
     tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
 
     /* Final output stage */
-    
+
     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
     wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
   }
-  
+
   /* Pass 2: process 2 rows from work array, store into output array. */
 
   wsptr = workspace;
@@ -334,38 +339,38 @@
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
+      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
+                                  & RANGE_MASK];
+
       outptr[0] = dcval;
       outptr[1] = dcval;
-      
-      wsptr += DCTSIZE;		/* advance pointer to next row */
+
+      wsptr += DCTSIZE;         /* advance pointer to next row */
       continue;
     }
 #endif
-    
+
     /* Even part */
-    
-    tmp10 = ((INT32) wsptr[0]) << (CONST_BITS+2);
-    
+
+    tmp10 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+2);
+
     /* Odd part */
 
-    tmp0 = MULTIPLY((INT32) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
-	 + MULTIPLY((INT32) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
-	 + MULTIPLY((INT32) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
-	 + MULTIPLY((INT32) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+    tmp0 = MULTIPLY((JLONG) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
+         + MULTIPLY((JLONG) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
+         + MULTIPLY((JLONG) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
+         + MULTIPLY((JLONG) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
 
     /* Final output stage */
-    
+
     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
-					  CONST_BITS+PASS1_BITS+3+2)
-			    & RANGE_MASK];
+                                          CONST_BITS+PASS1_BITS+3+2)
+                            & RANGE_MASK];
     outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
-					  CONST_BITS+PASS1_BITS+3+2)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
+                                          CONST_BITS+PASS1_BITS+3+2)
+                            & RANGE_MASK];
+
+    wsptr += DCTSIZE;           /* advance pointer to next row */
   }
 }
 
@@ -376,12 +381,12 @@
  */
 
 GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block,
+               JSAMPARRAY output_buf, JDIMENSION output_col)
 {
   int dcval;
-  ISLOW_MULT_TYPE * quantptr;
+  ISLOW_MULT_TYPE *quantptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   SHIFT_TEMPS
 
@@ -390,7 +395,7 @@
    */
   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
-  dcval = (int) DESCALE((INT32) dcval, 3);
+  dcval = (int) DESCALE((JLONG) dcval, 3);
 
   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
 }
diff --git a/jinclude.h b/jinclude.h
index 0a4f151..d461a1a 100644
--- a/jinclude.h
+++ b/jinclude.h
@@ -1,9 +1,12 @@
 /*
  * jinclude.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file exists to provide a single place to fix any problems with
  * including the wrong system include files.  (Common problems are taken
@@ -17,8 +20,8 @@
 
 /* Include auto-config file to find out which system include files we need. */
 
-#include "jconfig.h"		/* auto configuration options */
-#define JCONFIG_INCLUDED	/* so that jpeglib.h doesn't do it again */
+#include "jconfig.h"            /* auto configuration options */
+#define JCONFIG_INCLUDED        /* so that jpeglib.h doesn't do it again */
 
 /*
  * We need the NULL macro and size_t typedef.
@@ -58,28 +61,18 @@
 #ifdef NEED_BSD_STRINGS
 
 #include <strings.h>
-#define MEMZERO(target,size)	bzero((void *)(target), (size_t)(size))
-#define MEMCOPY(dest,src,size)	bcopy((const void *)(src), (void *)(dest), (size_t)(size))
+#define MEMZERO(target,size)    bzero((void *)(target), (size_t)(size))
+#define MEMCOPY(dest,src,size)  bcopy((const void *)(src), (void *)(dest), (size_t)(size))
 
 #else /* not BSD, assume ANSI/SysV string lib */
 
 #include <string.h>
-#define MEMZERO(target,size)	memset((void *)(target), 0, (size_t)(size))
-#define MEMCOPY(dest,src,size)	memcpy((void *)(dest), (const void *)(src), (size_t)(size))
+#define MEMZERO(target,size)    memset((void *)(target), 0, (size_t)(size))
+#define MEMCOPY(dest,src,size)  memcpy((void *)(dest), (const void *)(src), (size_t)(size))
 
 #endif
 
 /*
- * In ANSI C, and indeed any rational implementation, size_t is also the
- * type returned by sizeof().  However, it seems there are some irrational
- * implementations out there, in which sizeof() returns an int even though
- * size_t is defined as long or unsigned long.  To ensure consistent results
- * we always use this SIZEOF() macro in place of using sizeof() directly.
- */
-
-#define SIZEOF(object)	((size_t) sizeof(object))
-
-/*
  * The modules that use fread() and fwrite() always invoke them through
  * these macros.  On some systems you may need to twiddle the argument casts.
  * CAUTION: argument order is different from underlying functions!
diff --git a/jmemmgr.c b/jmemmgr.c
index cf32524..9174ad3 100644
--- a/jmemmgr.c
+++ b/jmemmgr.c
@@ -1,9 +1,12 @@
 /*
  * jmemmgr.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains the JPEG system-independent memory management
  * routines.  This code is usable across a wide variety of machines; most
@@ -25,14 +28,14 @@
  */
 
 #define JPEG_INTERNALS
-#define AM_MEMORY_MANAGER	/* we define jvirt_Xarray_control structs */
+#define AM_MEMORY_MANAGER       /* we define jvirt_Xarray_control structs */
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
+#include "jmemsys.h"            /* import the system-dependent declarations */
 
 #ifndef NO_GETENV
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare getenv() */
-extern char * getenv JPP((const char * name));
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare getenv() */
+extern char *getenv (const char *name);
 #endif
 #endif
 
@@ -67,9 +70,9 @@
  * There isn't any really portable way to determine the worst-case alignment
  * requirement.  This module assumes that the alignment requirement is
  * multiples of ALIGN_SIZE.
- * By default, we define ALIGN_SIZE as sizeof(double).  This is necessary on some
- * workstations (where doubles really do need 8-byte alignment) and will work
- * fine on nearly everything.  If your machine has lesser alignment needs,
+ * By default, we define ALIGN_SIZE as sizeof(double).  This is necessary on
+ * some workstations (where doubles really do need 8-byte alignment) and will
+ * work fine on nearly everything.  If your machine has lesser alignment needs,
  * you can save a few bytes by making ALIGN_SIZE smaller.
  * The only place I know of where this will NOT work is certain Macintosh
  * 680x0 compilers that define double as a 10-byte IEEE extended float.
@@ -78,9 +81,9 @@
  * such a compiler.
  */
 
-#ifndef ALIGN_SIZE		/* so can override from jconfig.h */
+#ifndef ALIGN_SIZE              /* so can override from jconfig.h */
 #ifndef WITH_SIMD
-#define ALIGN_SIZE  SIZEOF(double)
+#define ALIGN_SIZE  sizeof(double)
 #else
 #define ALIGN_SIZE  16 /* Most SIMD implementations require this */
 #endif
@@ -91,24 +94,23 @@
  * request to jpeg_get_small() or jpeg_get_large().  There is no per-object
  * overhead within a pool, except for alignment padding.  Each pool has a
  * header with a link to the next pool of the same class.
- * Small and large pool headers are identical except that the latter's
- * link pointer must be FAR on 80x86 machines.
+ * Small and large pool headers are identical.
  */
 
-typedef struct small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct *small_pool_ptr;
 
 typedef struct small_pool_struct {
-  small_pool_ptr next;	/* next in list of pools */
-  size_t bytes_used;		/* how many bytes already used within pool */
-  size_t bytes_left;		/* bytes still available in this pool */
+  small_pool_ptr next;  /* next in list of pools */
+  size_t bytes_used;            /* how many bytes already used within pool */
+  size_t bytes_left;            /* bytes still available in this pool */
 } small_pool_hdr;
 
-typedef struct large_pool_struct FAR * large_pool_ptr;
+typedef struct large_pool_struct *large_pool_ptr;
 
 typedef struct large_pool_struct {
-  large_pool_ptr next;	/* next in list of pools */
-  size_t bytes_used;		/* how many bytes already used within pool */
-  size_t bytes_left;		/* bytes still available in this pool */
+  large_pool_ptr next;  /* next in list of pools */
+  size_t bytes_used;            /* how many bytes already used within pool */
+  size_t bytes_left;            /* bytes still available in this pool */
 } large_pool_hdr;
 
 /*
@@ -116,7 +118,7 @@
  */
 
 typedef struct {
-  struct jpeg_memory_mgr pub;	/* public fields */
+  struct jpeg_memory_mgr pub;   /* public fields */
 
   /* Each pool identifier (lifetime class) names a linked list of pools. */
   small_pool_ptr small_list[JPOOL_NUMPOOLS];
@@ -136,10 +138,10 @@
   /* alloc_sarray and alloc_barray set this value for use by virtual
    * array routines.
    */
-  JDIMENSION last_rowsperchunk;	/* from most recent alloc_sarray/barray */
+  JDIMENSION last_rowsperchunk; /* from most recent alloc_sarray/barray */
 } my_memory_mgr;
 
-typedef my_memory_mgr * my_mem_ptr;
+typedef my_memory_mgr *my_mem_ptr;
 
 
 /*
@@ -150,39 +152,39 @@
  */
 
 struct jvirt_sarray_control {
-  JSAMPARRAY mem_buffer;	/* => the in-memory buffer */
-  JDIMENSION rows_in_array;	/* total virtual array height */
-  JDIMENSION samplesperrow;	/* width of array (and of memory buffer) */
-  JDIMENSION maxaccess;		/* max rows accessed by access_virt_sarray */
-  JDIMENSION rows_in_mem;	/* height of memory buffer */
-  JDIMENSION rowsperchunk;	/* allocation chunk size in mem_buffer */
-  JDIMENSION cur_start_row;	/* first logical row # in the buffer */
-  JDIMENSION first_undef_row;	/* row # of first uninitialized row */
-  boolean pre_zero;		/* pre-zero mode requested? */
-  boolean dirty;		/* do current buffer contents need written? */
-  boolean b_s_open;		/* is backing-store data valid? */
-  jvirt_sarray_ptr next;	/* link to next virtual sarray control block */
-  backing_store_info b_s_info;	/* System-dependent control info */
+  JSAMPARRAY mem_buffer;        /* => the in-memory buffer */
+  JDIMENSION rows_in_array;     /* total virtual array height */
+  JDIMENSION samplesperrow;     /* width of array (and of memory buffer) */
+  JDIMENSION maxaccess;         /* max rows accessed by access_virt_sarray */
+  JDIMENSION rows_in_mem;       /* height of memory buffer */
+  JDIMENSION rowsperchunk;      /* allocation chunk size in mem_buffer */
+  JDIMENSION cur_start_row;     /* first logical row # in the buffer */
+  JDIMENSION first_undef_row;   /* row # of first uninitialized row */
+  boolean pre_zero;             /* pre-zero mode requested? */
+  boolean dirty;                /* do current buffer contents need written? */
+  boolean b_s_open;             /* is backing-store data valid? */
+  jvirt_sarray_ptr next;        /* link to next virtual sarray control block */
+  backing_store_info b_s_info;  /* System-dependent control info */
 };
 
 struct jvirt_barray_control {
-  JBLOCKARRAY mem_buffer;	/* => the in-memory buffer */
-  JDIMENSION rows_in_array;	/* total virtual array height */
-  JDIMENSION blocksperrow;	/* width of array (and of memory buffer) */
-  JDIMENSION maxaccess;		/* max rows accessed by access_virt_barray */
-  JDIMENSION rows_in_mem;	/* height of memory buffer */
-  JDIMENSION rowsperchunk;	/* allocation chunk size in mem_buffer */
-  JDIMENSION cur_start_row;	/* first logical row # in the buffer */
-  JDIMENSION first_undef_row;	/* row # of first uninitialized row */
-  boolean pre_zero;		/* pre-zero mode requested? */
-  boolean dirty;		/* do current buffer contents need written? */
-  boolean b_s_open;		/* is backing-store data valid? */
-  jvirt_barray_ptr next;	/* link to next virtual barray control block */
-  backing_store_info b_s_info;	/* System-dependent control info */
+  JBLOCKARRAY mem_buffer;       /* => the in-memory buffer */
+  JDIMENSION rows_in_array;     /* total virtual array height */
+  JDIMENSION blocksperrow;      /* width of array (and of memory buffer) */
+  JDIMENSION maxaccess;         /* max rows accessed by access_virt_barray */
+  JDIMENSION rows_in_mem;       /* height of memory buffer */
+  JDIMENSION rowsperchunk;      /* allocation chunk size in mem_buffer */
+  JDIMENSION cur_start_row;     /* first logical row # in the buffer */
+  JDIMENSION first_undef_row;   /* row # of first uninitialized row */
+  boolean pre_zero;             /* pre-zero mode requested? */
+  boolean dirty;                /* do current buffer contents need written? */
+  boolean b_s_open;             /* is backing-store data valid? */
+  jvirt_barray_ptr next;        /* link to next virtual barray control block */
+  backing_store_info b_s_info;  /* System-dependent control info */
 };
 
 
-#ifdef MEM_STATS		/* optional extra stuff for statistics */
+#ifdef MEM_STATS                /* optional extra stuff for statistics */
 
 LOCAL(void)
 print_mem_stats (j_common_ptr cinfo, int pool_id)
@@ -196,19 +198,19 @@
    * This is helpful because message parm array can't handle longs.
    */
   fprintf(stderr, "Freeing pool %d, total space = %ld\n",
-	  pool_id, mem->total_space_allocated);
+          pool_id, mem->total_space_allocated);
 
   for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
        lhdr_ptr = lhdr_ptr->next) {
     fprintf(stderr, "  Large chunk used %ld\n",
-	    (long) lhdr_ptr->bytes_used);
+            (long) lhdr_ptr->bytes_used);
   }
 
   for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
        shdr_ptr = shdr_ptr->next) {
     fprintf(stderr, "  Small chunk used %ld free %ld\n",
-	    (long) shdr_ptr->bytes_used,
-	    (long) shdr_ptr->bytes_left);
+            (long) shdr_ptr->bytes_used,
+            (long) shdr_ptr->bytes_left);
   }
 }
 
@@ -221,7 +223,7 @@
 /* If we compiled MEM_STATS support, report alloc requests before dying */
 {
 #ifdef MEM_STATS
-  cinfo->err->trace_level = 2;	/* force self_destruct to report stats */
+  cinfo->err->trace_level = 2;  /* force self_destruct to report stats */
 #endif
   ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, which);
 }
@@ -244,19 +246,19 @@
  * adjustment.
  */
 
-static const size_t first_pool_slop[JPOOL_NUMPOOLS] = 
+static const size_t first_pool_slop[JPOOL_NUMPOOLS] =
 {
-	1600,			/* first PERMANENT pool */
-	16000			/* first IMAGE pool */
+        1600,                   /* first PERMANENT pool */
+        16000                   /* first IMAGE pool */
 };
 
-static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = 
+static const size_t extra_pool_slop[JPOOL_NUMPOOLS] =
 {
-	0,			/* additional PERMANENT pools */
-	5000			/* additional IMAGE pools */
+        0,                      /* additional PERMANENT pools */
+        5000                    /* additional IMAGE pools */
 };
 
-#define MIN_SLOP  50		/* greater than 0 to avoid futile looping */
+#define MIN_SLOP  50            /* greater than 0 to avoid futile looping */
 
 
 METHODDEF(void *)
@@ -265,7 +267,7 @@
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   small_pool_ptr hdr_ptr, prev_hdr_ptr;
-  char * data_ptr;
+  char *data_ptr;
   size_t min_request, slop;
 
   /*
@@ -274,20 +276,26 @@
    * and so that algorithms can straddle outside the proper area up
    * to the next alignment.
    */
+  if (sizeofobject > MAX_ALLOC_CHUNK) {
+    /* This prevents overflow/wrap-around in round_up_pow2() if sizeofobject
+       is close to SIZE_MAX. */
+    out_of_memory(cinfo, 7);
+  }
   sizeofobject = round_up_pow2(sizeofobject, ALIGN_SIZE);
 
   /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if ((SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
-    out_of_memory(cinfo, 1);	/* request exceeds malloc's ability */
+  if ((sizeof(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) >
+      MAX_ALLOC_CHUNK)
+    out_of_memory(cinfo, 1);    /* request exceeds malloc's ability */
 
   /* See if space is available in any existing pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
-    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
   prev_hdr_ptr = NULL;
   hdr_ptr = mem->small_list[pool_id];
   while (hdr_ptr != NULL) {
     if (hdr_ptr->bytes_left >= sizeofobject)
-      break;			/* found pool with enough space */
+      break;                    /* found pool with enough space */
     prev_hdr_ptr = hdr_ptr;
     hdr_ptr = hdr_ptr->next;
   }
@@ -295,8 +303,8 @@
   /* Time to make a new pool? */
   if (hdr_ptr == NULL) {
     /* min_request is what we need now, slop is what will be leftover */
-    min_request = SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1;
-    if (prev_hdr_ptr == NULL)	/* first pool in class? */
+    min_request = sizeof(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1;
+    if (prev_hdr_ptr == NULL)   /* first pool in class? */
       slop = first_pool_slop[pool_id];
     else
       slop = extra_pool_slop[pool_id];
@@ -307,17 +315,17 @@
     for (;;) {
       hdr_ptr = (small_pool_ptr) jpeg_get_small(cinfo, min_request + slop);
       if (hdr_ptr != NULL)
-	break;
+        break;
       slop /= 2;
-      if (slop < MIN_SLOP)	/* give up when it gets real small */
-	out_of_memory(cinfo, 2); /* jpeg_get_small failed */
+      if (slop < MIN_SLOP)      /* give up when it gets real small */
+        out_of_memory(cinfo, 2); /* jpeg_get_small failed */
     }
     mem->total_space_allocated += min_request + slop;
     /* Success, initialize the new pool header and add to end of list */
     hdr_ptr->next = NULL;
     hdr_ptr->bytes_used = 0;
     hdr_ptr->bytes_left = sizeofobject + slop;
-    if (prev_hdr_ptr == NULL)	/* first pool in class? */
+    if (prev_hdr_ptr == NULL)   /* first pool in class? */
       mem->small_list[pool_id] = hdr_ptr;
     else
       prev_hdr_ptr->next = hdr_ptr;
@@ -325,7 +333,7 @@
 
   /* OK, allocate the object from the current pool */
   data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
-  data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+  data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
   if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
     data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
   data_ptr += hdr_ptr->bytes_used; /* point to place for object */
@@ -339,9 +347,8 @@
 /*
  * Allocation of "large" objects.
  *
- * The external semantics of these are the same as "small" objects,
- * except that FAR pointers are used on 80x86.  However the pool
- * management heuristics are quite different.  We assume that each
+ * The external semantics of these are the same as "small" objects.  However,
+ * the pool management heuristics are quite different.  We assume that each
  * request is large enough that it may as well be passed directly to
  * jpeg_get_large; the pool management just links everything together
  * so that we can free it all on demand.
@@ -350,35 +357,42 @@
  * deliberately bunch rows together to ensure a large request size.
  */
 
-METHODDEF(void FAR *)
+METHODDEF(void *)
 alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 /* Allocate a "large" object */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   large_pool_ptr hdr_ptr;
-  char FAR * data_ptr;
+  char *data_ptr;
 
   /*
    * Round up the requested size to a multiple of ALIGN_SIZE so that
    * algorithms can straddle outside the proper area up to the next
    * alignment.
    */
+  if (sizeofobject > MAX_ALLOC_CHUNK) {
+    /* This prevents overflow/wrap-around in round_up_pow2() if sizeofobject
+       is close to SIZE_MAX. */
+    out_of_memory(cinfo, 8);
+  }
   sizeofobject = round_up_pow2(sizeofobject, ALIGN_SIZE);
 
   /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if ((SIZEOF(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
-    out_of_memory(cinfo, 3);	/* request exceeds malloc's ability */
+  if ((sizeof(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) >
+      MAX_ALLOC_CHUNK)
+    out_of_memory(cinfo, 3);    /* request exceeds malloc's ability */
 
   /* Always make a new pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
-    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
   hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
-					    SIZEOF(large_pool_hdr) +
-					    ALIGN_SIZE - 1);
+                                            sizeof(large_pool_hdr) +
+                                            ALIGN_SIZE - 1);
   if (hdr_ptr == NULL)
-    out_of_memory(cinfo, 4);	/* jpeg_get_large failed */
-  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr) + ALIGN_SIZE - 1;
+    out_of_memory(cinfo, 4);    /* jpeg_get_large failed */
+  mem->total_space_allocated += sizeofobject + sizeof(large_pool_hdr) +
+                                ALIGN_SIZE - 1;
 
   /* Success, initialize the new pool header and add to list */
   hdr_ptr->next = mem->large_list[pool_id];
@@ -390,17 +404,16 @@
   mem->large_list[pool_id] = hdr_ptr;
 
   data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
-  data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+  data_ptr += sizeof(large_pool_hdr); /* ...by skipping the header... */
   if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
     data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
 
-  return (void FAR *) data_ptr;
+  return (void *) data_ptr;
 }
 
 
 /*
  * Creation of 2-D sample arrays.
- * The pointers are in near heap, the samples themselves in FAR heap.
  *
  * To minimize allocation overhead and to allow I/O of large contiguous
  * blocks, we allocate the sample rows in groups of as many rows as possible
@@ -417,7 +430,7 @@
 
 METHODDEF(JSAMPARRAY)
 alloc_sarray (j_common_ptr cinfo, int pool_id,
-	      JDIMENSION samplesperrow, JDIMENSION numrows)
+              JDIMENSION samplesperrow, JDIMENSION numrows)
 /* Allocate a 2-D sample array */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
@@ -427,13 +440,20 @@
   long ltemp;
 
   /* Make sure each row is properly aligned */
-  if ((ALIGN_SIZE % SIZEOF(JSAMPLE)) != 0)
-    out_of_memory(cinfo, 5);	/* safety check */
-  samplesperrow = (JDIMENSION)round_up_pow2(samplesperrow, (2 * ALIGN_SIZE) / SIZEOF(JSAMPLE));
+  if ((ALIGN_SIZE % sizeof(JSAMPLE)) != 0)
+    out_of_memory(cinfo, 5);    /* safety check */
+
+  if (samplesperrow > MAX_ALLOC_CHUNK) {
+    /* This prevents overflow/wrap-around in round_up_pow2() if samplesperrow
+       is close to SIZE_MAX. */
+    out_of_memory(cinfo, 9);
+  }
+  samplesperrow = (JDIMENSION)round_up_pow2(samplesperrow, (2 * ALIGN_SIZE) /
+                                                           sizeof(JSAMPLE));
 
   /* Calculate max # of rows allowed in one allocation chunk */
-  ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
-	  ((long) samplesperrow * SIZEOF(JSAMPLE));
+  ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
+          ((long) samplesperrow * sizeof(JSAMPLE));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
   if (ltemp < (long) numrows)
@@ -444,15 +464,15 @@
 
   /* Get space for row pointers (small object) */
   result = (JSAMPARRAY) alloc_small(cinfo, pool_id,
-				    (size_t) (numrows * SIZEOF(JSAMPROW)));
+                                    (size_t) (numrows * sizeof(JSAMPROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
     workspace = (JSAMPROW) alloc_large(cinfo, pool_id,
-	(size_t) ((size_t) rowsperchunk * (size_t) samplesperrow
-		  * SIZEOF(JSAMPLE)));
+        (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow
+                  * sizeof(JSAMPLE)));
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += samplesperrow;
@@ -470,7 +490,7 @@
 
 METHODDEF(JBLOCKARRAY)
 alloc_barray (j_common_ptr cinfo, int pool_id,
-	      JDIMENSION blocksperrow, JDIMENSION numrows)
+              JDIMENSION blocksperrow, JDIMENSION numrows)
 /* Allocate a 2-D coefficient-block array */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
@@ -480,12 +500,12 @@
   long ltemp;
 
   /* Make sure each row is properly aligned */
-  if ((SIZEOF(JBLOCK) % ALIGN_SIZE) != 0)
-    out_of_memory(cinfo, 6);	/* safety check */
+  if ((sizeof(JBLOCK) % ALIGN_SIZE) != 0)
+    out_of_memory(cinfo, 6);    /* safety check */
 
   /* Calculate max # of rows allowed in one allocation chunk */
-  ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
-	  ((long) blocksperrow * SIZEOF(JBLOCK));
+  ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
+          ((long) blocksperrow * sizeof(JBLOCK));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
   if (ltemp < (long) numrows)
@@ -496,15 +516,15 @@
 
   /* Get space for row pointers (small object) */
   result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
-				     (size_t) (numrows * SIZEOF(JBLOCKROW)));
+                                     (size_t) (numrows * sizeof(JBLOCKROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
     workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
-	(size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
-		  * SIZEOF(JBLOCK)));
+        (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
+                  * sizeof(JBLOCK)));
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += blocksperrow;
@@ -554,8 +574,8 @@
 
 METHODDEF(jvirt_sarray_ptr)
 request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
-		     JDIMENSION samplesperrow, JDIMENSION numrows,
-		     JDIMENSION maxaccess)
+                     JDIMENSION samplesperrow, JDIMENSION numrows,
+                     JDIMENSION maxaccess)
 /* Request a virtual 2-D sample array */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
@@ -563,18 +583,18 @@
 
   /* Only IMAGE-lifetime virtual arrays are currently supported */
   if (pool_id != JPOOL_IMAGE)
-    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
   /* get control block */
   result = (jvirt_sarray_ptr) alloc_small(cinfo, pool_id,
-					  SIZEOF(struct jvirt_sarray_control));
+                                          sizeof(struct jvirt_sarray_control));
 
-  result->mem_buffer = NULL;	/* marks array not yet realized */
+  result->mem_buffer = NULL;    /* marks array not yet realized */
   result->rows_in_array = numrows;
   result->samplesperrow = samplesperrow;
   result->maxaccess = maxaccess;
   result->pre_zero = pre_zero;
-  result->b_s_open = FALSE;	/* no associated backing-store object */
+  result->b_s_open = FALSE;     /* no associated backing-store object */
   result->next = mem->virt_sarray_list; /* add to list of virtual arrays */
   mem->virt_sarray_list = result;
 
@@ -584,8 +604,8 @@
 
 METHODDEF(jvirt_barray_ptr)
 request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
-		     JDIMENSION blocksperrow, JDIMENSION numrows,
-		     JDIMENSION maxaccess)
+                     JDIMENSION blocksperrow, JDIMENSION numrows,
+                     JDIMENSION maxaccess)
 /* Request a virtual 2-D coefficient-block array */
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
@@ -593,18 +613,18 @@
 
   /* Only IMAGE-lifetime virtual arrays are currently supported */
   if (pool_id != JPOOL_IMAGE)
-    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
   /* get control block */
   result = (jvirt_barray_ptr) alloc_small(cinfo, pool_id,
-					  SIZEOF(struct jvirt_barray_control));
+                                          sizeof(struct jvirt_barray_control));
 
-  result->mem_buffer = NULL;	/* marks array not yet realized */
+  result->mem_buffer = NULL;    /* marks array not yet realized */
   result->rows_in_array = numrows;
   result->blocksperrow = blocksperrow;
   result->maxaccess = maxaccess;
   result->pre_zero = pre_zero;
-  result->b_s_open = FALSE;	/* no associated backing-store object */
+  result->b_s_open = FALSE;     /* no associated backing-store object */
   result->next = mem->virt_barray_list; /* add to list of virtual arrays */
   mem->virt_barray_list = result;
 
@@ -631,26 +651,26 @@
   for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
       space_per_minheight += (long) sptr->maxaccess *
-			     (long) sptr->samplesperrow * SIZEOF(JSAMPLE);
+                             (long) sptr->samplesperrow * sizeof(JSAMPLE);
       maximum_space += (long) sptr->rows_in_array *
-		       (long) sptr->samplesperrow * SIZEOF(JSAMPLE);
+                       (long) sptr->samplesperrow * sizeof(JSAMPLE);
     }
   }
   for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
     if (bptr->mem_buffer == NULL) { /* if not realized yet */
       space_per_minheight += (long) bptr->maxaccess *
-			     (long) bptr->blocksperrow * SIZEOF(JBLOCK);
+                             (long) bptr->blocksperrow * sizeof(JBLOCK);
       maximum_space += (long) bptr->rows_in_array *
-		       (long) bptr->blocksperrow * SIZEOF(JBLOCK);
+                       (long) bptr->blocksperrow * sizeof(JBLOCK);
     }
   }
 
   if (space_per_minheight <= 0)
-    return;			/* no unrealized arrays, no work */
+    return;                     /* no unrealized arrays, no work */
 
   /* Determine amount of memory to actually use; this is system-dependent. */
   avail_mem = jpeg_mem_available(cinfo, space_per_minheight, maximum_space,
-				 mem->total_space_allocated);
+                                 mem->total_space_allocated);
 
   /* If the maximum space needed is available, make all the buffers full
    * height; otherwise parcel it out with the same number of minheights
@@ -673,19 +693,19 @@
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
       minheights = ((long) sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
       if (minheights <= max_minheights) {
-	/* This buffer fits in memory */
-	sptr->rows_in_mem = sptr->rows_in_array;
+        /* This buffer fits in memory */
+        sptr->rows_in_mem = sptr->rows_in_array;
       } else {
-	/* It doesn't fit in memory, create backing store. */
-	sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess);
-	jpeg_open_backing_store(cinfo, & sptr->b_s_info,
-				(long) sptr->rows_in_array *
-				(long) sptr->samplesperrow *
-				(long) SIZEOF(JSAMPLE));
-	sptr->b_s_open = TRUE;
+        /* It doesn't fit in memory, create backing store. */
+        sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess);
+        jpeg_open_backing_store(cinfo, & sptr->b_s_info,
+                                (long) sptr->rows_in_array *
+                                (long) sptr->samplesperrow *
+                                (long) sizeof(JSAMPLE));
+        sptr->b_s_open = TRUE;
       }
       sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE,
-				      sptr->samplesperrow, sptr->rows_in_mem);
+                                      sptr->samplesperrow, sptr->rows_in_mem);
       sptr->rowsperchunk = mem->last_rowsperchunk;
       sptr->cur_start_row = 0;
       sptr->first_undef_row = 0;
@@ -697,19 +717,19 @@
     if (bptr->mem_buffer == NULL) { /* if not realized yet */
       minheights = ((long) bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
       if (minheights <= max_minheights) {
-	/* This buffer fits in memory */
-	bptr->rows_in_mem = bptr->rows_in_array;
+        /* This buffer fits in memory */
+        bptr->rows_in_mem = bptr->rows_in_array;
       } else {
-	/* It doesn't fit in memory, create backing store. */
-	bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess);
-	jpeg_open_backing_store(cinfo, & bptr->b_s_info,
-				(long) bptr->rows_in_array *
-				(long) bptr->blocksperrow *
-				(long) SIZEOF(JBLOCK));
-	bptr->b_s_open = TRUE;
+        /* It doesn't fit in memory, create backing store. */
+        bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess);
+        jpeg_open_backing_store(cinfo, & bptr->b_s_info,
+                                (long) bptr->rows_in_array *
+                                (long) bptr->blocksperrow *
+                                (long) sizeof(JBLOCK));
+        bptr->b_s_open = TRUE;
       }
       bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE,
-				      bptr->blocksperrow, bptr->rows_in_mem);
+                                      bptr->blocksperrow, bptr->rows_in_mem);
       bptr->rowsperchunk = mem->last_rowsperchunk;
       bptr->cur_start_row = 0;
       bptr->first_undef_row = 0;
@@ -725,7 +745,7 @@
 {
   long bytesperrow, file_offset, byte_count, rows, thisrow, i;
 
-  bytesperrow = (long) ptr->samplesperrow * SIZEOF(JSAMPLE);
+  bytesperrow = (long) ptr->samplesperrow * sizeof(JSAMPLE);
   file_offset = ptr->cur_start_row * bytesperrow;
   /* Loop to read or write each allocation chunk in mem_buffer */
   for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
@@ -736,17 +756,17 @@
     rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
     /* Transfer no more than fits in file */
     rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
-    if (rows <= 0)		/* this chunk might be past end of file! */
+    if (rows <= 0)              /* this chunk might be past end of file! */
       break;
     byte_count = rows * bytesperrow;
     if (writing)
       (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
-					    (void FAR *) ptr->mem_buffer[i],
-					    file_offset, byte_count);
+                                            (void *) ptr->mem_buffer[i],
+                                            file_offset, byte_count);
     else
       (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
-					   (void FAR *) ptr->mem_buffer[i],
-					   file_offset, byte_count);
+                                           (void *) ptr->mem_buffer[i],
+                                           file_offset, byte_count);
     file_offset += byte_count;
   }
 }
@@ -758,7 +778,7 @@
 {
   long bytesperrow, file_offset, byte_count, rows, thisrow, i;
 
-  bytesperrow = (long) ptr->blocksperrow * SIZEOF(JBLOCK);
+  bytesperrow = (long) ptr->blocksperrow * sizeof(JBLOCK);
   file_offset = ptr->cur_start_row * bytesperrow;
   /* Loop to read or write each allocation chunk in mem_buffer */
   for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
@@ -769,17 +789,17 @@
     rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
     /* Transfer no more than fits in file */
     rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
-    if (rows <= 0)		/* this chunk might be past end of file! */
+    if (rows <= 0)              /* this chunk might be past end of file! */
       break;
     byte_count = rows * bytesperrow;
     if (writing)
       (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
-					    (void FAR *) ptr->mem_buffer[i],
-					    file_offset, byte_count);
+                                            (void *) ptr->mem_buffer[i],
+                                            file_offset, byte_count);
     else
       (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
-					   (void FAR *) ptr->mem_buffer[i],
-					   file_offset, byte_count);
+                                           (void *) ptr->mem_buffer[i],
+                                           file_offset, byte_count);
     file_offset += byte_count;
   }
 }
@@ -787,8 +807,8 @@
 
 METHODDEF(JSAMPARRAY)
 access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
-		    JDIMENSION start_row, JDIMENSION num_rows,
-		    boolean writable)
+                    JDIMENSION start_row, JDIMENSION num_rows,
+                    boolean writable)
 /* Access the part of a virtual sample array starting at start_row */
 /* and extending for num_rows rows.  writable is true if  */
 /* caller intends to modify the accessed area. */
@@ -826,7 +846,7 @@
 
       ltemp = (long) end_row - (long) ptr->rows_in_mem;
       if (ltemp < 0)
-	ltemp = 0;		/* don't fall off front end of file */
+        ltemp = 0;              /* don't fall off front end of file */
       ptr->cur_start_row = (JDIMENSION) ltemp;
     }
     /* Read in the selected part of the array.
@@ -841,25 +861,25 @@
    */
   if (ptr->first_undef_row < end_row) {
     if (ptr->first_undef_row < start_row) {
-      if (writable)		/* writer skipped over a section of array */
-	ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
-      undef_row = start_row;	/* but reader is allowed to read ahead */
+      if (writable)             /* writer skipped over a section of array */
+        ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+      undef_row = start_row;    /* but reader is allowed to read ahead */
     } else {
       undef_row = ptr->first_undef_row;
     }
     if (writable)
       ptr->first_undef_row = end_row;
     if (ptr->pre_zero) {
-      size_t bytesperrow = (size_t) ptr->samplesperrow * SIZEOF(JSAMPLE);
+      size_t bytesperrow = (size_t) ptr->samplesperrow * sizeof(JSAMPLE);
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-	jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
-	undef_row++;
+        jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+        undef_row++;
       }
     } else {
-      if (! writable)		/* reader looking at undefined data */
-	ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+      if (! writable)           /* reader looking at undefined data */
+        ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
     }
   }
   /* Flag the buffer dirty if caller will write in it */
@@ -872,8 +892,8 @@
 
 METHODDEF(JBLOCKARRAY)
 access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
-		    JDIMENSION start_row, JDIMENSION num_rows,
-		    boolean writable)
+                    JDIMENSION start_row, JDIMENSION num_rows,
+                    boolean writable)
 /* Access the part of a virtual block array starting at start_row */
 /* and extending for num_rows rows.  writable is true if  */
 /* caller intends to modify the accessed area. */
@@ -911,7 +931,7 @@
 
       ltemp = (long) end_row - (long) ptr->rows_in_mem;
       if (ltemp < 0)
-	ltemp = 0;		/* don't fall off front end of file */
+        ltemp = 0;              /* don't fall off front end of file */
       ptr->cur_start_row = (JDIMENSION) ltemp;
     }
     /* Read in the selected part of the array.
@@ -926,25 +946,25 @@
    */
   if (ptr->first_undef_row < end_row) {
     if (ptr->first_undef_row < start_row) {
-      if (writable)		/* writer skipped over a section of array */
-	ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
-      undef_row = start_row;	/* but reader is allowed to read ahead */
+      if (writable)             /* writer skipped over a section of array */
+        ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+      undef_row = start_row;    /* but reader is allowed to read ahead */
     } else {
       undef_row = ptr->first_undef_row;
     }
     if (writable)
       ptr->first_undef_row = end_row;
     if (ptr->pre_zero) {
-      size_t bytesperrow = (size_t) ptr->blocksperrow * SIZEOF(JBLOCK);
+      size_t bytesperrow = (size_t) ptr->blocksperrow * sizeof(JBLOCK);
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-	jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
-	undef_row++;
+        jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+        undef_row++;
       }
     } else {
-      if (! writable)		/* reader looking at undefined data */
-	ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
+      if (! writable)           /* reader looking at undefined data */
+        ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
     }
   }
   /* Flag the buffer dirty if caller will write in it */
@@ -968,7 +988,7 @@
   size_t space_freed;
 
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
-    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
+    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
 #ifdef MEM_STATS
   if (cinfo->err->trace_level > 1)
@@ -981,16 +1001,16 @@
     jvirt_barray_ptr bptr;
 
     for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
-      if (sptr->b_s_open) {	/* there may be no backing store */
-	sptr->b_s_open = FALSE;	/* prevent recursive close if error */
-	(*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info);
+      if (sptr->b_s_open) {     /* there may be no backing store */
+        sptr->b_s_open = FALSE; /* prevent recursive close if error */
+        (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info);
       }
     }
     mem->virt_sarray_list = NULL;
     for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
-      if (bptr->b_s_open) {	/* there may be no backing store */
-	bptr->b_s_open = FALSE;	/* prevent recursive close if error */
-	(*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info);
+      if (bptr->b_s_open) {     /* there may be no backing store */
+        bptr->b_s_open = FALSE; /* prevent recursive close if error */
+        (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info);
       }
     }
     mem->virt_barray_list = NULL;
@@ -1003,9 +1023,9 @@
   while (lhdr_ptr != NULL) {
     large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
     space_freed = lhdr_ptr->bytes_used +
-		  lhdr_ptr->bytes_left +
-		  SIZEOF(large_pool_hdr);
-    jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed);
+                  lhdr_ptr->bytes_left +
+                  sizeof(large_pool_hdr);
+    jpeg_free_large(cinfo, (void *) lhdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
     lhdr_ptr = next_lhdr_ptr;
   }
@@ -1017,8 +1037,8 @@
   while (shdr_ptr != NULL) {
     small_pool_ptr next_shdr_ptr = shdr_ptr->next;
     space_freed = shdr_ptr->bytes_used +
-		  shdr_ptr->bytes_left +
-		  SIZEOF(small_pool_hdr);
+                  shdr_ptr->bytes_left +
+                  sizeof(small_pool_hdr);
     jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
     shdr_ptr = next_shdr_ptr;
@@ -1045,10 +1065,10 @@
   }
 
   /* Release the memory manager control block too. */
-  jpeg_free_small(cinfo, (void *) cinfo->mem, SIZEOF(my_memory_mgr));
-  cinfo->mem = NULL;		/* ensures I will be called only once */
+  jpeg_free_small(cinfo, (void *) cinfo->mem, sizeof(my_memory_mgr));
+  cinfo->mem = NULL;            /* ensures I will be called only once */
 
-  jpeg_mem_term(cinfo);		/* system-dependent cleanup */
+  jpeg_mem_term(cinfo);         /* system-dependent cleanup */
 }
 
 
@@ -1065,10 +1085,10 @@
   int pool;
   size_t test_mac;
 
-  cinfo->mem = NULL;		/* for safety if init fails */
+  cinfo->mem = NULL;            /* for safety if init fails */
 
   /* Check for configuration errors.
-   * SIZEOF(ALIGN_TYPE) should be a power of 2; otherwise, it probably
+   * sizeof(ALIGN_TYPE) should be a power of 2; otherwise, it probably
    * doesn't reflect any real hardware alignment requirement.
    * The test is a little tricky: for X>0, X and X-1 have no one-bits
    * in common if and only if X is a power of 2, ie has only one one-bit.
@@ -1089,10 +1109,10 @@
   max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
 
   /* Attempt to allocate memory manager's control block */
-  mem = (my_mem_ptr) jpeg_get_small(cinfo, SIZEOF(my_memory_mgr));
+  mem = (my_mem_ptr) jpeg_get_small(cinfo, sizeof(my_memory_mgr));
 
   if (mem == NULL) {
-    jpeg_mem_term(cinfo);	/* system-dependent cleanup */
+    jpeg_mem_term(cinfo);       /* system-dependent cleanup */
     ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 0);
   }
 
@@ -1122,7 +1142,7 @@
   mem->virt_sarray_list = NULL;
   mem->virt_barray_list = NULL;
 
-  mem->total_space_allocated = SIZEOF(my_memory_mgr);
+  mem->total_space_allocated = sizeof(my_memory_mgr);
 
   /* Declare ourselves open for business */
   cinfo->mem = & mem->pub;
@@ -1134,15 +1154,15 @@
    * this feature.
    */
 #ifndef NO_GETENV
-  { char * memenv;
+  { char *memenv;
 
     if ((memenv = getenv("JPEGMEM")) != NULL) {
       char ch = 'x';
 
       if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
-	if (ch == 'm' || ch == 'M')
-	  max_to_use *= 1000L;
-	mem->pub.max_memory_to_use = max_to_use * 1000L;
+        if (ch == 'm' || ch == 'M')
+          max_to_use *= 1000L;
+        mem->pub.max_memory_to_use = max_to_use * 1000L;
       }
     }
   }
diff --git a/jmemnobs.c b/jmemnobs.c
index 34b1895..5797198 100644
--- a/jmemnobs.c
+++ b/jmemnobs.c
@@ -1,9 +1,12 @@
 /*
  * jmemnobs.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1992-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file provides a really simple implementation of the system-
  * dependent portion of the JPEG memory manager.  This implementation
@@ -18,11 +21,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
+#include "jmemsys.h"            /* import the system-dependent declarations */
 
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
+extern void *malloc (size_t size);
+extern void free (void *ptr);
 #endif
 
 
@@ -38,7 +41,7 @@
 }
 
 GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
+jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
@@ -46,19 +49,16 @@
 
 /*
  * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
  */
 
-GLOBAL(void FAR *)
+GLOBAL(void *)
 jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
 {
-  return (void FAR *) malloc(sizeofobject);
+  return (void *) malloc(sizeofobject);
 }
 
 GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
+jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
@@ -71,7 +71,7 @@
 
 GLOBAL(size_t)
 jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
-		    size_t max_bytes_needed, size_t already_allocated)
+                    size_t max_bytes_needed, size_t already_allocated)
 {
   return max_bytes_needed;
 }
@@ -85,7 +85,7 @@
 
 GLOBAL(void)
 jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
+                         long total_bytes_needed)
 {
   ERREXIT(cinfo, JERR_NO_BACKING_STORE);
 }
@@ -99,7 +99,7 @@
 GLOBAL(long)
 jpeg_mem_init (j_common_ptr cinfo)
 {
-  return 0;			/* just set max_memory_to_use to 0 */
+  return 0;                     /* just set max_memory_to_use to 0 */
 }
 
 GLOBAL(void)
diff --git a/jmemsys.h b/jmemsys.h
index b190945..f7dfe87 100644
--- a/jmemsys.h
+++ b/jmemsys.h
@@ -1,9 +1,12 @@
 /*
  * jmemsys.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This include file defines the interface between the system-independent
  * and system-dependent portions of the JPEG memory manager.  No other
@@ -14,25 +17,10 @@
  * in the IJG distribution.  You may need to modify it if you write a
  * custom memory manager.  If system-dependent changes are needed in
  * this file, the best method is to #ifdef them based on a configuration
- * symbol supplied in jconfig.h, as we have done with USE_MSDOS_MEMMGR
- * and USE_MAC_MEMMGR.
+ * symbol supplied in jconfig.h.
  */
 
 
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_get_small		jGetSmall
-#define jpeg_free_small		jFreeSmall
-#define jpeg_get_large		jGetLarge
-#define jpeg_free_large		jFreeLarge
-#define jpeg_mem_available	jMemAvail
-#define jpeg_open_backing_store	jOpenBackStore
-#define jpeg_mem_init		jMemInit
-#define jpeg_mem_term		jMemTerm
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
 /*
  * These two functions are used to allocate and release small chunks of
  * memory.  (Typically the total amount requested through jpeg_get_small is
@@ -41,40 +29,36 @@
  * and free; in particular, jpeg_get_small must return NULL on failure.
  * On most systems, these ARE malloc and free.  jpeg_free_small is passed the
  * size of the object being freed, just in case it's needed.
- * On an 80x86 machine using small-data memory model, these manage near heap.
  */
 
-EXTERN(void *) jpeg_get_small JPP((j_common_ptr cinfo, size_t sizeofobject));
-EXTERN(void) jpeg_free_small JPP((j_common_ptr cinfo, void * object,
-				  size_t sizeofobject));
+EXTERN(void *) jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
+                              size_t sizeofobject);
 
 /*
  * These two functions are used to allocate and release large chunks of
  * memory (up to the total free space designated by jpeg_mem_available).
- * The interface is the same as above, except that on an 80x86 machine,
- * far pointers are used.  On most other machines these are identical to
- * the jpeg_get/free_small routines; but we keep them separate anyway,
- * in case a different allocation strategy is desirable for large chunks.
+ * These are identical to the jpeg_get/free_small routines; but we keep them
+ * separate anyway, in case a different allocation strategy is desirable for
+ * large chunks.
  */
 
-EXTERN(void FAR *) jpeg_get_large JPP((j_common_ptr cinfo,
-				       size_t sizeofobject));
-EXTERN(void) jpeg_free_large JPP((j_common_ptr cinfo, void FAR * object,
-				  size_t sizeofobject));
+EXTERN(void *) jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
+                              size_t sizeofobject);
 
 /*
  * The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may
  * be requested in a single call to jpeg_get_large (and jpeg_get_small for that
- * matter, but that case should never come into play).  This macro is needed
+ * matter, but that case should never come into play).  This macro was needed
  * to model the 64Kb-segment-size limit of far addressing on 80x86 machines.
- * On those machines, we expect that jconfig.h will provide a proper value.
- * On machines with 32-bit flat address spaces, any large constant may be used.
+ * On machines with flat address spaces, any large constant may be used.
  *
  * NB: jmemmgr.c expects that MAX_ALLOC_CHUNK will be representable as type
  * size_t and will be a multiple of sizeof(align_type).
  */
 
-#ifndef MAX_ALLOC_CHUNK		/* may be overridden in jconfig.h */
+#ifndef MAX_ALLOC_CHUNK         /* may be overridden in jconfig.h */
 #define MAX_ALLOC_CHUNK  1000000000L
 #endif
 
@@ -100,10 +84,9 @@
  * Conversely, zero may be returned to always use the minimum amount of memory.
  */
 
-EXTERN(size_t) jpeg_mem_available JPP((j_common_ptr cinfo,
-				     size_t min_bytes_needed,
-				     size_t max_bytes_needed,
-				     size_t already_allocated));
+EXTERN(size_t) jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
+                                   size_t max_bytes_needed,
+                                   size_t already_allocated);
 
 
 /*
@@ -113,56 +96,53 @@
  * are private to the system-dependent backing store routines.
  */
 
-#define TEMP_NAME_LENGTH   64	/* max length of a temporary file's name */
+#define TEMP_NAME_LENGTH   64   /* max length of a temporary file's name */
 
 
-#ifdef USE_MSDOS_MEMMGR		/* DOS-specific junk */
+#ifdef USE_MSDOS_MEMMGR         /* DOS-specific junk */
 
-typedef unsigned short XMSH;	/* type of extended-memory handles */
-typedef unsigned short EMSH;	/* type of expanded-memory handles */
+typedef unsigned short XMSH;    /* type of extended-memory handles */
+typedef unsigned short EMSH;    /* type of expanded-memory handles */
 
 typedef union {
-  short file_handle;		/* DOS file handle if it's a temp file */
-  XMSH xms_handle;		/* handle if it's a chunk of XMS */
-  EMSH ems_handle;		/* handle if it's a chunk of EMS */
+  short file_handle;            /* DOS file handle if it's a temp file */
+  XMSH xms_handle;              /* handle if it's a chunk of XMS */
+  EMSH ems_handle;              /* handle if it's a chunk of EMS */
 } handle_union;
 
 #endif /* USE_MSDOS_MEMMGR */
 
-#ifdef USE_MAC_MEMMGR		/* Mac-specific junk */
+#ifdef USE_MAC_MEMMGR           /* Mac-specific junk */
 #include <Files.h>
 #endif /* USE_MAC_MEMMGR */
 
 
-typedef struct backing_store_struct * backing_store_ptr;
+typedef struct backing_store_struct *backing_store_ptr;
 
 typedef struct backing_store_struct {
   /* Methods for reading/writing/closing this backing-store object */
-  JMETHOD(void, read_backing_store, (j_common_ptr cinfo,
-				     backing_store_ptr info,
-				     void FAR * buffer_address,
-				     long file_offset, long byte_count));
-  JMETHOD(void, write_backing_store, (j_common_ptr cinfo,
-				      backing_store_ptr info,
-				      void FAR * buffer_address,
-				      long file_offset, long byte_count));
-  JMETHOD(void, close_backing_store, (j_common_ptr cinfo,
-				      backing_store_ptr info));
+  void (*read_backing_store) (j_common_ptr cinfo, backing_store_ptr info,
+                              void *buffer_address, long file_offset,
+                              long byte_count);
+  void (*write_backing_store) (j_common_ptr cinfo, backing_store_ptr info,
+                               void *buffer_address, long file_offset,
+                               long byte_count);
+  void (*close_backing_store) (j_common_ptr cinfo, backing_store_ptr info);
 
   /* Private fields for system-dependent backing-store management */
 #ifdef USE_MSDOS_MEMMGR
   /* For the MS-DOS manager (jmemdos.c), we need: */
-  handle_union handle;		/* reference to backing-store storage object */
+  handle_union handle;          /* reference to backing-store storage object */
   char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
 #else
 #ifdef USE_MAC_MEMMGR
   /* For the Mac manager (jmemmac.c), we need: */
-  short temp_file;		/* file reference number to temp file */
-  FSSpec tempSpec;		/* the FSSpec for the temp file */
+  short temp_file;              /* file reference number to temp file */
+  FSSpec tempSpec;              /* the FSSpec for the temp file */
   char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
 #else
   /* For a typical implementation with temp files, we need: */
-  FILE * temp_file;		/* stdio reference to temp file */
+  FILE *temp_file;              /* stdio reference to temp file */
   char temp_name[TEMP_NAME_LENGTH]; /* name of temp file */
 #endif
 #endif
@@ -177,9 +157,9 @@
  * just take an error exit.)
  */
 
-EXTERN(void) jpeg_open_backing_store JPP((j_common_ptr cinfo,
-					  backing_store_ptr info,
-					  long total_bytes_needed));
+EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
+                                      backing_store_ptr info,
+                                      long total_bytes_needed);
 
 
 /*
@@ -194,5 +174,5 @@
  * all opened backing-store objects have been closed.
  */
 
-EXTERN(long) jpeg_mem_init JPP((j_common_ptr cinfo));
-EXTERN(void) jpeg_mem_term JPP((j_common_ptr cinfo));
+EXTERN(long) jpeg_mem_init (j_common_ptr cinfo);
+EXTERN(void) jpeg_mem_term (j_common_ptr cinfo);
diff --git a/jmorecfg.h b/jmorecfg.h
index d8738ae..1d96786 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -3,9 +3,11 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modifications:
- * Copyright (C) 2009, 2011, 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains additional configuration options that customize the
  * JPEG software for special applications or support machine-dependent
@@ -14,18 +16,6 @@
 
 
 /*
- * Define BITS_IN_JSAMPLE as either
- *   8   for 8-bit sample values (the usual setting)
- *   12  for 12-bit sample values
- * Only 8 and 12 are legal data precisions for lossy JPEG according to the
- * JPEG standard, and the IJG code does not support anything else!
- * We do not support run-time selection of data precision, sorry.
- */
-
-#define BITS_IN_JSAMPLE  8	/* use 8 or 12 */
-
-
-/*
  * Maximum number of components (color channels) allowed in JPEG image.
  * To meet the letter of the JPEG spec, set this to 255.  However, darn
  * few applications need more than 4 channels (maybe 5 for CMYK + alpha
@@ -34,7 +24,7 @@
  * bytes of storage, whether actually used in an image or not.)
  */
 
-#define MAX_COMPONENTS  10	/* maximum number of image components */
+#define MAX_COMPONENTS  10      /* maximum number of image components */
 
 
 /*
@@ -72,8 +62,8 @@
 
 #endif /* HAVE_UNSIGNED_CHAR */
 
-#define MAXJSAMPLE	255
-#define CENTERJSAMPLE	128
+#define MAXJSAMPLE      255
+#define CENTERJSAMPLE   128
 
 #endif /* BITS_IN_JSAMPLE == 8 */
 
@@ -86,8 +76,8 @@
 typedef short JSAMPLE;
 #define GETJSAMPLE(value)  ((int) (value))
 
-#define MAXJSAMPLE	4095
-#define CENTERJSAMPLE	2048
+#define MAXJSAMPLE      4095
+#define CENTERJSAMPLE   2048
 
 #endif /* BITS_IN_JSAMPLE == 12 */
 
@@ -153,27 +143,52 @@
 
 /* INT16 must hold at least the values -32768..32767. */
 
-#ifndef XMD_H			/* X11/xmd.h correctly defines INT16 */
-#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT32 */
+#ifndef XMD_H                   /* X11/xmd.h correctly defines INT16 */
 typedef short INT16;
 #endif
-#endif
 
-/* INT32 must hold at least signed 32-bit values. */
+/* INT32 must hold at least signed 32-bit values.
+ *
+ * NOTE: The INT32 typedef dates back to libjpeg v5 (1994.)  Integers were
+ * sometimes 16-bit back then (MS-DOS), which is why INT32 is typedef'd to
+ * long.  It also wasn't common (or at least as common) in 1994 for INT32 to be
+ * defined by platform headers.  Since then, however, INT32 is defined in
+ * several other common places:
+ *
+ * Xmd.h (X11 header) typedefs INT32 to int on 64-bit platforms and long on
+ * 32-bit platforms (i.e. always a 32-bit signed type.)
+ *
+ * basetsd.h (Win32 header) typedefs INT32 to int (always a 32-bit signed type
+ * on modern platforms.)
+ *
+ * qglobal.h (Qt header) typedefs INT32 to int (always a 32-bit signed type on
+ * modern platforms.)
+ *
+ * This is a recipe for conflict, since "long" and "int" aren't always
+ * compatible types.  Since the definition of INT32 has technically been part
+ * of the libjpeg API for more than 20 years, we can't remove it, but we do not
+ * use it internally any longer.  We instead define a separate type (JLONG)
+ * for internal use, which ensures that internal behavior will always be the
+ * same regardless of any external headers that may be included.
+ */
 
-#ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
-#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT32 */
+#ifndef XMD_H                   /* X11/xmd.h correctly defines INT32 */
+#ifndef _BASETSD_H_             /* Microsoft defines it in basetsd.h */
+#ifndef _BASETSD_H              /* MinGW is slightly different */
+#ifndef QGLOBAL_H               /* Qt defines it in qglobal.h */
 typedef long INT32;
 #endif
 #endif
+#endif
+#endif
 
 /* Datatype used for image dimensions.  The JPEG standard only supports
  * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
  * "unsigned int" is sufficient on all machines.  However, if you need to
  * handle larger images and you don't mind deviating from the spec, you
- * can change this datatype. Note that changing this type will require
- * potentially updating the assembly code to correctly use the new type
- * size.
+ * can change this datatype.  (Note that changing this datatype will
+ * potentially require modifying the SIMD code.  The x86-64 SIMD extensions,
+ * in particular, assume a 32-bit JDIMENSION.)
  */
 
 typedef unsigned int JDIMENSION;
@@ -189,44 +204,31 @@
  */
 
 /* a function called through method pointers: */
-#define METHODDEF(type)		static type
+#define METHODDEF(type)         static type
 /* a function used only in its module: */
-#define LOCAL(type)		static type
+#define LOCAL(type)             static type
 /* a function referenced thru EXTERNs: */
-#define GLOBAL(type)		type
+#define GLOBAL(type)            type
 /* a reference to a GLOBAL function: */
-#define EXTERN(type)		extern type
+#define EXTERN(type)            extern type
 
 
-/* This macro is used to declare a "method", that is, a function pointer.
- * We want to supply prototype parameters if the compiler can cope.
- * Note that the arglist parameter must be parenthesized!
- * Again, you can customize this if you need special linkage keywords.
+/* Originally, this macro was used to declare "methods" (function pointers)
+ * in a way that worked with both modern compilers and older compilers that
+ * did not support prototype parameters.  libjpeg-turbo has never supported
+ * these older, non-ANSI compilers, but the macro is still included because
+ * there is some software out there that uses it.
  */
 
-#ifdef HAVE_PROTOTYPES
 #define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
-#else
-#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
-#endif
 
 
-/* Here is the pseudo-keyword for declaring pointers that must be "far"
- * on 80x86 machines.  Most of the specialized coding for 80x86 is handled
- * by just saying "FAR *" where such a pointer is needed.  In a few places
- * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
+/* libjpeg-turbo no longer supports platforms that have far pointers (MS-DOS),
+ * but again, some software relies on this macro, so it is defined as empty.
  */
 
-#ifndef FAR
-#ifdef NEED_FAR_POINTERS
-#ifndef FAR
-#define FAR  far
-#endif
-#else
 #undef FAR
 #define FAR
-#endif
-#endif
 
 
 /*
@@ -239,11 +241,11 @@
 #ifndef HAVE_BOOLEAN
 typedef int boolean;
 #endif
-#ifndef FALSE			/* in case these macros already exist */
-#define FALSE	0		/* values of boolean */
+#ifndef FALSE                   /* in case these macros already exist */
+#define FALSE   0               /* values of boolean */
 #endif
 #ifndef TRUE
-#define TRUE	1
+#define TRUE    1
 #endif
 
 
@@ -271,15 +273,15 @@
 
 /* Capability options common to encoder and decoder: */
 
-#define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
-#define DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
-#define DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
+#define DCT_ISLOW_SUPPORTED     /* slow but accurate integer algorithm */
+#define DCT_IFAST_SUPPORTED     /* faster, less accurate integer method */
+#define DCT_FLOAT_SUPPORTED     /* floating-point: accurate, fast on fast HW */
 
 /* Encoder capability options: */
 
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
-#define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
-#define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
+#define C_PROGRESSIVE_SUPPORTED     /* Progressive JPEG? (Requires MULTISCAN)*/
+#define ENTROPY_OPT_SUPPORTED       /* Optimization of entropy coding parms? */
 /* Note: if you selected 12-bit data precision, it is dangerous to turn off
  * ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only good for 8-bit
  * precision, so jchuff.c normally uses entropy optimization to compute
@@ -293,37 +295,43 @@
 /* Decoder capability options: */
 
 #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
-#define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
-#define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
+#define D_PROGRESSIVE_SUPPORTED     /* Progressive JPEG? (Requires MULTISCAN)*/
+#define SAVE_MARKERS_SUPPORTED      /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
-#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
+#define IDCT_SCALING_SUPPORTED      /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
-#define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
-#define QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */
+#define QUANT_1PASS_SUPPORTED       /* 1-pass color quantization? */
+#define QUANT_2PASS_SUPPORTED       /* 2-pass color quantization? */
 
 /* more capability options later, no doubt */
 
 
 /*
- * Ordering of RGB data in scanlines passed to or from the application.
- * If your application wants to deal with data in the order B,G,R, just
- * change these macros.  You can also deal with formats such as R,G,B,X
- * (one extra byte per pixel) by changing RGB_PIXELSIZE.  Note that changing
- * the offsets will also change the order in which colormap data is organized.
- * RESTRICTIONS:
- * 1. The sample applications cjpeg,djpeg do NOT support modified RGB formats.
- * 2. These macros only affect RGB<=>YCbCr color conversion, so they are not
- *    useful if you are using JPEG color spaces other than YCbCr or grayscale.
- * 3. The color quantizer modules will not behave desirably if RGB_PIXELSIZE
- *    is not 3 (they don't understand about dummy color components!).  So you
- *    can't use color quantization if you change that value.
+ * The RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros are a vestigial
+ * feature of libjpeg.  The idea was that, if an application developer needed
+ * to compress from/decompress to a BGR/BGRX/RGBX/XBGR/XRGB buffer, they could
+ * change these macros, rebuild libjpeg, and link their application statically
+ * with it.  In reality, few people ever did this, because there were some
+ * severe restrictions involved (cjpeg and djpeg no longer worked properly,
+ * compressing/decompressing RGB JPEGs no longer worked properly, and the color
+ * quantizer wouldn't work with pixel sizes other than 3.)  Further, since all
+ * of the O/S-supplied versions of libjpeg were built with the default values
+ * of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications have
+ * come to regard these values as immutable.
+ *
+ * The libjpeg-turbo colorspace extensions provide a much cleaner way of
+ * compressing from/decompressing to buffers with arbitrary component orders
+ * and pixel sizes.  Thus, we do not support changing the values of RGB_RED,
+ * RGB_GREEN, RGB_BLUE, or RGB_PIXELSIZE.  In addition to the restrictions
+ * listed above, changing these values will also break the SIMD extensions and
+ * the regression tests.
  */
 
-#define RGB_RED		0	/* Offset of Red in an RGB scanline element */
-#define RGB_GREEN	1	/* Offset of Green */
-#define RGB_BLUE	2	/* Offset of Blue */
-#define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */
+#define RGB_RED         0       /* Offset of Red in an RGB scanline element */
+#define RGB_GREEN       1       /* Offset of Green */
+#define RGB_BLUE        2       /* Offset of Blue */
+#define RGB_PIXELSIZE   3       /* JSAMPLEs per RGB scanline element */
 
 #define JPEG_NUMCS 17
 
@@ -394,7 +402,7 @@
 
 #ifndef MULTIPLIER
 #ifndef WITH_SIMD
-#define MULTIPLIER  int		/* type for fastest integer multiply */
+#define MULTIPLIER  int         /* type for fastest integer multiply */
 #else
 #define MULTIPLIER short  /* prefer 16-bit with SIMD for parellelism */
 #endif
@@ -404,17 +412,10 @@
 /* FAST_FLOAT should be either float or double, whichever is done faster
  * by your compiler.  (Note that this type is only used in the floating point
  * DCT routines, so it only matters if you've defined DCT_FLOAT_SUPPORTED.)
- * Typically, float is faster in ANSI C compilers, while double is faster in
- * pre-ANSI compilers (because they insist on converting to double anyway).
- * The code below therefore chooses float if we have ANSI-style prototypes.
  */
 
 #ifndef FAST_FLOAT
-#ifdef HAVE_PROTOTYPES
 #define FAST_FLOAT  float
-#else
-#define FAST_FLOAT  double
-#endif
 #endif
 
 #endif /* JPEG_INTERNAL_OPTIONS */
diff --git a/jpeg_nbits_table.h b/jpeg_nbits_table.h
new file mode 100644
index 0000000..fcf7387
--- /dev/null
+++ b/jpeg_nbits_table.h
@@ -0,0 +1,4098 @@
+static const unsigned char jpeg_nbits_table[65536] = {
+   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
+   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+};
diff --git a/jpegcomp.h b/jpegcomp.h
index ed9eeab..c39275b 100644
--- a/jpegcomp.h
+++ b/jpegcomp.h
@@ -2,7 +2,8 @@
  * jpegcomp.h
  *
  * Copyright (C) 2010, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * JPEG compatibility macros
  * These declarations are considered internal to the JPEG library; most
diff --git a/jpegint.h b/jpegint.h
index 7871748..c3b4320 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -1,10 +1,14 @@
 /*
  * jpegint.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015-2016, D. R. Commander
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file provides common declarations for the various JPEG modules.
  * These declarations are considered internal to the JPEG library; most
@@ -14,121 +18,127 @@
 
 /* Declarations for both compression & decompression */
 
-typedef enum {			/* Operating modes for buffer controllers */
-	JBUF_PASS_THRU,		/* Plain stripwise operation */
-	/* Remaining modes require a full-image buffer to have been created */
-	JBUF_SAVE_SOURCE,	/* Run source subobject only, save output */
-	JBUF_CRANK_DEST,	/* Run dest subobject only, using saved data */
-	JBUF_SAVE_AND_PASS	/* Run both subobjects, save output */
+typedef enum {            /* Operating modes for buffer controllers */
+  JBUF_PASS_THRU,         /* Plain stripwise operation */
+  /* Remaining modes require a full-image buffer to have been created */
+  JBUF_SAVE_SOURCE,       /* Run source subobject only, save output */
+  JBUF_CRANK_DEST,        /* Run dest subobject only, using saved data */
+  JBUF_SAVE_AND_PASS      /* Run both subobjects, save output */
 } J_BUF_MODE;
 
 /* Values of global_state field (jdapi.c has some dependencies on ordering!) */
-#define CSTATE_START	100	/* after create_compress */
-#define CSTATE_SCANNING	101	/* start_compress done, write_scanlines OK */
-#define CSTATE_RAW_OK	102	/* start_compress done, write_raw_data OK */
-#define CSTATE_WRCOEFS	103	/* jpeg_write_coefficients done */
-#define DSTATE_START	200	/* after create_decompress */
-#define DSTATE_INHEADER	201	/* reading header markers, no SOS yet */
-#define DSTATE_READY	202	/* found SOS, ready for start_decompress */
-#define DSTATE_PRELOAD	203	/* reading multiscan file in start_decompress*/
-#define DSTATE_PRESCAN	204	/* performing dummy pass for 2-pass quant */
-#define DSTATE_SCANNING	205	/* start_decompress done, read_scanlines OK */
-#define DSTATE_RAW_OK	206	/* start_decompress done, read_raw_data OK */
-#define DSTATE_BUFIMAGE	207	/* expecting jpeg_start_output */
-#define DSTATE_BUFPOST	208	/* looking for SOS/EOI in jpeg_finish_output */
-#define DSTATE_RDCOEFS	209	/* reading file in jpeg_read_coefficients */
-#define DSTATE_STOPPING	210	/* looking for EOI in jpeg_finish_decompress */
+#define CSTATE_START    100     /* after create_compress */
+#define CSTATE_SCANNING 101     /* start_compress done, write_scanlines OK */
+#define CSTATE_RAW_OK   102     /* start_compress done, write_raw_data OK */
+#define CSTATE_WRCOEFS  103     /* jpeg_write_coefficients done */
+#define DSTATE_START    200     /* after create_decompress */
+#define DSTATE_INHEADER 201     /* reading header markers, no SOS yet */
+#define DSTATE_READY    202     /* found SOS, ready for start_decompress */
+#define DSTATE_PRELOAD  203     /* reading multiscan file in start_decompress*/
+#define DSTATE_PRESCAN  204     /* performing dummy pass for 2-pass quant */
+#define DSTATE_SCANNING 205     /* start_decompress done, read_scanlines OK */
+#define DSTATE_RAW_OK   206     /* start_decompress done, read_raw_data OK */
+#define DSTATE_BUFIMAGE 207     /* expecting jpeg_start_output */
+#define DSTATE_BUFPOST  208     /* looking for SOS/EOI in jpeg_finish_output */
+#define DSTATE_RDCOEFS  209     /* reading file in jpeg_read_coefficients */
+#define DSTATE_STOPPING 210     /* looking for EOI in jpeg_finish_decompress */
+
+
+/* JLONG must hold at least signed 32-bit values. */
+typedef long JLONG;
+
+
+/*
+ * Left shift macro that handles a negative left operand without undefined
+ * behavior (or sanitizer warnings) by shifting the value as unsigned long
+ */
+
+#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
 
 
 /* Declarations for compression modules */
 
 /* Master control module */
 struct jpeg_comp_master {
-  JMETHOD(void, prepare_for_pass, (j_compress_ptr cinfo));
-  JMETHOD(void, pass_startup, (j_compress_ptr cinfo));
-  JMETHOD(void, finish_pass, (j_compress_ptr cinfo));
+  void (*prepare_for_pass) (j_compress_ptr cinfo);
+  void (*pass_startup) (j_compress_ptr cinfo);
+  void (*finish_pass) (j_compress_ptr cinfo);
 
   /* State variables made visible to other modules */
-  boolean call_pass_startup;	/* True if pass_startup must be called */
-  boolean is_last_pass;		/* True during last pass */
+  boolean call_pass_startup;    /* True if pass_startup must be called */
+  boolean is_last_pass;         /* True during last pass */
 };
 
 /* Main buffer control (downsampled-data buffer) */
 struct jpeg_c_main_controller {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode));
-  JMETHOD(void, process_data, (j_compress_ptr cinfo,
-			       JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-			       JDIMENSION in_rows_avail));
+  void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
+  void (*process_data) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                        JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail);
 };
 
 /* Compression preprocessing (downsampling input buffer control) */
 struct jpeg_c_prep_controller {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode));
-  JMETHOD(void, pre_process_data, (j_compress_ptr cinfo,
-				   JSAMPARRAY input_buf,
-				   JDIMENSION *in_row_ctr,
-				   JDIMENSION in_rows_avail,
-				   JSAMPIMAGE output_buf,
-				   JDIMENSION *out_row_group_ctr,
-				   JDIMENSION out_row_groups_avail));
+  void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
+  void (*pre_process_data) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                            JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+                            JSAMPIMAGE output_buf,
+                            JDIMENSION *out_row_group_ctr,
+                            JDIMENSION out_row_groups_avail);
 };
 
 /* Coefficient buffer control */
 struct jpeg_c_coef_controller {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode));
-  JMETHOD(boolean, compress_data, (j_compress_ptr cinfo,
-				   JSAMPIMAGE input_buf));
+  void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
+  boolean (*compress_data) (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 };
 
 /* Colorspace conversion */
 struct jpeg_color_converter {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
-  JMETHOD(void, color_convert, (j_compress_ptr cinfo,
-				JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-				JDIMENSION output_row, int num_rows));
+  void (*start_pass) (j_compress_ptr cinfo);
+  void (*color_convert) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+                         int num_rows);
 };
 
 /* Downsampling */
 struct jpeg_downsampler {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
-  JMETHOD(void, downsample, (j_compress_ptr cinfo,
-			     JSAMPIMAGE input_buf, JDIMENSION in_row_index,
-			     JSAMPIMAGE output_buf,
-			     JDIMENSION out_row_group_index));
+  void (*start_pass) (j_compress_ptr cinfo);
+  void (*downsample) (j_compress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+                      JDIMENSION out_row_group_index);
 
-  boolean need_context_rows;	/* TRUE if need rows above & below */
+  boolean need_context_rows;    /* TRUE if need rows above & below */
 };
 
 /* Forward DCT (also controls coefficient quantization) */
 struct jpeg_forward_dct {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
+  void (*start_pass) (j_compress_ptr cinfo);
   /* perhaps this should be an array??? */
-  JMETHOD(void, forward_DCT, (j_compress_ptr cinfo,
-			      jpeg_component_info * compptr,
-			      JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-			      JDIMENSION start_row, JDIMENSION start_col,
-			      JDIMENSION num_blocks));
+  void (*forward_DCT) (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+                       JDIMENSION start_row, JDIMENSION start_col,
+                       JDIMENSION num_blocks);
 };
 
 /* Entropy encoding */
 struct jpeg_entropy_encoder {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo, boolean gather_statistics));
-  JMETHOD(boolean, encode_mcu, (j_compress_ptr cinfo, JBLOCKROW *MCU_data));
-  JMETHOD(void, finish_pass, (j_compress_ptr cinfo));
+  void (*start_pass) (j_compress_ptr cinfo, boolean gather_statistics);
+  boolean (*encode_mcu) (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+  void (*finish_pass) (j_compress_ptr cinfo);
 };
 
 /* Marker writing */
 struct jpeg_marker_writer {
-  JMETHOD(void, write_file_header, (j_compress_ptr cinfo));
-  JMETHOD(void, write_frame_header, (j_compress_ptr cinfo));
-  JMETHOD(void, write_scan_header, (j_compress_ptr cinfo));
-  JMETHOD(void, write_file_trailer, (j_compress_ptr cinfo));
-  JMETHOD(void, write_tables_only, (j_compress_ptr cinfo));
+  void (*write_file_header) (j_compress_ptr cinfo);
+  void (*write_frame_header) (j_compress_ptr cinfo);
+  void (*write_scan_header) (j_compress_ptr cinfo);
+  void (*write_file_trailer) (j_compress_ptr cinfo);
+  void (*write_tables_only) (j_compress_ptr cinfo);
   /* These routines are exported to allow insertion of extra markers */
   /* Probably only COM and APPn markers should be written this way */
-  JMETHOD(void, write_marker_header, (j_compress_ptr cinfo, int marker,
-				      unsigned int datalen));
-  JMETHOD(void, write_marker_byte, (j_compress_ptr cinfo, int val));
+  void (*write_marker_header) (j_compress_ptr cinfo, int marker,
+                               unsigned int datalen);
+  void (*write_marker_byte) (j_compress_ptr cinfo, int val);
 };
 
 
@@ -136,138 +146,137 @@
 
 /* Master control module */
 struct jpeg_decomp_master {
-  JMETHOD(void, prepare_for_output_pass, (j_decompress_ptr cinfo));
-  JMETHOD(void, finish_output_pass, (j_decompress_ptr cinfo));
+  void (*prepare_for_output_pass) (j_decompress_ptr cinfo);
+  void (*finish_output_pass) (j_decompress_ptr cinfo);
 
   /* State variables made visible to other modules */
-  boolean is_dummy_pass;	/* True during 1st pass for 2-pass quant */
+  boolean is_dummy_pass;        /* True during 1st pass for 2-pass quant */
+
+  /* Partial decompression variables */
+  JDIMENSION first_iMCU_col;
+  JDIMENSION last_iMCU_col;
+  JDIMENSION first_MCU_col[MAX_COMPS_IN_SCAN];
+  JDIMENSION last_MCU_col[MAX_COMPS_IN_SCAN];
+  boolean jinit_upsampler_no_alloc;
 };
 
 /* Input control module */
 struct jpeg_input_controller {
-  JMETHOD(int, consume_input, (j_decompress_ptr cinfo));
-  JMETHOD(void, reset_input_controller, (j_decompress_ptr cinfo));
-  JMETHOD(void, start_input_pass, (j_decompress_ptr cinfo));
-  JMETHOD(void, finish_input_pass, (j_decompress_ptr cinfo));
+  int (*consume_input) (j_decompress_ptr cinfo);
+  void (*reset_input_controller) (j_decompress_ptr cinfo);
+  void (*start_input_pass) (j_decompress_ptr cinfo);
+  void (*finish_input_pass) (j_decompress_ptr cinfo);
 
   /* State variables made visible to other modules */
-  boolean has_multiple_scans;	/* True if file has multiple scans */
-  boolean eoi_reached;		/* True when EOI has been consumed */
+  boolean has_multiple_scans;   /* True if file has multiple scans */
+  boolean eoi_reached;          /* True when EOI has been consumed */
 };
 
 /* Main buffer control (downsampled-data buffer) */
 struct jpeg_d_main_controller {
-  JMETHOD(void, start_pass, (j_decompress_ptr cinfo, J_BUF_MODE pass_mode));
-  JMETHOD(void, process_data, (j_decompress_ptr cinfo,
-			       JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-			       JDIMENSION out_rows_avail));
+  void (*start_pass) (j_decompress_ptr cinfo, J_BUF_MODE pass_mode);
+  void (*process_data) (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                        JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
 };
 
 /* Coefficient buffer control */
 struct jpeg_d_coef_controller {
-  JMETHOD(void, start_input_pass, (j_decompress_ptr cinfo));
-  JMETHOD(int, consume_data, (j_decompress_ptr cinfo));
-  JMETHOD(void, start_output_pass, (j_decompress_ptr cinfo));
-  JMETHOD(int, decompress_data, (j_decompress_ptr cinfo,
-				 JSAMPIMAGE output_buf));
+  void (*start_input_pass) (j_decompress_ptr cinfo);
+  int (*consume_data) (j_decompress_ptr cinfo);
+  void (*start_output_pass) (j_decompress_ptr cinfo);
+  int (*decompress_data) (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
   /* Pointer to array of coefficient virtual arrays, or NULL if none */
   jvirt_barray_ptr *coef_arrays;
 };
 
 /* Decompression postprocessing (color quantization buffer control) */
 struct jpeg_d_post_controller {
-  JMETHOD(void, start_pass, (j_decompress_ptr cinfo, J_BUF_MODE pass_mode));
-  JMETHOD(void, post_process_data, (j_decompress_ptr cinfo,
-				    JSAMPIMAGE input_buf,
-				    JDIMENSION *in_row_group_ctr,
-				    JDIMENSION in_row_groups_avail,
-				    JSAMPARRAY output_buf,
-				    JDIMENSION *out_row_ctr,
-				    JDIMENSION out_rows_avail));
+  void (*start_pass) (j_decompress_ptr cinfo, J_BUF_MODE pass_mode);
+  void (*post_process_data) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION *in_row_group_ctr,
+                             JDIMENSION in_row_groups_avail,
+                             JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+                             JDIMENSION out_rows_avail);
 };
 
 /* Marker reading & parsing */
 struct jpeg_marker_reader {
-  JMETHOD(void, reset_marker_reader, (j_decompress_ptr cinfo));
+  void (*reset_marker_reader) (j_decompress_ptr cinfo);
   /* Read markers until SOS or EOI.
    * Returns same codes as are defined for jpeg_consume_input:
    * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
    */
-  JMETHOD(int, read_markers, (j_decompress_ptr cinfo));
+  int (*read_markers) (j_decompress_ptr cinfo);
   /* Read a restart marker --- exported for use by entropy decoder only */
   jpeg_marker_parser_method read_restart_marker;
 
   /* State of marker reader --- nominally internal, but applications
    * supplying COM or APPn handlers might like to know the state.
    */
-  boolean saw_SOI;		/* found SOI? */
-  boolean saw_SOF;		/* found SOF? */
-  int next_restart_num;		/* next restart number expected (0-7) */
-  unsigned int discarded_bytes;	/* # of bytes skipped looking for a marker */
+  boolean saw_SOI;              /* found SOI? */
+  boolean saw_SOF;              /* found SOF? */
+  int next_restart_num;         /* next restart number expected (0-7) */
+  unsigned int discarded_bytes; /* # of bytes skipped looking for a marker */
 };
 
 /* Entropy decoding */
 struct jpeg_entropy_decoder {
-  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
-  JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo,
-				JBLOCKROW *MCU_data));
+  void (*start_pass) (j_decompress_ptr cinfo);
+  boolean (*decode_mcu) (j_decompress_ptr cinfo, JBLOCKROW *MCU_data);
 
   /* This is here to share code between baseline and progressive decoders; */
   /* other modules probably should not use it */
-  boolean insufficient_data;	/* set TRUE after emitting warning */
+  boolean insufficient_data;    /* set TRUE after emitting warning */
 };
 
 /* Inverse DCT (also performs dequantization) */
-typedef JMETHOD(void, inverse_DCT_method_ptr,
-		(j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JCOEFPTR coef_block,
-		 JSAMPARRAY output_buf, JDIMENSION output_col));
+typedef void (*inverse_DCT_method_ptr) (j_decompress_ptr cinfo,
+                                        jpeg_component_info *compptr,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col);
 
 struct jpeg_inverse_dct {
-  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
+  void (*start_pass) (j_decompress_ptr cinfo);
   /* It is useful to allow each component to have a separate IDCT method. */
   inverse_DCT_method_ptr inverse_DCT[MAX_COMPONENTS];
 };
 
 /* Upsampling (note that upsampler must also call color converter) */
 struct jpeg_upsampler {
-  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
-  JMETHOD(void, upsample, (j_decompress_ptr cinfo,
-			   JSAMPIMAGE input_buf,
-			   JDIMENSION *in_row_group_ctr,
-			   JDIMENSION in_row_groups_avail,
-			   JSAMPARRAY output_buf,
-			   JDIMENSION *out_row_ctr,
-			   JDIMENSION out_rows_avail));
+  void (*start_pass) (j_decompress_ptr cinfo);
+  void (*upsample) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION *in_row_group_ctr,
+                    JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                    JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
 
-  boolean need_context_rows;	/* TRUE if need rows above & below */
+  boolean need_context_rows;    /* TRUE if need rows above & below */
 };
 
 /* Colorspace conversion */
 struct jpeg_color_deconverter {
-  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
-  JMETHOD(void, color_convert, (j_decompress_ptr cinfo,
-				JSAMPIMAGE input_buf, JDIMENSION input_row,
-				JSAMPARRAY output_buf, int num_rows));
+  void (*start_pass) (j_decompress_ptr cinfo);
+  void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows);
 };
 
 /* Color quantization or color precision reduction */
 struct jpeg_color_quantizer {
-  JMETHOD(void, start_pass, (j_decompress_ptr cinfo, boolean is_pre_scan));
-  JMETHOD(void, color_quantize, (j_decompress_ptr cinfo,
-				 JSAMPARRAY input_buf, JSAMPARRAY output_buf,
-				 int num_rows));
-  JMETHOD(void, finish_pass, (j_decompress_ptr cinfo));
-  JMETHOD(void, new_color_map, (j_decompress_ptr cinfo));
+  void (*start_pass) (j_decompress_ptr cinfo, boolean is_pre_scan);
+  void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                          JSAMPARRAY output_buf, int num_rows);
+  void (*finish_pass) (j_decompress_ptr cinfo);
+  void (*new_color_map) (j_decompress_ptr cinfo);
 };
 
 
 /* Miscellaneous useful macros */
 
 #undef MAX
-#define MAX(a,b)	((a) > (b) ? (a) : (b))
+#define MAX(a,b)        ((a) > (b) ? (a) : (b))
 #undef MIN
-#define MIN(a,b)	((a) < (b) ? (a) : (b))
+#define MIN(a,b)        ((a) < (b) ? (a) : (b))
 
 
 /* We assume that right shift corresponds to signed division by 2 with
@@ -275,126 +284,84 @@
  * shift" instructions that shift in copies of the sign bit.  But some
  * C compilers implement >> with an unsigned shift.  For these machines you
  * must define RIGHT_SHIFT_IS_UNSIGNED.
- * RIGHT_SHIFT provides a proper signed right shift of an INT32 quantity.
+ * RIGHT_SHIFT provides a proper signed right shift of a JLONG quantity.
  * It is only applied with constant shift counts.  SHIFT_TEMPS must be
  * included in the variables of any routine using RIGHT_SHIFT.
  */
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define SHIFT_TEMPS	INT32 shift_temp;
+#define SHIFT_TEMPS     JLONG shift_temp;
 #define RIGHT_SHIFT(x,shft)  \
-	((shift_temp = (x)) < 0 ? \
-	 (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \
-	 (shift_temp >> (shft)))
+        ((shift_temp = (x)) < 0 ? \
+         (shift_temp >> (shft)) | ((~((JLONG) 0)) << (32-(shft))) : \
+         (shift_temp >> (shft)))
 #else
 #define SHIFT_TEMPS
-#define RIGHT_SHIFT(x,shft)	((x) >> (shft))
+#define RIGHT_SHIFT(x,shft)     ((x) >> (shft))
 #endif
 
 
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jinit_compress_master	jICompress
-#define jinit_c_master_control	jICMaster
-#define jinit_c_main_controller	jICMainC
-#define jinit_c_prep_controller	jICPrepC
-#define jinit_c_coef_controller	jICCoefC
-#define jinit_color_converter	jICColor
-#define jinit_downsampler	jIDownsampler
-#define jinit_forward_dct	jIFDCT
-#define jinit_huff_encoder	jIHEncoder
-#define jinit_phuff_encoder	jIPHEncoder
-#define jinit_arith_encoder	jIAEncoder
-#define jinit_marker_writer	jIMWriter
-#define jinit_master_decompress	jIDMaster
-#define jinit_d_main_controller	jIDMainC
-#define jinit_d_coef_controller	jIDCoefC
-#define jinit_d_post_controller	jIDPostC
-#define jinit_input_controller	jIInCtlr
-#define jinit_marker_reader	jIMReader
-#define jinit_huff_decoder	jIHDecoder
-#define jinit_phuff_decoder	jIPHDecoder
-#define jinit_arith_decoder	jIADecoder
-#define jinit_inverse_dct	jIIDCT
-#define jinit_upsampler		jIUpsampler
-#define jinit_color_deconverter	jIDColor
-#define jinit_1pass_quantizer	jI1Quant
-#define jinit_2pass_quantizer	jI2Quant
-#define jinit_merged_upsampler	jIMUpsampler
-#define jinit_memory_mgr	jIMemMgr
-#define jdiv_round_up		jDivRound
-#define jround_up		jRound
-#define jcopy_sample_rows	jCopySamples
-#define jcopy_block_row		jCopyBlocks
-#define jzero_far		jZeroFar
-#define jpeg_zigzag_order	jZIGTable
-#define jpeg_natural_order	jZAGTable
-#define jpeg_aritab		jAriTab
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
 /* Compression module initialization routines */
-EXTERN(void) jinit_compress_master JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_c_master_control JPP((j_compress_ptr cinfo,
-					 boolean transcode_only));
-EXTERN(void) jinit_c_main_controller JPP((j_compress_ptr cinfo,
-					  boolean need_full_buffer));
-EXTERN(void) jinit_c_prep_controller JPP((j_compress_ptr cinfo,
-					  boolean need_full_buffer));
-EXTERN(void) jinit_c_coef_controller JPP((j_compress_ptr cinfo,
-					  boolean need_full_buffer));
-EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_compress_master (j_compress_ptr cinfo);
+EXTERN(void) jinit_c_master_control (j_compress_ptr cinfo,
+                                     boolean transcode_only);
+EXTERN(void) jinit_c_main_controller (j_compress_ptr cinfo,
+                                      boolean need_full_buffer);
+EXTERN(void) jinit_c_prep_controller (j_compress_ptr cinfo,
+                                      boolean need_full_buffer);
+EXTERN(void) jinit_c_coef_controller (j_compress_ptr cinfo,
+                                      boolean need_full_buffer);
+EXTERN(void) jinit_color_converter (j_compress_ptr cinfo);
+EXTERN(void) jinit_downsampler (j_compress_ptr cinfo);
+EXTERN(void) jinit_forward_dct (j_compress_ptr cinfo);
+EXTERN(void) jinit_huff_encoder (j_compress_ptr cinfo);
+EXTERN(void) jinit_phuff_encoder (j_compress_ptr cinfo);
+EXTERN(void) jinit_arith_encoder (j_compress_ptr cinfo);
+EXTERN(void) jinit_marker_writer (j_compress_ptr cinfo);
 /* Decompression module initialization routines */
-EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_d_main_controller JPP((j_decompress_ptr cinfo,
-					  boolean need_full_buffer));
-EXTERN(void) jinit_d_coef_controller JPP((j_decompress_ptr cinfo,
-					  boolean need_full_buffer));
-EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo,
-					  boolean need_full_buffer));
-EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_1pass_quantizer JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_2pass_quantizer JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_merged_upsampler JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_master_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jinit_d_main_controller (j_decompress_ptr cinfo,
+                                      boolean need_full_buffer);
+EXTERN(void) jinit_d_coef_controller (j_decompress_ptr cinfo,
+                                      boolean need_full_buffer);
+EXTERN(void) jinit_d_post_controller (j_decompress_ptr cinfo,
+                                      boolean need_full_buffer);
+EXTERN(void) jinit_input_controller (j_decompress_ptr cinfo);
+EXTERN(void) jinit_marker_reader (j_decompress_ptr cinfo);
+EXTERN(void) jinit_huff_decoder (j_decompress_ptr cinfo);
+EXTERN(void) jinit_phuff_decoder (j_decompress_ptr cinfo);
+EXTERN(void) jinit_arith_decoder (j_decompress_ptr cinfo);
+EXTERN(void) jinit_inverse_dct (j_decompress_ptr cinfo);
+EXTERN(void) jinit_upsampler (j_decompress_ptr cinfo);
+EXTERN(void) jinit_color_deconverter (j_decompress_ptr cinfo);
+EXTERN(void) jinit_1pass_quantizer (j_decompress_ptr cinfo);
+EXTERN(void) jinit_2pass_quantizer (j_decompress_ptr cinfo);
+EXTERN(void) jinit_merged_upsampler (j_decompress_ptr cinfo);
 /* Memory manager initialization */
-EXTERN(void) jinit_memory_mgr JPP((j_common_ptr cinfo));
+EXTERN(void) jinit_memory_mgr (j_common_ptr cinfo);
 
 /* Utility routines in jutils.c */
-EXTERN(long) jdiv_round_up JPP((long a, long b));
-EXTERN(long) jround_up JPP((long a, long b));
-EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row,
-				    JSAMPARRAY output_array, int dest_row,
-				    int num_rows, JDIMENSION num_cols));
-EXTERN(void) jcopy_block_row JPP((JBLOCKROW input_row, JBLOCKROW output_row,
-				  JDIMENSION num_blocks));
-EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero));
+EXTERN(long) jdiv_round_up (long a, long b);
+EXTERN(long) jround_up (long a, long b);
+EXTERN(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
+                                JSAMPARRAY output_array, int dest_row,
+                                int num_rows, JDIMENSION num_cols);
+EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
+                              JDIMENSION num_blocks);
+EXTERN(void) jzero_far (void *target, size_t bytestozero);
 /* Constant tables in jutils.c */
-#if 0				/* This table is not actually needed in v6a */
+#if 0                           /* This table is not actually needed in v6a */
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
 
 /* Arithmetic coding probability estimation tables in jaricom.c */
-extern const INT32 jpeg_aritab[];
+extern const JLONG jpeg_aritab[];
 
 /* Suppress undefined-structure complaints if necessary. */
 
 #ifdef INCOMPLETE_TYPES_BROKEN
-#ifndef AM_MEMORY_MANAGER	/* only jmemmgr.c defines these */
+#ifndef AM_MEMORY_MANAGER       /* only jmemmgr.c defines these */
 struct jvirt_sarray_control { long dummy; };
 struct jvirt_barray_control { long dummy; };
 #endif
diff --git a/jpeglib.h b/jpeglib.h
index 4906ec6..dd261fb 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -4,10 +4,11 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
- * Modifications:
- * Copyright (C) 2009-2011, 2013-2014, D. R. Commander.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file defines the application interface for the JPEG library.
  * Most applications using the library need only include this file,
@@ -28,10 +29,10 @@
  * manual configuration options that most people need not worry about.
  */
 
-#ifndef JCONFIG_INCLUDED	/* in case jinclude.h already did */
-#include "jconfig.h"		/* widely used configuration options */
+#ifndef JCONFIG_INCLUDED        /* in case jinclude.h already did */
+#include "jconfig.h"            /* widely used configuration options */
 #endif
-#include "jmorecfg.h"		/* seldom changed options */
+#include "jmorecfg.h"           /* seldom changed options */
 
 
 #ifdef __cplusplus
@@ -46,13 +47,13 @@
  * if you want to be compatible.
  */
 
-#define DCTSIZE		    8	/* The basic DCT block is 8x8 samples */
-#define DCTSIZE2	    64	/* DCTSIZE squared; # of elements in a block */
-#define NUM_QUANT_TBLS      4	/* Quantization tables are numbered 0..3 */
-#define NUM_HUFF_TBLS       4	/* Huffman tables are numbered 0..3 */
-#define NUM_ARITH_TBLS      16	/* Arith-coding tables are numbered 0..15 */
-#define MAX_COMPS_IN_SCAN   4	/* JPEG limit on # of components in one scan */
-#define MAX_SAMP_FACTOR     4	/* JPEG limit on sampling factors */
+#define DCTSIZE             8   /* The basic DCT block is 8x8 samples */
+#define DCTSIZE2            64  /* DCTSIZE squared; # of elements in a block */
+#define NUM_QUANT_TBLS      4   /* Quantization tables are numbered 0..3 */
+#define NUM_HUFF_TBLS       4   /* Huffman tables are numbered 0..3 */
+#define NUM_ARITH_TBLS      16  /* Arith-coding tables are numbered 0..15 */
+#define MAX_COMPS_IN_SCAN   4   /* JPEG limit on # of components in one scan */
+#define MAX_SAMP_FACTOR     4   /* JPEG limit on sampling factors */
 /* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard;
  * the PostScript DCT filter can emit files with many more than 10 blocks/MCU.
  * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU
@@ -67,20 +68,18 @@
 
 
 /* Data structures for images (arrays of samples and of DCT coefficients).
- * On 80x86 machines, the image arrays are too big for near pointers,
- * but the pointer arrays can fit in near memory.
  */
 
-typedef JSAMPLE FAR *JSAMPROW;	/* ptr to one image row of pixel samples. */
-typedef JSAMPROW *JSAMPARRAY;	/* ptr to some rows (a 2-D sample array) */
-typedef JSAMPARRAY *JSAMPIMAGE;	/* a 3-D sample array: top index is color */
+typedef JSAMPLE *JSAMPROW;      /* ptr to one image row of pixel samples. */
+typedef JSAMPROW *JSAMPARRAY;   /* ptr to some rows (a 2-D sample array) */
+typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */
 
-typedef JCOEF JBLOCK[DCTSIZE2];	/* one block of coefficients */
-typedef JBLOCK FAR *JBLOCKROW;	/* pointer to one row of coefficient blocks */
-typedef JBLOCKROW *JBLOCKARRAY;		/* a 2-D array of coefficient blocks */
-typedef JBLOCKARRAY *JBLOCKIMAGE;	/* a 3-D array of coefficient blocks */
+typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */
+typedef JBLOCK *JBLOCKROW;      /* pointer to one row of coefficient blocks */
+typedef JBLOCKROW *JBLOCKARRAY;         /* a 2-D array of coefficient blocks */
+typedef JBLOCKARRAY *JBLOCKIMAGE;       /* a 3-D array of coefficient blocks */
 
-typedef JCOEF FAR *JCOEFPTR;	/* useful in a couple of places */
+typedef JCOEF *JCOEFPTR;        /* useful in a couple of places */
 
 
 /* Types for JPEG compression parameters and working tables. */
@@ -93,13 +92,13 @@
    * (not the zigzag order in which they are stored in a JPEG DQT marker).
    * CAUTION: IJG versions prior to v6a kept this array in zigzag order.
    */
-  UINT16 quantval[DCTSIZE2];	/* quantization step for each coefficient */
+  UINT16 quantval[DCTSIZE2];    /* quantization step for each coefficient */
   /* This field is used only during compression.  It's initialized FALSE when
    * the table is created, and set TRUE when it's been output to the file.
    * You could suppress output of a table by setting this to TRUE.
    * (See jpeg_suppress_tables for an example.)
    */
-  boolean sent_table;		/* TRUE when table has been output */
+  boolean sent_table;           /* TRUE when table has been output */
 } JQUANT_TBL;
 
 
@@ -107,15 +106,15 @@
 
 typedef struct {
   /* These two fields directly represent the contents of a JPEG DHT marker */
-  UINT8 bits[17];		/* bits[k] = # of symbols with codes of */
-				/* length k bits; bits[0] is unused */
-  UINT8 huffval[256];		/* The symbols, in order of incr code length */
+  UINT8 bits[17];               /* bits[k] = # of symbols with codes of */
+                                /* length k bits; bits[0] is unused */
+  UINT8 huffval[256];           /* The symbols, in order of incr code length */
   /* This field is used only during compression.  It's initialized FALSE when
    * the table is created, and set TRUE when it's been output to the file.
    * You could suppress output of a table by setting this to TRUE.
    * (See jpeg_suppress_tables for an example.)
    */
-  boolean sent_table;		/* TRUE when table has been output */
+  boolean sent_table;           /* TRUE when table has been output */
 } JHUFF_TBL;
 
 
@@ -125,20 +124,20 @@
   /* These values are fixed over the whole image. */
   /* For compression, they must be supplied by parameter setup; */
   /* for decompression, they are read from the SOF marker. */
-  int component_id;		/* identifier for this component (0..255) */
-  int component_index;		/* its index in SOF or cinfo->comp_info[] */
-  int h_samp_factor;		/* horizontal sampling factor (1..4) */
-  int v_samp_factor;		/* vertical sampling factor (1..4) */
-  int quant_tbl_no;		/* quantization table selector (0..3) */
+  int component_id;             /* identifier for this component (0..255) */
+  int component_index;          /* its index in SOF or cinfo->comp_info[] */
+  int h_samp_factor;            /* horizontal sampling factor (1..4) */
+  int v_samp_factor;            /* vertical sampling factor (1..4) */
+  int quant_tbl_no;             /* quantization table selector (0..3) */
   /* These values may vary between scans. */
   /* For compression, they must be supplied by parameter setup; */
   /* for decompression, they are read from the SOS marker. */
   /* The decompressor output side may not use these variables. */
-  int dc_tbl_no;		/* DC entropy table selector (0..3) */
-  int ac_tbl_no;		/* AC entropy table selector (0..3) */
-  
+  int dc_tbl_no;                /* DC entropy table selector (0..3) */
+  int ac_tbl_no;                /* AC entropy table selector (0..3) */
+
   /* Remaining fields should be treated as private by applications. */
-  
+
   /* These values are computed during compression or decompression startup: */
   /* Component's size in DCT blocks.
    * Any dummy blocks added to complete an MCU are not counted; therefore
@@ -149,8 +148,8 @@
   /* Size of a DCT block in samples.  Always DCTSIZE for compression.
    * For decompression this is the size of the output from one DCT block,
    * reflecting any scaling we choose to apply during the IDCT step.
-   * Values of 1,2,4,8 are likely to be supported.  Note that different
-   * components may receive different IDCT scalings.
+   * Values from 1 to 16 are supported.
+   * Note that different components may receive different IDCT scalings.
    */
 #if JPEG_LIB_VERSION >= 70
   int DCT_h_scaled_size;
@@ -164,53 +163,53 @@
    * and similarly for height.  For decompression, IDCT scaling is included, so
    * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
    */
-  JDIMENSION downsampled_width;	 /* actual width in samples */
+  JDIMENSION downsampled_width;  /* actual width in samples */
   JDIMENSION downsampled_height; /* actual height in samples */
   /* This flag is used only for decompression.  In cases where some of the
    * components will be ignored (eg grayscale output from YCbCr image),
    * we can skip most computations for the unused components.
    */
-  boolean component_needed;	/* do we need the value of this component? */
+  boolean component_needed;     /* do we need the value of this component? */
 
   /* These values are computed before starting a scan of the component. */
   /* The decompressor output side may not use these variables. */
-  int MCU_width;		/* number of blocks per MCU, horizontally */
-  int MCU_height;		/* number of blocks per MCU, vertically */
-  int MCU_blocks;		/* MCU_width * MCU_height */
-  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
-  int last_col_width;		/* # of non-dummy blocks across in last MCU */
-  int last_row_height;		/* # of non-dummy blocks down in last MCU */
+  int MCU_width;                /* number of blocks per MCU, horizontally */
+  int MCU_height;               /* number of blocks per MCU, vertically */
+  int MCU_blocks;               /* MCU_width * MCU_height */
+  int MCU_sample_width;         /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
+  int last_col_width;           /* # of non-dummy blocks across in last MCU */
+  int last_row_height;          /* # of non-dummy blocks down in last MCU */
 
   /* Saved quantization table for component; NULL if none yet saved.
    * See jdinput.c comments about the need for this information.
    * This field is currently used only for decompression.
    */
-  JQUANT_TBL * quant_table;
+  JQUANT_TBL *quant_table;
 
   /* Private per-component storage for DCT or IDCT subsystem. */
-  void * dct_table;
+  void *dct_table;
 } jpeg_component_info;
 
 
 /* The script for encoding a multiple-scan file is an array of these: */
 
 typedef struct {
-  int comps_in_scan;		/* number of components encoded in this scan */
+  int comps_in_scan;            /* number of components encoded in this scan */
   int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */
-  int Ss, Se;			/* progressive JPEG spectral selection parms */
-  int Ah, Al;			/* progressive JPEG successive approx. parms */
+  int Ss, Se;                   /* progressive JPEG spectral selection parms */
+  int Ah, Al;                   /* progressive JPEG successive approx. parms */
 } jpeg_scan_info;
 
 /* The decompressor can save APPn and COM markers in a list of these: */
 
-typedef struct jpeg_marker_struct FAR * jpeg_saved_marker_ptr;
+typedef struct jpeg_marker_struct *jpeg_saved_marker_ptr;
 
 struct jpeg_marker_struct {
-  jpeg_saved_marker_ptr next;	/* next in list, or NULL */
-  UINT8 marker;			/* marker code: JPEG_COM, or JPEG_APP0+n */
-  unsigned int original_length;	/* # bytes of data in the file */
-  unsigned int data_length;	/* # bytes of data saved at data[] */
-  JOCTET FAR * data;		/* the data contained in the marker */
+  jpeg_saved_marker_ptr next;   /* next in list, or NULL */
+  UINT8 marker;                 /* marker code: JPEG_COM, or JPEG_APP0+n */
+  unsigned int original_length; /* # bytes of data in the file */
+  unsigned int data_length;     /* # bytes of data saved at data[] */
+  JOCTET *data;                 /* the data contained in the marker */
   /* the marker length word is not counted in data_length or original_length */
 };
 
@@ -220,103 +219,102 @@
 #define JCS_ALPHA_EXTENSIONS 1
 
 typedef enum {
-	JCS_UNKNOWN,		/* error/unspecified */
-	JCS_GRAYSCALE,		/* monochrome */
-	JCS_RGB,		/* red/green/blue as specified by the RGB_RED, RGB_GREEN,
-				   RGB_BLUE, and RGB_PIXELSIZE macros */
-	JCS_YCbCr,		/* Y/Cb/Cr (also known as YUV) */
-	JCS_CMYK,		/* C/M/Y/K */
-	JCS_YCCK,		/* Y/Cb/Cr/K */
-	JCS_EXT_RGB,		/* red/green/blue */
-	JCS_EXT_RGBX,		/* red/green/blue/x */
-	JCS_EXT_BGR,		/* blue/green/red */
-	JCS_EXT_BGRX,		/* blue/green/red/x */
-	JCS_EXT_XBGR,		/* x/blue/green/red */
-	JCS_EXT_XRGB,		/* x/red/green/blue */
-	/* When out_color_space it set to JCS_EXT_RGBX, JCS_EXT_BGRX,
-	   JCS_EXT_XBGR, or JCS_EXT_XRGB during decompression, the X byte is
-	   undefined, and in order to ensure the best performance,
-	   libjpeg-turbo can set that byte to whatever value it wishes.  Use
-	   the following colorspace constants to ensure that the X byte is set
-	   to 0xFF, so that it can be interpreted as an opaque alpha
-	   channel. */
-	JCS_EXT_RGBA,		/* red/green/blue/alpha */
-	JCS_EXT_BGRA,		/* blue/green/red/alpha */
-	JCS_EXT_ABGR,		/* alpha/blue/green/red */
-	JCS_EXT_ARGB,		/* alpha/red/green/blue */
-  JCS_RGB565      /* 5-bit red/6-bit green/5-bit blue */
+  JCS_UNKNOWN,            /* error/unspecified */
+  JCS_GRAYSCALE,          /* monochrome */
+  JCS_RGB,                /* red/green/blue as specified by the RGB_RED,
+                             RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros */
+  JCS_YCbCr,              /* Y/Cb/Cr (also known as YUV) */
+  JCS_CMYK,               /* C/M/Y/K */
+  JCS_YCCK,               /* Y/Cb/Cr/K */
+  JCS_EXT_RGB,            /* red/green/blue */
+  JCS_EXT_RGBX,           /* red/green/blue/x */
+  JCS_EXT_BGR,            /* blue/green/red */
+  JCS_EXT_BGRX,           /* blue/green/red/x */
+  JCS_EXT_XBGR,           /* x/blue/green/red */
+  JCS_EXT_XRGB,           /* x/red/green/blue */
+  /* When out_color_space is set to JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+     or JCS_EXT_XRGB during decompression, the X byte is undefined, and in
+     order to ensure the best performance, libjpeg-turbo can set that byte to
+     whatever value it wishes.  Use the following colorspace constants to
+     ensure that the X byte is set to 0xFF, so that it can be interpreted as an
+     opaque alpha channel. */
+  JCS_EXT_RGBA,           /* red/green/blue/alpha */
+  JCS_EXT_BGRA,           /* blue/green/red/alpha */
+  JCS_EXT_ABGR,           /* alpha/blue/green/red */
+  JCS_EXT_ARGB,           /* alpha/red/green/blue */
+  JCS_RGB565              /* 5-bit red/6-bit green/5-bit blue */
 } J_COLOR_SPACE;
 
 /* DCT/IDCT algorithm options. */
 
 typedef enum {
-	JDCT_ISLOW,		/* slow but accurate integer algorithm */
-	JDCT_IFAST,		/* faster, less accurate integer method */
-	JDCT_FLOAT		/* floating-point: accurate, fast on fast HW */
+  JDCT_ISLOW,             /* slow but accurate integer algorithm */
+  JDCT_IFAST,             /* faster, less accurate integer method */
+  JDCT_FLOAT              /* floating-point: accurate, fast on fast HW */
 } J_DCT_METHOD;
 
-#ifndef JDCT_DEFAULT		/* may be overridden in jconfig.h */
+#ifndef JDCT_DEFAULT            /* may be overridden in jconfig.h */
 #define JDCT_DEFAULT  JDCT_ISLOW
 #endif
-#ifndef JDCT_FASTEST		/* may be overridden in jconfig.h */
+#ifndef JDCT_FASTEST            /* may be overridden in jconfig.h */
 #define JDCT_FASTEST  JDCT_IFAST
 #endif
 
 /* Dithering options for decompression. */
 
 typedef enum {
-	JDITHER_NONE,		/* no dithering */
-	JDITHER_ORDERED,	/* simple ordered dither */
-	JDITHER_FS		/* Floyd-Steinberg error diffusion dither */
+  JDITHER_NONE,           /* no dithering */
+  JDITHER_ORDERED,        /* simple ordered dither */
+  JDITHER_FS              /* Floyd-Steinberg error diffusion dither */
 } J_DITHER_MODE;
 
 
 /* Common fields between JPEG compression and decompression master structs. */
 
 #define jpeg_common_fields \
-  struct jpeg_error_mgr * err;	/* Error handler module */\
-  struct jpeg_memory_mgr * mem;	/* Memory manager module */\
-  struct jpeg_progress_mgr * progress; /* Progress monitor, or NULL if none */\
-  void * client_data;		/* Available for use by application */\
-  boolean is_decompressor;	/* So common code can tell which is which */\
-  int global_state		/* For checking call sequence validity */
+  struct jpeg_error_mgr *err;   /* Error handler module */\
+  struct jpeg_memory_mgr *mem;  /* Memory manager module */\
+  struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\
+  void *client_data;            /* Available for use by application */\
+  boolean is_decompressor;      /* So common code can tell which is which */\
+  int global_state              /* For checking call sequence validity */
 
 /* Routines that are to be used by both halves of the library are declared
  * to receive a pointer to this structure.  There are no actual instances of
  * jpeg_common_struct, only of jpeg_compress_struct and jpeg_decompress_struct.
  */
 struct jpeg_common_struct {
-  jpeg_common_fields;		/* Fields common to both master struct types */
+  jpeg_common_fields;           /* Fields common to both master struct types */
   /* Additional fields follow in an actual jpeg_compress_struct or
    * jpeg_decompress_struct.  All three structs must agree on these
    * initial fields!  (This would be a lot cleaner in C++.)
    */
 };
 
-typedef struct jpeg_common_struct * j_common_ptr;
-typedef struct jpeg_compress_struct * j_compress_ptr;
-typedef struct jpeg_decompress_struct * j_decompress_ptr;
+typedef struct jpeg_common_struct *j_common_ptr;
+typedef struct jpeg_compress_struct *j_compress_ptr;
+typedef struct jpeg_decompress_struct *j_decompress_ptr;
 
 
 /* Master record for a compression instance */
 
 struct jpeg_compress_struct {
-  jpeg_common_fields;		/* Fields shared with jpeg_decompress_struct */
+  jpeg_common_fields;           /* Fields shared with jpeg_decompress_struct */
 
   /* Destination for compressed data */
-  struct jpeg_destination_mgr * dest;
+  struct jpeg_destination_mgr *dest;
 
   /* Description of source image --- these fields must be filled in by
    * outer application before starting compression.  in_color_space must
    * be correct before you can even call jpeg_set_defaults().
    */
 
-  JDIMENSION image_width;	/* input image width */
-  JDIMENSION image_height;	/* input image height */
-  int input_components;		/* # of color components in input image */
-  J_COLOR_SPACE in_color_space;	/* colorspace of input image */
+  JDIMENSION image_width;       /* input image width */
+  JDIMENSION image_height;      /* input image height */
+  int input_components;         /* # of color components in input image */
+  J_COLOR_SPACE in_color_space; /* colorspace of input image */
 
-  double input_gamma;		/* image gamma of input image */
+  double input_gamma;           /* image gamma of input image */
 
   /* Compression parameters --- these fields must be set before calling
    * jpeg_start_compress().  We recommend calling jpeg_set_defaults() to
@@ -329,8 +327,8 @@
 #if JPEG_LIB_VERSION >= 70
   unsigned int scale_num, scale_denom; /* fraction by which to scale image */
 
-  JDIMENSION jpeg_width;	/* scaled JPEG image width */
-  JDIMENSION jpeg_height;	/* scaled JPEG image height */
+  JDIMENSION jpeg_width;        /* scaled JPEG image width */
+  JDIMENSION jpeg_height;       /* scaled JPEG image height */
   /* Dimensions of actual JPEG image that will be written to file,
    * derived from input dimensions by scaling factors above.
    * These fields are computed by jpeg_start_compress().
@@ -339,15 +337,15 @@
    */
 #endif
 
-  int data_precision;		/* bits of precision in image data */
+  int data_precision;           /* bits of precision in image data */
 
-  int num_components;		/* # of color components in JPEG image */
+  int num_components;           /* # of color components in JPEG image */
   J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
 
-  jpeg_component_info * comp_info;
+  jpeg_component_info *comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
-  JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
+  JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
 #if JPEG_LIB_VERSION >= 70
   int q_scale_factor[NUM_QUANT_TBLS];
 #endif
@@ -355,30 +353,30 @@
    * and corresponding scale factors (percentage, initialized 100).
    */
 
-  JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
-  JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
 
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
   UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
 
-  int num_scans;		/* # of entries in scan_info array */
-  const jpeg_scan_info * scan_info; /* script for multi-scan file, or NULL */
+  int num_scans;                /* # of entries in scan_info array */
+  const jpeg_scan_info *scan_info; /* script for multi-scan file, or NULL */
   /* The default value of scan_info is NULL, which causes a single-scan
    * sequential JPEG file to be emitted.  To create a multi-scan file,
    * set num_scans and scan_info to point to an array of scan definitions.
    */
 
-  boolean raw_data_in;		/* TRUE=caller supplies downsampled data */
-  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
-  boolean optimize_coding;	/* TRUE=optimize entropy encoding parms */
-  boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+  boolean raw_data_in;          /* TRUE=caller supplies downsampled data */
+  boolean arith_code;           /* TRUE=arithmetic coding, FALSE=Huffman */
+  boolean optimize_coding;      /* TRUE=optimize entropy encoding parms */
+  boolean CCIR601_sampling;     /* TRUE=first samples are cosited */
 #if JPEG_LIB_VERSION >= 70
   boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
 #endif
-  int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
-  J_DCT_METHOD dct_method;	/* DCT algorithm selector */
+  int smoothing_factor;         /* 1..100, or 0 for no input smoothing */
+  J_DCT_METHOD dct_method;      /* DCT algorithm selector */
 
   /* The restart interval can be specified in absolute MCUs by setting
    * restart_interval, or in MCU rows by setting restart_in_rows
@@ -386,28 +384,28 @@
    * for each scan).
    */
   unsigned int restart_interval; /* MCUs per restart, or 0 for no restart */
-  int restart_in_rows;		/* if > 0, MCU rows per restart interval */
+  int restart_in_rows;          /* if > 0, MCU rows per restart interval */
 
   /* Parameters controlling emission of special markers. */
 
-  boolean write_JFIF_header;	/* should a JFIF marker be written? */
-  UINT8 JFIF_major_version;	/* What to write for the JFIF version number */
+  boolean write_JFIF_header;    /* should a JFIF marker be written? */
+  UINT8 JFIF_major_version;     /* What to write for the JFIF version number */
   UINT8 JFIF_minor_version;
   /* These three values are not used by the JPEG code, merely copied */
   /* into the JFIF APP0 marker.  density_unit can be 0 for unknown, */
   /* 1 for dots/inch, or 2 for dots/cm.  Note that the pixel aspect */
   /* ratio is defined by X_density/Y_density even when density_unit=0. */
-  UINT8 density_unit;		/* JFIF code for pixel size units */
-  UINT16 X_density;		/* Horizontal pixel density */
-  UINT16 Y_density;		/* Vertical pixel density */
-  boolean write_Adobe_marker;	/* should an Adobe marker be written? */
-  
+  UINT8 density_unit;           /* JFIF code for pixel size units */
+  UINT16 X_density;             /* Horizontal pixel density */
+  UINT16 Y_density;             /* Vertical pixel density */
+  boolean write_Adobe_marker;   /* should an Adobe marker be written? */
+
   /* State variable: index of next scanline to be written to
    * jpeg_write_scanlines().  Application may use this to control its
    * processing loop, e.g., "while (next_scanline < image_height)".
    */
 
-  JDIMENSION next_scanline;	/* 0 .. image_height-1  */
+  JDIMENSION next_scanline;     /* 0 .. image_height-1  */
 
   /* Remaining fields are known throughout compressor, but generally
    * should not be touched by a surrounding application.
@@ -416,59 +414,59 @@
   /*
    * These fields are computed during compression startup
    */
-  boolean progressive_mode;	/* TRUE if scan script uses progressive mode */
-  int max_h_samp_factor;	/* largest h_samp_factor */
-  int max_v_samp_factor;	/* largest v_samp_factor */
+  boolean progressive_mode;     /* TRUE if scan script uses progressive mode */
+  int max_h_samp_factor;        /* largest h_samp_factor */
+  int max_v_samp_factor;        /* largest v_samp_factor */
 
 #if JPEG_LIB_VERSION >= 70
-  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
-  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+  int min_DCT_h_scaled_size;    /* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;    /* smallest DCT_v_scaled_size of any component */
 #endif
 
-  JDIMENSION total_iMCU_rows;	/* # of iMCU rows to be input to coef ctlr */
+  JDIMENSION total_iMCU_rows;   /* # of iMCU rows to be input to coef ctlr */
   /* The coefficient controller receives data in units of MCU rows as defined
    * for fully interleaved scans (whether the JPEG file is interleaved or not).
    * There are v_samp_factor * DCTSIZE sample rows of each component in an
    * "iMCU" (interleaved MCU) row.
    */
-  
+
   /*
    * These fields are valid during any one scan.
    * They describe the components and MCUs actually appearing in the scan.
    */
-  int comps_in_scan;		/* # of JPEG components in this scan */
-  jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN];
+  int comps_in_scan;            /* # of JPEG components in this scan */
+  jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
   /* *cur_comp_info[i] describes component that appears i'th in SOS */
-  
-  JDIMENSION MCUs_per_row;	/* # of MCUs across the image */
-  JDIMENSION MCU_rows_in_scan;	/* # of MCU rows in the image */
-  
-  int blocks_in_MCU;		/* # of DCT blocks per MCU */
+
+  JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
+  JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
+
+  int blocks_in_MCU;            /* # of DCT blocks per MCU */
   int MCU_membership[C_MAX_BLOCKS_IN_MCU];
   /* MCU_membership[i] is index in cur_comp_info of component owning */
   /* i'th block in an MCU */
 
-  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
+  int Ss, Se, Ah, Al;           /* progressive JPEG parameters for scan */
 
 #if JPEG_LIB_VERSION >= 80
-  int block_size;		/* the basic DCT block size: 1..16 */
-  const int * natural_order;	/* natural-order position array */
-  int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+  int block_size;               /* the basic DCT block size: 1..16 */
+  const int *natural_order;     /* natural-order position array */
+  int lim_Se;                   /* min( Se, DCTSIZE2-1 ) */
 #endif
 
   /*
    * Links to compression subobjects (methods and private variables of modules)
    */
-  struct jpeg_comp_master * master;
-  struct jpeg_c_main_controller * main;
-  struct jpeg_c_prep_controller * prep;
-  struct jpeg_c_coef_controller * coef;
-  struct jpeg_marker_writer * marker;
-  struct jpeg_color_converter * cconvert;
-  struct jpeg_downsampler * downsample;
-  struct jpeg_forward_dct * fdct;
-  struct jpeg_entropy_encoder * entropy;
-  jpeg_scan_info * script_space; /* workspace for jpeg_simple_progression */
+  struct jpeg_comp_master *master;
+  struct jpeg_c_main_controller *main;
+  struct jpeg_c_prep_controller *prep;
+  struct jpeg_c_coef_controller *coef;
+  struct jpeg_marker_writer *marker;
+  struct jpeg_color_converter *cconvert;
+  struct jpeg_downsampler *downsample;
+  struct jpeg_forward_dct *fdct;
+  struct jpeg_entropy_encoder *entropy;
+  jpeg_scan_info *script_space; /* workspace for jpeg_simple_progression */
   int script_space_size;
 };
 
@@ -476,17 +474,17 @@
 /* Master record for a decompression instance */
 
 struct jpeg_decompress_struct {
-  jpeg_common_fields;		/* Fields shared with jpeg_compress_struct */
+  jpeg_common_fields;           /* Fields shared with jpeg_compress_struct */
 
   /* Source of compressed data */
-  struct jpeg_source_mgr * src;
+  struct jpeg_source_mgr *src;
 
   /* Basic description of image --- filled in by jpeg_read_header(). */
   /* Application may inspect these values to decide how to process image. */
 
-  JDIMENSION image_width;	/* nominal image width (from SOF marker) */
-  JDIMENSION image_height;	/* nominal image height */
-  int num_components;		/* # of color components in JPEG image */
+  JDIMENSION image_width;       /* nominal image width (from SOF marker) */
+  JDIMENSION image_height;      /* nominal image height */
+  int num_components;           /* # of color components in JPEG image */
   J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
 
   /* Decompression processing parameters --- these fields must be set before
@@ -498,24 +496,24 @@
 
   unsigned int scale_num, scale_denom; /* fraction by which to scale image */
 
-  double output_gamma;		/* image gamma wanted in output */
+  double output_gamma;          /* image gamma wanted in output */
 
-  boolean buffered_image;	/* TRUE=multiple output passes */
-  boolean raw_data_out;		/* TRUE=downsampled data wanted */
+  boolean buffered_image;       /* TRUE=multiple output passes */
+  boolean raw_data_out;         /* TRUE=downsampled data wanted */
 
-  J_DCT_METHOD dct_method;	/* IDCT algorithm selector */
-  boolean do_fancy_upsampling;	/* TRUE=apply fancy upsampling */
-  boolean do_block_smoothing;	/* TRUE=apply interblock smoothing */
+  J_DCT_METHOD dct_method;      /* IDCT algorithm selector */
+  boolean do_fancy_upsampling;  /* TRUE=apply fancy upsampling */
+  boolean do_block_smoothing;   /* TRUE=apply interblock smoothing */
 
-  boolean quantize_colors;	/* TRUE=colormapped output wanted */
+  boolean quantize_colors;      /* TRUE=colormapped output wanted */
   /* the following are ignored if not quantize_colors: */
-  J_DITHER_MODE dither_mode;	/* type of color dithering to use */
-  boolean two_pass_quantize;	/* TRUE=use two-pass color quantization */
-  int desired_number_of_colors;	/* max # colors to use in created colormap */
+  J_DITHER_MODE dither_mode;    /* type of color dithering to use */
+  boolean two_pass_quantize;    /* TRUE=use two-pass color quantization */
+  int desired_number_of_colors; /* max # colors to use in created colormap */
   /* these are significant only in buffered-image mode: */
-  boolean enable_1pass_quant;	/* enable future use of 1-pass quantizer */
+  boolean enable_1pass_quant;   /* enable future use of 1-pass quantizer */
   boolean enable_external_quant;/* enable future use of external colormap */
-  boolean enable_2pass_quant;	/* enable future use of 2-pass quantizer */
+  boolean enable_2pass_quant;   /* enable future use of 2-pass quantizer */
 
   /* Description of actual output image that will be returned to application.
    * These fields are computed by jpeg_start_decompress().
@@ -523,14 +521,14 @@
    * in advance of calling jpeg_start_decompress().
    */
 
-  JDIMENSION output_width;	/* scaled image width */
-  JDIMENSION output_height;	/* scaled image height */
-  int out_color_components;	/* # of color components in out_color_space */
-  int output_components;	/* # of color components returned */
+  JDIMENSION output_width;      /* scaled image width */
+  JDIMENSION output_height;     /* scaled image height */
+  int out_color_components;     /* # of color components in out_color_space */
+  int output_components;        /* # of color components returned */
   /* output_components is 1 (a colormap index) when quantizing colors;
    * otherwise it equals out_color_components.
    */
-  int rec_outbuf_height;	/* min recommended height of scanline buffer */
+  int rec_outbuf_height;        /* min recommended height of scanline buffer */
   /* If the buffer passed to jpeg_read_scanlines() is less than this many rows
    * high, space and time will be wasted due to unnecessary data copying.
    * Usually rec_outbuf_height will be 1 or 2, at most 4.
@@ -542,8 +540,8 @@
    * jpeg_start_decompress or jpeg_start_output.
    * The map has out_color_components rows and actual_number_of_colors columns.
    */
-  int actual_number_of_colors;	/* number of entries in use */
-  JSAMPARRAY colormap;		/* The color map as a 2-D pixel array */
+  int actual_number_of_colors;  /* number of entries in use */
+  JSAMPARRAY colormap;          /* The color map as a 2-D pixel array */
 
   /* State variables: these variables indicate the progress of decompression.
    * The application may examine these but must not modify them.
@@ -553,20 +551,20 @@
    * Application may use this to control its processing loop, e.g.,
    * "while (output_scanline < output_height)".
    */
-  JDIMENSION output_scanline;	/* 0 .. output_height-1  */
+  JDIMENSION output_scanline;   /* 0 .. output_height-1  */
 
   /* Current input scan number and number of iMCU rows completed in scan.
    * These indicate the progress of the decompressor input side.
    */
-  int input_scan_number;	/* Number of SOS markers seen so far */
-  JDIMENSION input_iMCU_row;	/* Number of iMCU rows completed */
+  int input_scan_number;        /* Number of SOS markers seen so far */
+  JDIMENSION input_iMCU_row;    /* Number of iMCU rows completed */
 
   /* The "output scan number" is the notional scan being displayed by the
    * output side.  The decompressor will not allow output scan/row number
    * to get ahead of input scan/row, but it can fall arbitrarily far behind.
    */
-  int output_scan_number;	/* Nominal scan number being displayed */
-  JDIMENSION output_iMCU_row;	/* Number of iMCU rows read */
+  int output_scan_number;       /* Nominal scan number being displayed */
+  JDIMENSION output_iMCU_row;   /* Number of iMCU rows read */
 
   /* Current progression status.  coef_bits[c][i] indicates the precision
    * with which component c's DCT coefficient i (in zigzag order) is known.
@@ -575,7 +573,7 @@
    * (thus, 0 at completion of the progression).
    * This pointer is NULL when reading a non-progressive file.
    */
-  int (*coef_bits)[DCTSIZE2];	/* -1 or current Al value for each coef */
+  int (*coef_bits)[DCTSIZE2];   /* -1 or current Al value for each coef */
 
   /* Internal JPEG parameters --- the application usually need not look at
    * these fields.  Note that the decompressor output side may not use
@@ -586,27 +584,27 @@
    * datastreams when processing abbreviated JPEG datastreams.
    */
 
-  JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
+  JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
   /* ptrs to coefficient quantization tables, or NULL if not defined */
 
-  JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
-  JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
 
   /* These parameters are never carried across datastreams, since they
    * are given in SOF/SOS markers or defined to be reset by SOI.
    */
 
-  int data_precision;		/* bits of precision in image data */
+  int data_precision;           /* bits of precision in image data */
 
-  jpeg_component_info * comp_info;
+  jpeg_component_info *comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
 #if JPEG_LIB_VERSION >= 80
-  boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
+  boolean is_baseline;          /* TRUE if Baseline SOF0 encountered */
 #endif
-  boolean progressive_mode;	/* TRUE if SOFn specifies progressive mode */
-  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
+  boolean progressive_mode;     /* TRUE if SOFn specifies progressive mode */
+  boolean arith_code;           /* TRUE=arithmetic coding, FALSE=Huffman */
 
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
@@ -617,17 +615,17 @@
   /* These fields record data obtained from optional markers recognized by
    * the JPEG library.
    */
-  boolean saw_JFIF_marker;	/* TRUE iff a JFIF APP0 marker was found */
+  boolean saw_JFIF_marker;      /* TRUE iff a JFIF APP0 marker was found */
   /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */
-  UINT8 JFIF_major_version;	/* JFIF version number */
+  UINT8 JFIF_major_version;     /* JFIF version number */
   UINT8 JFIF_minor_version;
-  UINT8 density_unit;		/* JFIF code for pixel size units */
-  UINT16 X_density;		/* Horizontal pixel density */
-  UINT16 Y_density;		/* Vertical pixel density */
-  boolean saw_Adobe_marker;	/* TRUE iff an Adobe APP14 marker was found */
-  UINT8 Adobe_transform;	/* Color transform code from Adobe marker */
+  UINT8 density_unit;           /* JFIF code for pixel size units */
+  UINT16 X_density;             /* Horizontal pixel density */
+  UINT16 Y_density;             /* Vertical pixel density */
+  boolean saw_Adobe_marker;     /* TRUE iff an Adobe APP14 marker was found */
+  UINT8 Adobe_transform;        /* Color transform code from Adobe marker */
 
-  boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+  boolean CCIR601_sampling;     /* TRUE=first samples are cosited */
 
   /* Aside from the specific data retained from APPn markers known to the
    * library, the uninterpreted contents of any or all APPn and COM markers
@@ -642,17 +640,17 @@
   /*
    * These fields are computed during decompression startup
    */
-  int max_h_samp_factor;	/* largest h_samp_factor */
-  int max_v_samp_factor;	/* largest v_samp_factor */
+  int max_h_samp_factor;        /* largest h_samp_factor */
+  int max_v_samp_factor;        /* largest v_samp_factor */
 
 #if JPEG_LIB_VERSION >= 70
-  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
-  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+  int min_DCT_h_scaled_size;    /* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;    /* smallest DCT_v_scaled_size of any component */
 #else
-  int min_DCT_scaled_size;	/* smallest DCT_scaled_size of any component */
+  int min_DCT_scaled_size;      /* smallest DCT_scaled_size of any component */
 #endif
 
-  JDIMENSION total_iMCU_rows;	/* # of iMCU rows in image */
+  JDIMENSION total_iMCU_rows;   /* # of iMCU rows in image */
   /* The coefficient controller's input and output progress is measured in
    * units of "iMCU" (interleaved MCU) rows.  These are the same as MCU rows
    * in fully interleaved JPEG scans, but are used whether the scan is
@@ -661,33 +659,33 @@
    * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
    */
 
-  JSAMPLE * sample_range_limit; /* table for fast range-limiting */
+  JSAMPLE *sample_range_limit;  /* table for fast range-limiting */
 
   /*
    * These fields are valid during any one scan.
    * They describe the components and MCUs actually appearing in the scan.
    * Note that the decompressor output side must not use these fields.
    */
-  int comps_in_scan;		/* # of JPEG components in this scan */
-  jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN];
+  int comps_in_scan;            /* # of JPEG components in this scan */
+  jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
   /* *cur_comp_info[i] describes component that appears i'th in SOS */
 
-  JDIMENSION MCUs_per_row;	/* # of MCUs across the image */
-  JDIMENSION MCU_rows_in_scan;	/* # of MCU rows in the image */
+  JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
+  JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
 
-  int blocks_in_MCU;		/* # of DCT blocks per MCU */
+  int blocks_in_MCU;            /* # of DCT blocks per MCU */
   int MCU_membership[D_MAX_BLOCKS_IN_MCU];
   /* MCU_membership[i] is index in cur_comp_info of component owning */
   /* i'th block in an MCU */
 
-  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
+  int Ss, Se, Ah, Al;           /* progressive JPEG parameters for scan */
 
 #if JPEG_LIB_VERSION >= 80
   /* These fields are derived from Se of first SOS marker.
    */
-  int block_size;		/* the basic DCT block size: 1..16 */
-  const int * natural_order; /* natural-order position array for entropy decode */
-  int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+  int block_size;               /* the basic DCT block size: 1..16 */
+  const int *natural_order; /* natural-order position array for entropy decode */
+  int lim_Se;                   /* min( Se, DCTSIZE2-1 ) for entropy decode */
 #endif
 
   /* This field is shared between entropy decoder and marker parser.
@@ -699,17 +697,17 @@
   /*
    * Links to decompression subobjects (methods, private variables of modules)
    */
-  struct jpeg_decomp_master * master;
-  struct jpeg_d_main_controller * main;
-  struct jpeg_d_coef_controller * coef;
-  struct jpeg_d_post_controller * post;
-  struct jpeg_input_controller * inputctl;
-  struct jpeg_marker_reader * marker;
-  struct jpeg_entropy_decoder * entropy;
-  struct jpeg_inverse_dct * idct;
-  struct jpeg_upsampler * upsample;
-  struct jpeg_color_deconverter * cconvert;
-  struct jpeg_color_quantizer * cquantize;
+  struct jpeg_decomp_master *master;
+  struct jpeg_d_main_controller *main;
+  struct jpeg_d_coef_controller *coef;
+  struct jpeg_d_post_controller *post;
+  struct jpeg_input_controller *inputctl;
+  struct jpeg_marker_reader *marker;
+  struct jpeg_entropy_decoder *entropy;
+  struct jpeg_inverse_dct *idct;
+  struct jpeg_upsampler *upsample;
+  struct jpeg_color_deconverter *cconvert;
+  struct jpeg_color_quantizer *cquantize;
 };
 
 
@@ -725,17 +723,17 @@
 
 struct jpeg_error_mgr {
   /* Error exit handler: does not return to caller */
-  JMETHOD(void, error_exit, (j_common_ptr cinfo));
+  void (*error_exit) (j_common_ptr cinfo);
   /* Conditionally emit a trace or warning message */
-  JMETHOD(void, emit_message, (j_common_ptr cinfo, int msg_level));
+  void (*emit_message) (j_common_ptr cinfo, int msg_level);
   /* Routine that actually outputs a trace or error message */
-  JMETHOD(void, output_message, (j_common_ptr cinfo));
+  void (*output_message) (j_common_ptr cinfo);
   /* Format a message string for the most recent JPEG error or message */
-  JMETHOD(void, format_message, (j_common_ptr cinfo, char * buffer));
-#define JMSG_LENGTH_MAX  200	/* recommended size of format_message buffer */
+  void (*format_message) (j_common_ptr cinfo, char *buffer);
+#define JMSG_LENGTH_MAX  200    /* recommended size of format_message buffer */
   /* Reset error state variables at start of a new image */
-  JMETHOD(void, reset_error_mgr, (j_common_ptr cinfo));
-  
+  void (*reset_error_mgr) (j_common_ptr cinfo);
+
   /* The message ID code and any parameters are saved here.
    * A message can have one string parameter or up to 8 int parameters.
    */
@@ -745,18 +743,18 @@
     int i[8];
     char s[JMSG_STR_PARM_MAX];
   } msg_parm;
-  
+
   /* Standard state variables for error facility */
-  
-  int trace_level;		/* max msg_level that will be displayed */
-  
+
+  int trace_level;              /* max msg_level that will be displayed */
+
   /* For recoverable corrupt-data errors, we emit a warning message,
    * but keep going unless emit_message chooses to abort.  emit_message
    * should count warnings in num_warnings.  The surrounding application
    * can check for bad data by seeing if num_warnings is nonzero at the
    * end of processing.
    */
-  long num_warnings;		/* number of corrupt-data warnings */
+  long num_warnings;            /* number of corrupt-data warnings */
 
   /* These fields point to the table(s) of error message strings.
    * An application can change the table pointer to switch to a different
@@ -768,52 +766,52 @@
    * First table includes all errors generated by JPEG library itself.
    * Error code 0 is reserved for a "no such error string" message.
    */
-  const char * const * jpeg_message_table; /* Library errors */
+  const char * const *jpeg_message_table; /* Library errors */
   int last_jpeg_message;    /* Table contains strings 0..last_jpeg_message */
   /* Second table can be added by application (see cjpeg/djpeg for example).
    * It contains strings numbered first_addon_message..last_addon_message.
    */
-  const char * const * addon_message_table; /* Non-library errors */
-  int first_addon_message;	/* code for first string in addon table */
-  int last_addon_message;	/* code for last string in addon table */
+  const char * const *addon_message_table; /* Non-library errors */
+  int first_addon_message;      /* code for first string in addon table */
+  int last_addon_message;       /* code for last string in addon table */
 };
 
 
 /* Progress monitor object */
 
 struct jpeg_progress_mgr {
-  JMETHOD(void, progress_monitor, (j_common_ptr cinfo));
+  void (*progress_monitor) (j_common_ptr cinfo);
 
-  long pass_counter;		/* work units completed in this pass */
-  long pass_limit;		/* total number of work units in this pass */
-  int completed_passes;		/* passes completed so far */
-  int total_passes;		/* total number of passes expected */
+  long pass_counter;            /* work units completed in this pass */
+  long pass_limit;              /* total number of work units in this pass */
+  int completed_passes;         /* passes completed so far */
+  int total_passes;             /* total number of passes expected */
 };
 
 
 /* Data destination object for compression */
 
 struct jpeg_destination_mgr {
-  JOCTET * next_output_byte;	/* => next byte to write in buffer */
-  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
 
-  JMETHOD(void, init_destination, (j_compress_ptr cinfo));
-  JMETHOD(boolean, empty_output_buffer, (j_compress_ptr cinfo));
-  JMETHOD(void, term_destination, (j_compress_ptr cinfo));
+  void (*init_destination) (j_compress_ptr cinfo);
+  boolean (*empty_output_buffer) (j_compress_ptr cinfo);
+  void (*term_destination) (j_compress_ptr cinfo);
 };
 
 
 /* Data source object for decompression */
 
 struct jpeg_source_mgr {
-  const JOCTET * next_input_byte; /* => next byte to read from buffer */
-  size_t bytes_in_buffer;	/* # of bytes remaining in buffer */
+  const JOCTET *next_input_byte; /* => next byte to read from buffer */
+  size_t bytes_in_buffer;       /* # of bytes remaining in buffer */
 
-  JMETHOD(void, init_source, (j_decompress_ptr cinfo));
-  JMETHOD(boolean, fill_input_buffer, (j_decompress_ptr cinfo));
-  JMETHOD(void, skip_input_data, (j_decompress_ptr cinfo, long num_bytes));
-  JMETHOD(boolean, resync_to_restart, (j_decompress_ptr cinfo, int desired));
-  JMETHOD(void, term_source, (j_decompress_ptr cinfo));
+  void (*init_source) (j_decompress_ptr cinfo);
+  boolean (*fill_input_buffer) (j_decompress_ptr cinfo);
+  void (*skip_input_data) (j_decompress_ptr cinfo, long num_bytes);
+  boolean (*resync_to_restart) (j_decompress_ptr cinfo, int desired);
+  void (*term_source) (j_decompress_ptr cinfo);
 };
 
 
@@ -828,51 +826,42 @@
  * successful.
  */
 
-#define JPOOL_PERMANENT	0	/* lasts until master record is destroyed */
-#define JPOOL_IMAGE	1	/* lasts until done with image/datastream */
-#define JPOOL_NUMPOOLS	2
+#define JPOOL_PERMANENT 0       /* lasts until master record is destroyed */
+#define JPOOL_IMAGE     1       /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS  2
 
-typedef struct jvirt_sarray_control * jvirt_sarray_ptr;
-typedef struct jvirt_barray_control * jvirt_barray_ptr;
+typedef struct jvirt_sarray_control *jvirt_sarray_ptr;
+typedef struct jvirt_barray_control *jvirt_barray_ptr;
 
 
 struct jpeg_memory_mgr {
   /* Method pointers */
-  JMETHOD(void *, alloc_small, (j_common_ptr cinfo, int pool_id,
-				size_t sizeofobject));
-  JMETHOD(void FAR *, alloc_large, (j_common_ptr cinfo, int pool_id,
-				     size_t sizeofobject));
-  JMETHOD(JSAMPARRAY, alloc_sarray, (j_common_ptr cinfo, int pool_id,
-				     JDIMENSION samplesperrow,
-				     JDIMENSION numrows));
-  JMETHOD(JBLOCKARRAY, alloc_barray, (j_common_ptr cinfo, int pool_id,
-				      JDIMENSION blocksperrow,
-				      JDIMENSION numrows));
-  JMETHOD(jvirt_sarray_ptr, request_virt_sarray, (j_common_ptr cinfo,
-						  int pool_id,
-						  boolean pre_zero,
-						  JDIMENSION samplesperrow,
-						  JDIMENSION numrows,
-						  JDIMENSION maxaccess));
-  JMETHOD(jvirt_barray_ptr, request_virt_barray, (j_common_ptr cinfo,
-						  int pool_id,
-						  boolean pre_zero,
-						  JDIMENSION blocksperrow,
-						  JDIMENSION numrows,
-						  JDIMENSION maxaccess));
-  JMETHOD(void, realize_virt_arrays, (j_common_ptr cinfo));
-  JMETHOD(JSAMPARRAY, access_virt_sarray, (j_common_ptr cinfo,
-					   jvirt_sarray_ptr ptr,
-					   JDIMENSION start_row,
-					   JDIMENSION num_rows,
-					   boolean writable));
-  JMETHOD(JBLOCKARRAY, access_virt_barray, (j_common_ptr cinfo,
-					    jvirt_barray_ptr ptr,
-					    JDIMENSION start_row,
-					    JDIMENSION num_rows,
-					    boolean writable));
-  JMETHOD(void, free_pool, (j_common_ptr cinfo, int pool_id));
-  JMETHOD(void, self_destruct, (j_common_ptr cinfo));
+  void *(*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject);
+  void *(*alloc_large) (j_common_ptr cinfo, int pool_id,
+                        size_t sizeofobject);
+  JSAMPARRAY (*alloc_sarray) (j_common_ptr cinfo, int pool_id,
+                              JDIMENSION samplesperrow, JDIMENSION numrows);
+  JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id,
+                               JDIMENSION blocksperrow, JDIMENSION numrows);
+  jvirt_sarray_ptr (*request_virt_sarray) (j_common_ptr cinfo, int pool_id,
+                                           boolean pre_zero,
+                                           JDIMENSION samplesperrow,
+                                           JDIMENSION numrows,
+                                           JDIMENSION maxaccess);
+  jvirt_barray_ptr (*request_virt_barray) (j_common_ptr cinfo, int pool_id,
+                                           boolean pre_zero,
+                                           JDIMENSION blocksperrow,
+                                           JDIMENSION numrows,
+                                           JDIMENSION maxaccess);
+  void (*realize_virt_arrays) (j_common_ptr cinfo);
+  JSAMPARRAY (*access_virt_sarray) (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+                                    JDIMENSION start_row, JDIMENSION num_rows,
+                                    boolean writable);
+  JBLOCKARRAY (*access_virt_barray) (j_common_ptr cinfo, jvirt_barray_ptr ptr,
+                                     JDIMENSION start_row, JDIMENSION num_rows,
+                                     boolean writable);
+  void (*free_pool) (j_common_ptr cinfo, int pool_id);
+  void (*self_destruct) (j_common_ptr cinfo);
 
   /* Limit on memory allocation for this JPEG object.  (Note that this is
    * merely advisory, not a guaranteed maximum; it only affects the space
@@ -889,96 +878,21 @@
 /* Routine signature for application-supplied marker processing methods.
  * Need not pass marker code since it is stored in cinfo->unread_marker.
  */
-typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
+typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
 
 
-/* Declarations for routines called by application.
- * The JPP macro hides prototype parameters from compilers that can't cope.
- * Note JPP requires double parentheses.
+/* Originally, this macro was used as a way of defining function prototypes
+ * for both modern compilers and older compilers that did not support
+ * prototype parameters.  libjpeg-turbo has never supported these older,
+ * non-ANSI compilers, but the macro is still included because there is some
+ * software out there that uses it.
  */
 
-#ifdef HAVE_PROTOTYPES
-#define JPP(arglist)	arglist
-#else
-#define JPP(arglist)	()
-#endif
-
-
-/* Short forms of external names for systems with brain-damaged linkers.
- * We shorten external names to be unique in the first six letters, which
- * is good enough for all known systems.
- * (If your compiler itself needs names to be unique in less than 15 
- * characters, you are out of luck.  Get a better compiler.)
- */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_std_error		jStdError
-#define jpeg_CreateCompress	jCreaCompress
-#define jpeg_CreateDecompress	jCreaDecompress
-#define jpeg_destroy_compress	jDestCompress
-#define jpeg_destroy_decompress	jDestDecompress
-#define jpeg_stdio_dest		jStdDest
-#define jpeg_stdio_src		jStdSrc
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
-#define jpeg_mem_dest		jMemDest
-#define jpeg_mem_src		jMemSrc
-#endif
-#define jpeg_set_defaults	jSetDefaults
-#define jpeg_set_colorspace	jSetColorspace
-#define jpeg_default_colorspace	jDefColorspace
-#define jpeg_set_quality	jSetQuality
-#define jpeg_set_linear_quality	jSetLQuality
-#if JPEG_LIB_VERSION >= 70
-#define jpeg_default_qtables	jDefQTables
-#endif
-#define jpeg_add_quant_table	jAddQuantTable
-#define jpeg_quality_scaling	jQualityScaling
-#define jpeg_simple_progression	jSimProgress
-#define jpeg_suppress_tables	jSuppressTables
-#define jpeg_alloc_quant_table	jAlcQTable
-#define jpeg_alloc_huff_table	jAlcHTable
-#define jpeg_start_compress	jStrtCompress
-#define jpeg_write_scanlines	jWrtScanlines
-#define jpeg_finish_compress	jFinCompress
-#if JPEG_LIB_VERSION >= 70
-#define jpeg_calc_jpeg_dimensions	jCjpegDimensions
-#endif
-#define jpeg_write_raw_data	jWrtRawData
-#define jpeg_write_marker	jWrtMarker
-#define jpeg_write_m_header	jWrtMHeader
-#define jpeg_write_m_byte	jWrtMByte
-#define jpeg_write_tables	jWrtTables
-#define jpeg_read_header	jReadHeader
-#define jpeg_start_decompress	jStrtDecompress
-#define jpeg_read_scanlines	jReadScanlines
-#define jpeg_finish_decompress	jFinDecompress
-#define jpeg_read_raw_data	jReadRawData
-#define jpeg_has_multiple_scans	jHasMultScn
-#define jpeg_start_output	jStrtOutput
-#define jpeg_finish_output	jFinOutput
-#define jpeg_input_complete	jInComplete
-#define jpeg_new_colormap	jNewCMap
-#define jpeg_consume_input	jConsumeInput
-#if JPEG_LIB_VERSION >= 80
-#define jpeg_core_output_dimensions	jCoreDimensions
-#endif
-#define jpeg_calc_output_dimensions	jCalcDimensions
-#define jpeg_save_markers	jSaveMarkers
-#define jpeg_set_marker_processor	jSetMarker
-#define jpeg_read_coefficients	jReadCoefs
-#define jpeg_write_coefficients	jWrtCoefs
-#define jpeg_copy_critical_parameters	jCopyCrit
-#define jpeg_abort_compress	jAbrtCompress
-#define jpeg_abort_decompress	jAbrtDecompress
-#define jpeg_abort		jAbort
-#define jpeg_destroy		jDestroy
-#define jpeg_resync_to_restart	jResyncRestart
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
+#define JPP(arglist)    arglist
 
 
 /* Default error-management setup */
-EXTERN(struct jpeg_error_mgr *) jpeg_std_error
-	JPP((struct jpeg_error_mgr * err));
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
 
 /* Initialization of JPEG compression objects.
  * jpeg_create_compress() and jpeg_create_decompress() are the exported
@@ -989,97 +903,89 @@
  */
 #define jpeg_create_compress(cinfo) \
     jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
-			(size_t) sizeof(struct jpeg_compress_struct))
+                        (size_t) sizeof(struct jpeg_compress_struct))
 #define jpeg_create_decompress(cinfo) \
     jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
-			  (size_t) sizeof(struct jpeg_decompress_struct))
-EXTERN(void) jpeg_CreateCompress JPP((j_compress_ptr cinfo,
-				      int version, size_t structsize));
-EXTERN(void) jpeg_CreateDecompress JPP((j_decompress_ptr cinfo,
-					int version, size_t structsize));
+                          (size_t) sizeof(struct jpeg_decompress_struct))
+EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version,
+                                  size_t structsize);
+EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version,
+                                    size_t structsize);
 /* Destruction of JPEG compression objects */
-EXTERN(void) jpeg_destroy_compress JPP((j_compress_ptr cinfo));
-EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_destroy_compress (j_compress_ptr cinfo);
+EXTERN(void) jpeg_destroy_decompress (j_decompress_ptr cinfo);
 
 /* Standard data source and destination managers: stdio streams. */
 /* Caller is responsible for opening the file before and closing after. */
-EXTERN(void) jpeg_stdio_dest JPP((j_compress_ptr cinfo, FILE * outfile));
-EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE * infile));
+EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile);
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /* Data source and destination managers: memory buffers. */
-EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo,
-			       unsigned char ** outbuffer,
-			       unsigned long * outsize));
-EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo,
-			      unsigned char * inbuffer,
-			      unsigned long insize));
+EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer,
+                            unsigned long *outsize);
+EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo,
+                           const unsigned char *inbuffer,
+                           unsigned long insize);
 #endif
 
 /* Default parameter setup for compression */
-EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_set_defaults (j_compress_ptr cinfo);
 /* Compression parameter setup aids */
-EXTERN(void) jpeg_set_colorspace JPP((j_compress_ptr cinfo,
-				      J_COLOR_SPACE colorspace));
-EXTERN(void) jpeg_default_colorspace JPP((j_compress_ptr cinfo));
-EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
-				   boolean force_baseline));
-EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
-					  int scale_factor,
-					  boolean force_baseline));
+EXTERN(void) jpeg_set_colorspace (j_compress_ptr cinfo,
+                                  J_COLOR_SPACE colorspace);
+EXTERN(void) jpeg_default_colorspace (j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_quality (j_compress_ptr cinfo, int quality,
+                               boolean force_baseline);
+EXTERN(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
+                                      boolean force_baseline);
 #if JPEG_LIB_VERSION >= 70
-EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
-				       boolean force_baseline));
+EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo,
+                                   boolean force_baseline);
 #endif
-EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
-				       const unsigned int *basic_table,
-				       int scale_factor,
-				       boolean force_baseline));
-EXTERN(int) jpeg_quality_scaling JPP((int quality));
-EXTERN(void) jpeg_simple_progression JPP((j_compress_ptr cinfo));
-EXTERN(void) jpeg_suppress_tables JPP((j_compress_ptr cinfo,
-				       boolean suppress));
-EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table JPP((j_common_ptr cinfo));
-EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table JPP((j_common_ptr cinfo));
+EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
+                                   const unsigned int *basic_table,
+                                   int scale_factor, boolean force_baseline);
+EXTERN(int) jpeg_quality_scaling (int quality);
+EXTERN(void) jpeg_simple_progression (j_compress_ptr cinfo);
+EXTERN(void) jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress);
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table (j_common_ptr cinfo);
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo);
 
 /* Main entry points for compression */
-EXTERN(void) jpeg_start_compress JPP((j_compress_ptr cinfo,
-				      boolean write_all_tables));
-EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
-					     JSAMPARRAY scanlines,
-					     JDIMENSION num_lines));
-EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo,
+                                  boolean write_all_tables);
+EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo,
+                                         JSAMPARRAY scanlines,
+                                         JDIMENSION num_lines);
+EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo);
 
 #if JPEG_LIB_VERSION >= 70
 /* Precalculate JPEG dimensions for current compression parameters. */
-EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo);
 #endif
 
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
-					    JSAMPIMAGE data,
-					    JDIMENSION num_lines));
+EXTERN(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
+                                        JDIMENSION num_lines);
 
 /* Write a special marker.  See libjpeg.txt concerning safe usage. */
-EXTERN(void) jpeg_write_marker
-	JPP((j_compress_ptr cinfo, int marker,
-	     const JOCTET * dataptr, unsigned int datalen));
+EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker,
+                                const JOCTET *dataptr, unsigned int datalen);
 /* Same, but piecemeal. */
-EXTERN(void) jpeg_write_m_header
-	JPP((j_compress_ptr cinfo, int marker, unsigned int datalen));
-EXTERN(void) jpeg_write_m_byte
-	JPP((j_compress_ptr cinfo, int val));
+EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker,
+                                  unsigned int datalen);
+EXTERN(void) jpeg_write_m_byte (j_compress_ptr cinfo, int val);
 
 /* Alternate compression function: just write an abbreviated table file */
-EXTERN(void) jpeg_write_tables JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo);
 
 /* Decompression startup: read start of JPEG datastream to see what's there */
-EXTERN(int) jpeg_read_header JPP((j_decompress_ptr cinfo,
-				  boolean require_image));
+EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
 /* Return value is one of: */
-#define JPEG_SUSPENDED		0 /* Suspended due to lack of input data */
-#define JPEG_HEADER_OK		1 /* Found valid image datastream */
-#define JPEG_HEADER_TABLES_ONLY	2 /* Found valid table-specs-only datastream */
+#define JPEG_SUSPENDED          0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK          1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
 /* If you pass require_image = TRUE (normal case), you need not check for
  * a TABLES_ONLY return code; an abbreviated file will cause an error exit.
  * JPEG_SUSPENDED is only possible if you use a data source module that can
@@ -1087,56 +993,55 @@
  */
 
 /* Main entry points for decompression */
-EXTERN(boolean) jpeg_start_decompress JPP((j_decompress_ptr cinfo));
-EXTERN(JDIMENSION) jpeg_read_scanlines JPP((j_decompress_ptr cinfo,
-					    JSAMPARRAY scanlines,
-					    JDIMENSION max_lines));
+EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo);
+EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
+                                        JSAMPARRAY scanlines,
+                                        JDIMENSION max_lines);
 EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo,
                                         JDIMENSION num_lines);
-EXTERN(boolean) jpeg_finish_decompress JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                                 JDIMENSION *width);
+EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
 
 /* Replaces jpeg_read_scanlines when reading raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_read_raw_data JPP((j_decompress_ptr cinfo,
-					   JSAMPIMAGE data,
-					   JDIMENSION max_lines));
+EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                       JDIMENSION max_lines);
 
 /* Additional entry points for buffered-image mode. */
-EXTERN(boolean) jpeg_has_multiple_scans JPP((j_decompress_ptr cinfo));
-EXTERN(boolean) jpeg_start_output JPP((j_decompress_ptr cinfo,
-				       int scan_number));
-EXTERN(boolean) jpeg_finish_output JPP((j_decompress_ptr cinfo));
-EXTERN(boolean) jpeg_input_complete JPP((j_decompress_ptr cinfo));
-EXTERN(void) jpeg_new_colormap JPP((j_decompress_ptr cinfo));
-EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo));
+EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number);
+EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
+EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
 /* Return value is one of: */
-/* #define JPEG_SUSPENDED	0    Suspended due to lack of input data */
-#define JPEG_REACHED_SOS	1 /* Reached start of new scan */
-#define JPEG_REACHED_EOI	2 /* Reached end of image */
-#define JPEG_ROW_COMPLETED	3 /* Completed one iMCU row */
-#define JPEG_SCAN_COMPLETED	4 /* Completed last iMCU row of a scan */
+/* #define JPEG_SUSPENDED       0    Suspended due to lack of input data */
+#define JPEG_REACHED_SOS        1 /* Reached start of new scan */
+#define JPEG_REACHED_EOI        2 /* Reached end of image */
+#define JPEG_ROW_COMPLETED      3 /* Completed one iMCU row */
+#define JPEG_SCAN_COMPLETED     4 /* Completed last iMCU row of a scan */
 
 /* Precalculate output dimensions for current decompression parameters. */
 #if JPEG_LIB_VERSION >= 80
-EXTERN(void) jpeg_core_output_dimensions JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_core_output_dimensions (j_decompress_ptr cinfo);
 #endif
-EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_calc_output_dimensions (j_decompress_ptr cinfo);
 
 /* Control saving of COM and APPn markers into marker_list. */
-EXTERN(void) jpeg_save_markers
-	JPP((j_decompress_ptr cinfo, int marker_code,
-	     unsigned int length_limit));
+EXTERN(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
+                                unsigned int length_limit);
 
 /* Install a special processing method for COM or APPn markers. */
-EXTERN(void) jpeg_set_marker_processor
-	JPP((j_decompress_ptr cinfo, int marker_code,
-	     jpeg_marker_parser_method routine));
+EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo,
+                                        int marker_code,
+                                        jpeg_marker_parser_method routine);
 
 /* Read or write raw DCT coefficients --- useful for lossless transcoding. */
-EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients JPP((j_decompress_ptr cinfo));
-EXTERN(void) jpeg_write_coefficients JPP((j_compress_ptr cinfo,
-					  jvirt_barray_ptr * coef_arrays));
-EXTERN(void) jpeg_copy_critical_parameters JPP((j_decompress_ptr srcinfo,
-						j_compress_ptr dstinfo));
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
+                                      jvirt_barray_ptr *coef_arrays);
+EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
+                                            j_compress_ptr dstinfo);
 
 /* If you choose to abort compression or decompression before completing
  * jpeg_finish_(de)compress, then you need to clean up to release memory,
@@ -1144,28 +1049,27 @@
  * if you're done with the JPEG object, but if you want to clean it up and
  * reuse it, call this:
  */
-EXTERN(void) jpeg_abort_compress JPP((j_compress_ptr cinfo));
-EXTERN(void) jpeg_abort_decompress JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_abort_compress (j_compress_ptr cinfo);
+EXTERN(void) jpeg_abort_decompress (j_decompress_ptr cinfo);
 
 /* Generic versions of jpeg_abort and jpeg_destroy that work on either
  * flavor of JPEG object.  These may be more convenient in some places.
  */
-EXTERN(void) jpeg_abort JPP((j_common_ptr cinfo));
-EXTERN(void) jpeg_destroy JPP((j_common_ptr cinfo));
+EXTERN(void) jpeg_abort (j_common_ptr cinfo);
+EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
 
 /* Default restart-marker-resync procedure for use by data source modules */
-EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo,
-					    int desired));
+EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
 
 
 /* These marker codes are exported since applications and data source modules
  * are likely to want to use them.
  */
 
-#define JPEG_RST0	0xD0	/* RST0 marker code */
-#define JPEG_EOI	0xD9	/* EOI marker code */
-#define JPEG_APP0	0xE0	/* APP0 marker code */
-#define JPEG_COM	0xFE	/* COM marker code */
+#define JPEG_RST0       0xD0    /* RST0 marker code */
+#define JPEG_EOI        0xD9    /* EOI marker code */
+#define JPEG_APP0       0xE0    /* APP0 marker code */
+#define JPEG_COM        0xFE    /* COM marker code */
 
 
 /* If we have a brain-damaged compiler that emits warnings (or worse, errors)
@@ -1174,7 +1078,7 @@
  */
 
 #ifdef INCOMPLETE_TYPES_BROKEN
-#ifndef JPEG_INTERNALS		/* will be defined in jpegint.h */
+#ifndef JPEG_INTERNALS          /* will be defined in jpegint.h */
 struct jvirt_sarray_control { long dummy; };
 struct jvirt_barray_control { long dummy; };
 struct jpeg_comp_master { long dummy; };
@@ -1209,8 +1113,8 @@
  */
 
 #ifdef JPEG_INTERNALS
-#include "jpegint.h"		/* fetch private declarations */
-#include "jerror.h"		/* fetch error codes too */
+#include "jpegint.h"            /* fetch private declarations */
+#include "jerror.h"             /* fetch error codes too */
 #endif
 
 #ifdef __cplusplus
diff --git a/jpeglibmangler.h b/jpeglibmangler.h
index 59f554a..ed87a53 100644
--- a/jpeglibmangler.h
+++ b/jpeglibmangler.h
@@ -83,6 +83,7 @@
 #define jpeg_start_decompress chromium_jpeg_start_decompress
 #define jpeg_read_scanlines chromium_jpeg_read_scanlines
 #define jpeg_skip_scanlines chromium_jpeg_skip_scanlines
+#define jpeg_crop_scanline chromium_jpeg_crop_scanline
 #define jpeg_finish_decompress chromium_jpeg_finish_decompress
 #define jpeg_read_raw_data chromium_jpeg_read_raw_data
 #define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans
diff --git a/jpegtran.1 b/jpegtran.1
new file mode 100644
index 0000000..7f3c853
--- /dev/null
+++ b/jpegtran.1
@@ -0,0 +1,290 @@
+.TH JPEGTRAN 1 "18 February 2016"
+.SH NAME
+jpegtran \- lossless transformation of JPEG files
+.SH SYNOPSIS
+.B jpegtran
+[
+.I options
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B jpegtran
+performs various useful transformations of JPEG files.
+It can translate the coded representation from one variant of JPEG to another,
+for example from baseline JPEG to progressive JPEG or vice versa.  It can also
+perform some rearrangements of the image data, for example turning an image
+from landscape to portrait format by rotation.
+.PP
+For EXIF files and JPEG files containing Exif data, you may prefer to use
+.B exiftran
+instead.
+.PP
+.B jpegtran
+works by rearranging the compressed data (DCT coefficients), without
+ever fully decoding the image.  Therefore, its transformations are lossless:
+there is no image degradation at all, which would not be true if you used
+.B djpeg
+followed by
+.B cjpeg
+to accomplish the same conversion.  But by the same token,
+.B jpegtran
+cannot perform lossy operations such as changing the image quality.  However,
+while the image data is losslessly transformed, metadata can be removed.  See
+the
+.B \-copy
+option for specifics.
+.PP
+.B jpegtran
+reads the named JPEG/JFIF file, or the standard input if no file is
+named, and produces a JPEG/JFIF file on the standard output.
+.SH OPTIONS
+All switch names may be abbreviated; for example,
+.B \-optimize
+may be written
+.B \-opt
+or
+.BR \-o .
+Upper and lower case are equivalent.
+British spellings are also accepted (e.g.,
+.BR \-optimise ),
+though for brevity these are not mentioned below.
+.PP
+To specify the coded JPEG representation used in the output file,
+.B jpegtran
+accepts a subset of the switches recognized by
+.BR cjpeg :
+.TP
+.B \-optimize
+Perform optimization of entropy encoding parameters.
+.TP
+.B \-progressive
+Create progressive JPEG file.
+.TP
+.BI \-restart " N"
+Emit a JPEG restart marker every N MCU rows, or every N MCU blocks if "B" is
+attached to the number.
+.TP
+.B \-arithmetic
+Use arithmetic coding.
+.TP
+.BI \-scans " file"
+Use the scan script given in the specified text file.
+.PP
+See
+.BR cjpeg (1)
+for more details about these switches.
+If you specify none of these switches, you get a plain baseline-JPEG output
+file.  The quality setting and so forth are determined by the input file.
+.PP
+The image can be losslessly transformed by giving one of these switches:
+.TP
+.B \-flip horizontal
+Mirror image horizontally (left-right).
+.TP
+.B \-flip vertical
+Mirror image vertically (top-bottom).
+.TP
+.B \-rotate 90
+Rotate image 90 degrees clockwise.
+.TP
+.B \-rotate 180
+Rotate image 180 degrees.
+.TP
+.B \-rotate 270
+Rotate image 270 degrees clockwise (or 90 ccw).
+.TP
+.B \-transpose
+Transpose image (across UL-to-LR axis).
+.TP
+.B \-transverse
+Transverse transpose (across UR-to-LL axis).
+.PP
+The transpose transformation has no restrictions regarding image dimensions.
+The other transformations operate rather oddly if the image dimensions are not
+a multiple of the iMCU size (usually 8 or 16 pixels), because they can only
+transform complete blocks of DCT coefficient data in the desired way.
+.PP
+.BR jpegtran 's
+default behavior when transforming an odd-size image is designed
+to preserve exact reversibility and mathematical consistency of the
+transformation set.  As stated, transpose is able to flip the entire image
+area.  Horizontal mirroring leaves any partial iMCU column at the right edge
+untouched, but is able to flip all rows of the image.  Similarly, vertical
+mirroring leaves any partial iMCU row at the bottom edge untouched, but is
+able to flip all columns.  The other transforms can be built up as sequences
+of transpose and flip operations; for consistency, their actions on edge
+pixels are defined to be the same as the end result of the corresponding
+transpose-and-flip sequence.
+.PP
+For practical use, you may prefer to discard any untransformable edge pixels
+rather than having a strange-looking strip along the right and/or bottom edges
+of a transformed image.  To do this, add the
+.B \-trim
+switch:
+.TP
+.B \-trim
+Drop non-transformable edge blocks.
+.IP
+Obviously, a transformation with
+.B \-trim
+is not reversible, so strictly speaking
+.B jpegtran
+with this switch is not lossless.  Also, the expected mathematical
+equivalences between the transformations no longer hold.  For example,
+.B \-rot 270 \-trim
+trims only the bottom edge, but
+.B \-rot 90 \-trim
+followed by
+.B \-rot 180 \-trim
+trims both edges.
+.TP
+.B \-perfect
+If you are only interested in perfect transformations, add the
+.B \-perfect
+switch.  This causes
+.B jpegtran
+to fail with an error if the transformation is not perfect.
+.IP
+For example, you may want to do
+.IP
+.B (jpegtran \-rot 90 \-perfect
+.I foo.jpg
+.B || djpeg
+.I foo.jpg
+.B | pnmflip \-r90 | cjpeg)
+.IP
+to do a perfect rotation, if available, or an approximated one if not.
+.PP
+This version of \fBjpegtran\fR also offers a lossless crop option, which
+discards data outside of a given image region but losslessly preserves what is
+inside.  Like the rotate and flip transforms, lossless crop is restricted by the
+current JPEG format; the upper left corner of the selected region must fall on
+an iMCU boundary.  If it doesn't, then it is silently moved up and/or left to
+the nearest iMCU boundary (the lower right corner is unchanged.)  Thus, the
+output image covers at least the requested region, but it may cover more.  The
+adjustment of the region dimensions may be optionally disabled by attaching
+an 'f' character ("force") to the width or height number.
+.PP
+The image can be losslessly cropped by giving the switch:
+.TP
+.B \-crop WxH+X+Y
+Crop the image to a rectangular region of width W and height H, starting at
+point X,Y.  The lossless crop feature discards data outside of a given image
+region but losslessly preserves what is inside.  Like the rotate and flip
+transforms, lossless crop is restricted by the current JPEG format; the upper
+left corner of the selected region must fall on an iMCU boundary.  If it
+doesn't, then it is silently moved up and/or left to the nearest iMCU boundary
+(the lower right corner is unchanged.)
+.PP
+Other not-strictly-lossless transformation switches are:
+.TP
+.B \-grayscale
+Force grayscale output.
+.IP
+This option discards the chrominance channels if the input image is YCbCr
+(i.e., a standard color JPEG), resulting in a grayscale JPEG file.  The
+luminance channel is preserved exactly, so this is a better method of reducing
+to grayscale than decompression, conversion, and recompression.  This switch
+is particularly handy for fixing a monochrome picture that was mistakenly
+encoded as a color JPEG.  (In such a case, the space savings from getting rid
+of the near-empty chroma channels won't be large; but the decoding time for
+a grayscale JPEG is substantially less than that for a color JPEG.)
+.PP
+.B jpegtran
+also recognizes these switches that control what to do with "extra" markers,
+such as comment blocks:
+.TP
+.B \-copy none
+Copy no extra markers from source file.  This setting suppresses all
+comments and other metadata in the source file.
+.TP
+.B \-copy comments
+Copy only comment markers.  This setting copies comments from the source file
+but discards any other metadata.
+.TP
+.B \-copy all
+Copy all extra markers.  This setting preserves miscellaneous markers
+found in the source file, such as JFIF thumbnails, Exif data, and Photoshop
+settings.  In some files, these extra markers can be sizable.  Note that this
+option will copy thumbnails as-is; they will not be transformed.
+.PP
+The default behavior is \fB\-copy comments\fR.  (Note: in IJG releases v6 and
+v6a, \fBjpegtran\fR always did the equivalent of \fB\-copy none\fR.)
+.PP
+Additional switches recognized by jpegtran are:
+.TP
+.BI \-maxmemory " N"
+Set limit for amount of memory to use in processing large images.  Value is
+in thousands of bytes, or millions of bytes if "M" is attached to the
+number.  For example,
+.B \-max 4m
+selects 4000000 bytes.  If more space is needed, temporary files will be used.
+.TP
+.BI \-outfile " name"
+Send output image to the named file, not to standard output.
+.TP
+.B \-verbose
+Enable debug printout.  More
+.BR \-v 's
+give more output.  Also, version information is printed at startup.
+.TP
+.B \-debug
+Same as
+.BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
+.SH EXAMPLES
+.LP
+This example converts a baseline JPEG file to progressive form:
+.IP
+.B jpegtran \-progressive
+.I foo.jpg
+.B >
+.I fooprog.jpg
+.PP
+This example rotates an image 90 degrees clockwise, discarding any
+unrotatable edge pixels:
+.IP
+.B jpegtran \-rot 90 \-trim
+.I foo.jpg
+.B >
+.I foo90.jpg
+.SH ENVIRONMENT
+.TP
+.B JPEGMEM
+If this environment variable is set, its value is the default memory limit.
+The value is specified as described for the
+.B \-maxmemory
+switch.
+.B JPEGMEM
+overrides the default value specified when the program was compiled, and
+itself is overridden by an explicit
+.BR \-maxmemory .
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR djpeg (1),
+.BR rdjpgcom (1),
+.BR wrjpgcom (1)
+.br
+Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
+.SH AUTHOR
+Independent JPEG Group
+.PP
+This file was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo and to wordsmith certain sections.
+.SH BUGS
+The transform options can't transform odd-size images perfectly.  Use
+.B \-trim
+or
+.B \-perfect
+if you don't like the results.
+.PP
+The entire image is read into memory and then written out again, even in
+cases where this isn't really necessary.  Expect swapping on large images,
+especially when using the more complex transform options.
diff --git a/jpegtran.c b/jpegtran.c
index 54c8ece..c44f21e 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-2010, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2010, 2014, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a command-line user interface for JPEG transcoding.
  * It is very similar to cjpeg.c, and partly to djpeg.c, but provides
@@ -13,18 +14,18 @@
  * provides some lossless and sort-of-lossless transformations of JPEG data.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include "transupp.h"		/* Support routines for jpegtran */
-#include "jversion.h"		/* for version message */
-#include "config.h"
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "transupp.h"           /* Support routines for jpegtran */
+#include "jversion.h"           /* for version message */
+#include "jconfigint.h"
 
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef USE_CCOMMAND             /* command-line reader for Macintosh */
 #ifdef __MWERKS__
 #include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
+#include <console.h>            /* ... and this */
 #endif
 #ifdef THINK_C
-#include <console.h>		/* Think declares it here */
+#include <console.h>            /* Think declares it here */
 #endif
 #endif
 
@@ -38,9 +39,9 @@
  */
 
 
-static const char * progname;	/* program name for error messages */
-static char * outfilename;	/* for -outfile switch */
-static JCOPY_OPTION copyoption;	/* -copy switch */
+static const char *progname;    /* program name for error messages */
+static char *outfilename;       /* for -outfile switch */
+static JCOPY_OPTION copyoption; /* -copy switch */
 static jpeg_transform_info transformoption; /* image transformation options */
 
 
@@ -86,6 +87,7 @@
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
 #ifdef C_MULTISCAN_FILES_SUPPORTED
   fprintf(stderr, "  -scans file    Create multi-scan JPEG per script file\n");
@@ -106,12 +108,12 @@
     transformoption.transform = transform;
   } else {
     fprintf(stderr, "%s: can only do one image transformation at a time\n",
-	    progname);
+            progname);
     usage();
   }
 #else
   fprintf(stderr, "%s: sorry, image transformation was not compiled\n",
-	  progname);
+          progname);
   exit(EXIT_FAILURE);
 #endif
 }
@@ -119,7 +121,7 @@
 
 LOCAL(int)
 parse_switches (j_compress_ptr cinfo, int argc, char **argv,
-		int last_file_arg_seen, boolean for_real)
+                int last_file_arg_seen, boolean for_real)
 /* Parse optional switches.
  * Returns argv[] index of first file-name argument (== argc if none).
  * Any file names with indexes <= last_file_arg_seen are ignored;
@@ -130,9 +132,9 @@
  */
 {
   int argn;
-  char * arg;
+  char *arg;
   boolean simple_progressive;
-  char * scansarg = NULL;	/* saves -scans parm if any */
+  char *scansarg = NULL;        /* saves -scans parm if any */
 
   /* Set up default JPEG parameters. */
   simple_progressive = FALSE;
@@ -153,12 +155,12 @@
     if (*arg != '-') {
       /* Not a switch, must be a file name argument */
       if (argn <= last_file_arg_seen) {
-	outfilename = NULL;	/* -outfile applies to just one input file */
-	continue;		/* ignore this name if previously processed */
+        outfilename = NULL;     /* -outfile applies to just one input file */
+        continue;               /* ignore this name if previously processed */
       }
-      break;			/* else done parsing switches */
+      break;                    /* else done parsing switches */
     }
-    arg++;			/* advance past switch marker character */
+    arg++;                      /* advance past switch marker character */
 
     if (keymatch(arg, "arithmetic", 1)) {
       /* Use arithmetic coding. */
@@ -166,35 +168,35 @@
       cinfo->arith_code = TRUE;
 #else
       fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
     } else if (keymatch(arg, "copy", 2)) {
       /* Select which extra markers to copy. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (keymatch(argv[argn], "none", 1)) {
-	copyoption = JCOPYOPT_NONE;
+        copyoption = JCOPYOPT_NONE;
       } else if (keymatch(argv[argn], "comments", 1)) {
-	copyoption = JCOPYOPT_COMMENTS;
+        copyoption = JCOPYOPT_COMMENTS;
       } else if (keymatch(argv[argn], "all", 1)) {
-	copyoption = JCOPYOPT_ALL;
+        copyoption = JCOPYOPT_ALL;
       } else
-	usage();
+        usage();
 
     } else if (keymatch(arg, "crop", 2)) {
       /* Perform lossless cropping. */
 #if TRANSFORMS_SUPPORTED
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (! jtransform_parse_crop_spec(&transformoption, argv[argn])) {
-	fprintf(stderr, "%s: bogus -crop argument '%s'\n",
-		progname, argv[argn]);
-	exit(EXIT_FAILURE);
+        fprintf(stderr, "%s: bogus -crop argument '%s'\n",
+                progname, argv[argn]);
+        exit(EXIT_FAILURE);
       }
 #else
-      select_transform(JXFORM_NONE);	/* force an error */
+      select_transform(JXFORM_NONE);    /* force an error */
 #endif
 
     } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
@@ -203,32 +205,37 @@
       static boolean printed_version = FALSE;
 
       if (! printed_version) {
-	fprintf(stderr, "%s version %s (build %s)\n",
-		PACKAGE_NAME, VERSION, BUILD);
-	fprintf(stderr, "%s\n\n", JCOPYRIGHT);
-	fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
-		JVERSION);
-	printed_version = TRUE;
+        fprintf(stderr, "%s version %s (build %s)\n",
+                PACKAGE_NAME, VERSION, BUILD);
+        fprintf(stderr, "%s\n\n", JCOPYRIGHT);
+        fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
+                JVERSION);
+        printed_version = TRUE;
       }
       cinfo->err->trace_level++;
 
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
     } else if (keymatch(arg, "flip", 1)) {
       /* Mirror left-right or top-bottom. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (keymatch(argv[argn], "horizontal", 1))
-	select_transform(JXFORM_FLIP_H);
+        select_transform(JXFORM_FLIP_H);
       else if (keymatch(argv[argn], "vertical", 1))
-	select_transform(JXFORM_FLIP_V);
+        select_transform(JXFORM_FLIP_V);
       else
-	usage();
+        usage();
 
     } else if (keymatch(arg, "grayscale", 1) || keymatch(arg, "greyscale",1)) {
       /* Force to grayscale. */
 #if TRANSFORMS_SUPPORTED
       transformoption.force_grayscale = TRUE;
 #else
-      select_transform(JXFORM_NONE);	/* force an error */
+      select_transform(JXFORM_NONE);    /* force an error */
 #endif
 
     } else if (keymatch(arg, "maxmemory", 3)) {
@@ -236,12 +243,12 @@
       long lval;
       char ch = 'x';
 
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
+        usage();
       if (ch == 'm' || ch == 'M')
-	lval *= 1000L;
+        lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
     } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
@@ -250,15 +257,15 @@
       cinfo->optimize_coding = TRUE;
 #else
       fprintf(stderr, "%s: sorry, entropy optimization was not compiled\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
     } else if (keymatch(arg, "outfile", 4)) {
       /* Set output file name. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      outfilename = argv[argn];	/* save it away for later use */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      outfilename = argv[argn]; /* save it away for later use */
 
     } else if (keymatch(arg, "perfect", 2)) {
       /* Fail if there is any partial edge MCUs that the transform can't
@@ -272,7 +279,7 @@
       /* We must postpone execution until num_components is known. */
 #else
       fprintf(stderr, "%s: sorry, progressive output was not compiled\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
@@ -281,43 +288,43 @@
       long lval;
       char ch = 'x';
 
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
+        usage();
       if (lval < 0 || lval > 65535L)
-	usage();
+        usage();
       if (ch == 'b' || ch == 'B') {
-	cinfo->restart_interval = (unsigned int) lval;
-	cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
+        cinfo->restart_interval = (unsigned int) lval;
+        cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
       } else {
-	cinfo->restart_in_rows = (int) lval;
-	/* restart_interval will be computed during startup */
+        cinfo->restart_in_rows = (int) lval;
+        /* restart_interval will be computed during startup */
       }
 
     } else if (keymatch(arg, "rotate", 2)) {
       /* Rotate 90, 180, or 270 degrees (measured clockwise). */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       if (keymatch(argv[argn], "90", 2))
-	select_transform(JXFORM_ROT_90);
+        select_transform(JXFORM_ROT_90);
       else if (keymatch(argv[argn], "180", 3))
-	select_transform(JXFORM_ROT_180);
+        select_transform(JXFORM_ROT_180);
       else if (keymatch(argv[argn], "270", 3))
-	select_transform(JXFORM_ROT_270);
+        select_transform(JXFORM_ROT_270);
       else
-	usage();
+        usage();
 
     } else if (keymatch(arg, "scans", 1)) {
       /* Set scan script. */
 #ifdef C_MULTISCAN_FILES_SUPPORTED
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
       scansarg = argv[argn];
       /* We must postpone reading the file in case -progressive appears. */
 #else
       fprintf(stderr, "%s: sorry, multi-scan output was not compiled\n",
-	      progname);
+              progname);
       exit(EXIT_FAILURE);
 #endif
 
@@ -334,7 +341,7 @@
       transformoption.trim = TRUE;
 
     } else {
-      usage();			/* bogus switch */
+      usage();                  /* bogus switch */
     }
   }
 
@@ -343,18 +350,18 @@
   if (for_real) {
 
 #ifdef C_PROGRESSIVE_SUPPORTED
-    if (simple_progressive)	/* process -progressive; -scans can override */
+    if (simple_progressive)     /* process -progressive; -scans can override */
       jpeg_simple_progression(cinfo);
 #endif
 
 #ifdef C_MULTISCAN_FILES_SUPPORTED
-    if (scansarg != NULL)	/* process -scans if it was present */
+    if (scansarg != NULL)       /* process -scans if it was present */
       if (! read_scan_script(cinfo, scansarg))
-	usage();
+        usage();
 #endif
   }
 
-  return argn;			/* return index of next arg (file name) */
+  return argn;                  /* return index of next arg (file name) */
 }
 
 
@@ -371,13 +378,13 @@
 #ifdef PROGRESS_REPORT
   struct cdjpeg_progress_mgr progress;
 #endif
-  jvirt_barray_ptr * src_coef_arrays;
-  jvirt_barray_ptr * dst_coef_arrays;
+  jvirt_barray_ptr *src_coef_arrays;
+  jvirt_barray_ptr *dst_coef_arrays;
   int file_index;
   /* We assume all-in-memory processing and can therefore use only a
-   * single file pointer for sequential input and output operation. 
+   * single file pointer for sequential input and output operation.
    */
-  FILE * fp;
+  FILE *fp;
 
   /* On Mac, fetch a command line. */
 #ifdef USE_CCOMMAND
@@ -386,7 +393,7 @@
 
   progname = argv[0];
   if (progname == NULL || progname[0] == 0)
-    progname = "jpegtran";	/* in case C library doesn't provide it */
+    progname = "jpegtran";      /* in case C library doesn't provide it */
 
   /* Initialize the JPEG decompression object with default error handling. */
   srcinfo.err = jpeg_std_error(&jsrcerr);
@@ -395,13 +402,6 @@
   dstinfo.err = jpeg_std_error(&jdsterr);
   jpeg_create_compress(&dstinfo);
 
-  /* Now safe to enable signal catcher.
-   * Note: we assume only the decompression object will have virtual arrays.
-   */
-#ifdef NEED_SIGNAL_CATCHER
-  enable_signal_catcher((j_common_ptr) &srcinfo);
-#endif
-
   /* Scan command line to find file names.
    * It is convenient to use just one switch-parsing routine, but the switch
    * values read here are mostly ignored; we will rescan the switches after
@@ -419,14 +419,14 @@
   if (outfilename == NULL) {
     if (file_index != argc-2) {
       fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
+              progname);
       usage();
     }
     outfilename = argv[file_index+1];
   } else {
     if (file_index != argc-1) {
       fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
+              progname);
       usage();
     }
   }
@@ -485,8 +485,8 @@
    */
 #if TRANSFORMS_SUPPORTED
   dst_coef_arrays = jtransform_adjust_parameters(&srcinfo, &dstinfo,
-						 src_coef_arrays,
-						 &transformoption);
+                                                 src_coef_arrays,
+                                                 &transformoption);
 #else
   dst_coef_arrays = src_coef_arrays;
 #endif
@@ -527,8 +527,8 @@
   /* Execute image transformation, if any */
 #if TRANSFORMS_SUPPORTED
   jtransform_execute_transformation(&srcinfo, &dstinfo,
-				    src_coef_arrays,
-				    &transformoption);
+                                    src_coef_arrays,
+                                    &transformoption);
 #endif
 
   /* Finish compression and release memory */
@@ -547,5 +547,5 @@
 
   /* All done. */
   exit(jsrcerr.num_warnings + jdsterr.num_warnings ?EXIT_WARNING:EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
+  return 0;                     /* suppress no-return-value warnings */
 }
diff --git a/jpegut.c b/jpegut.c
deleted file mode 100644
index cec0f72..0000000
--- a/jpegut.c
+++ /dev/null
@@ -1,387 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- * Copyright (C)2009 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "./rrtimer.h"
-#include "./turbojpeg.h"
-
-#define _catch(f) {if((f)==-1) {printf("TJPEG: %s\n", tjGetErrorStr());  bailout();}}
-
-const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
-const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
-
-int exitstatus=0;
-#define bailout() {exitstatus=-1;  goto finally;}
-
-int pixels[9][3]=
-{
-	{0, 255, 0},
-	{255, 0, 255},
-	{255, 255, 0},
-	{0, 0, 255},
-	{0, 255, 255},
-	{255, 0, 0},
-	{255, 255, 255},
-	{0, 0, 0},
-	{255, 0, 0}
-};
-
-void initbuf(unsigned char *buf, int w, int h, int ps, int flags)
-{
-	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
-		_i, j;
-	if(flags&TJ_ALPHAFIRST) {roffset++;  goffset++;  boffset++;}
-	memset(buf, 0, w*h*ps);
-	for(_i=0; _i<16; _i++)
-	{
-		if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-		for(j=0; j<w; j++)
-		{
-			buf[(w*i+j)*ps+roffset]=255;
-			if(((_i/8)+(j/8))%2==0)
-			{
-				buf[(w*i+j)*ps+goffset]=255;
-				buf[(w*i+j)*ps+boffset]=255;
-			}
-		}
-	}
-	for(_i=16; _i<h; _i++)
-	{
-		if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-		for(j=0; j<w; j++)
-		{
-			if(((_i/8)+(j/8))%2!=0)
-			{
-				buf[(w*i+j)*ps+roffset]=255;
-				buf[(w*i+j)*ps+goffset]=255;
-			}
-		}
-	}
-}
-
-void dumpbuf(unsigned char *buf, int w, int h, int ps, int flags)
-{
-	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
-		j;
-	for(i=0; i<h; i++)
-	{
-		for(j=0; j<w; j++)
-		{
-			printf("%.3d/%.3d/%.3d ", buf[(w*i+j)*ps+roffset],
-				buf[(w*i+j)*ps+roffset], buf[(w*i+j)*ps+roffset]);
-		}
-		printf("\n");
-	}
-}
-
-int checkbuf(unsigned char *buf, int w, int h, int ps, int subsamp, int flags)
-{
-	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
-		_i, j;
-	if(flags&TJ_ALPHAFIRST) {roffset++;  goffset++;  boffset++;}
-	if(subsamp==TJ_GRAYSCALE)
-	{
-		for(_i=0; _i<16; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				unsigned char r=buf[(w*i+j)*ps+roffset],
-					g=buf[(w*i+j)*ps+goffset],
-					b=buf[(w*i+j)*ps+boffset];
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(r<253 || g<253 || b<253) return 0;
-				}
-				else
-				{
-					if(r<74 || r>78 || g<74 || g>78 || b<74 || b>78) return 0;
-				}
-			}
-		}
-		for(_i=16; _i<h; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				unsigned char r=buf[(w*i+j)*ps+roffset],
-					g=buf[(w*i+j)*ps+goffset],
-					b=buf[(w*i+j)*ps+boffset];
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(r>2 || g>2 || b>2) return 0;
-				}
-				else
-				{
-					if(r<224 || r>228 || g<224 || g>228 || b<224 || b>228) return 0;
-				}
-			}
-		}
-	}
-	else
-	{
-		for(_i=0; _i<16; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				if(buf[(w*i+j)*ps+roffset]<253) return 0;
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(buf[(w*i+j)*ps+goffset]<253) return 0;
-					if(buf[(w*i+j)*ps+boffset]<253) return 0;
-				}
-				else
-				{
-					if(buf[(w*i+j)*ps+goffset]>2) return 0;
-					if(buf[(w*i+j)*ps+boffset]>2) return 0;
-				}
-			}
-		}
-		for(_i=16; _i<h; _i++)
-		{
-			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
-			for(j=0; j<w; j++)
-			{
-				if(buf[(w*i+j)*ps+boffset]>2) return 0;
-				if(((_i/8)+(j/8))%2==0)
-				{
-					if(buf[(w*i+j)*ps+roffset]>2) return 0;
-					if(buf[(w*i+j)*ps+goffset]>2) return 0;
-				}
-				else
-				{
-					if(buf[(w*i+j)*ps+roffset]<253) return 0;
-					if(buf[(w*i+j)*ps+goffset]<253) return 0;
-				}
-			}
-		}
-	}
-	return 1;
-}
-
-void writejpeg(unsigned char *jpegbuf, unsigned long jpgbufsize, char *filename)
-{
-	FILE *outfile=NULL;
-	if((outfile=fopen(filename, "wb"))==NULL)
-	{
-		printf("ERROR: Could not open %s for writing.\n", filename);
-		bailout();
-	}
-	if(fwrite(jpegbuf, jpgbufsize, 1, outfile)!=1)
-	{
-		printf("ERROR: Could not write to %s.\n", filename);
-		bailout();
-	}
-
-	finally:
-	if(outfile) fclose(outfile);
-}
-
-void gentestjpeg(tjhandle hnd, unsigned char *jpegbuf, unsigned long *size,
-	int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
-{
-	char tempstr[1024];  unsigned char *bmpbuf=NULL;
-	const char *pixformat;  double t;
-
-	if(flags&TJ_BGR)
-	{
-		if(ps==3) pixformat="BGR";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ABGR";  else pixformat="BGRA";}
-	}
-	else
-	{
-		if(ps==3) pixformat="RGB";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ARGB";  else pixformat="RGBA";}
-	}
-	printf("%s %s -> %s Q%d ... ", pixformat,
-		(flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ", _subnamel[subsamp], qual);
-
-	if((bmpbuf=(unsigned char *)malloc(w*h*ps+1))==NULL)
-	{
-		printf("ERROR: Could not allocate buffer\n");  bailout();
-	}
-	initbuf(bmpbuf, w, h, ps, flags);
-	memset(jpegbuf, 0, TJBUFSIZE(w, h));
-
-	t=rrtime();
-	_catch(tjCompress(hnd, bmpbuf, w, 0, h, ps, jpegbuf, size, subsamp, qual, flags));
-	t=rrtime()-t;
-
-	sprintf(tempstr, "%s_enc_%s_%s_%sQ%d.jpg", basefilename, pixformat,
-		(flags&TJ_BOTTOMUP)? "BU":"TD", _subnames[subsamp], qual);
-	writejpeg(jpegbuf, *size, tempstr);
-	printf("Done.  %f ms\n  Result in %s\n", t*1000., tempstr);
-
-	finally:
-	if(bmpbuf) free(bmpbuf);
-}
-
-void gentestbmp(tjhandle hnd, unsigned char *jpegbuf, unsigned long jpegsize,
-	int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
-{
-	unsigned char *bmpbuf=NULL;
-	const char *pixformat;  int _w=0, _h=0;  double t;
-
-	if(flags&TJ_BGR)
-	{
-		if(ps==3) pixformat="BGR";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ABGR";  else pixformat="BGRA";}
-	}
-	else
-	{
-		if(ps==3) pixformat="RGB";
-		else {if(flags&TJ_ALPHAFIRST) pixformat="ARGB";  else pixformat="RGBA";}
-	}
-	printf("JPEG -> %s %s ... ", pixformat, (flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ");
-
-	_catch(tjDecompressHeader(hnd, jpegbuf, jpegsize, &_w, &_h));
-	if(_w!=w || _h!=h)
-	{
-		printf("Incorrect JPEG header\n");  bailout();
-	}
-
-	if((bmpbuf=(unsigned char *)malloc(w*h*ps+1))==NULL)
-	{
-		printf("ERROR: Could not allocate buffer\n");  bailout();
-	}
-	memset(bmpbuf, 0, w*ps*h);
-
-	t=rrtime();
-	_catch(tjDecompress(hnd, jpegbuf, jpegsize, bmpbuf, w, w*ps, h, ps, flags));
-	t=rrtime()-t;
-
-	if(checkbuf(bmpbuf, w, h, ps, subsamp, flags)) printf("Passed.");
-	else {printf("FAILED!");  dumpbuf(bmpbuf, w, h, ps, flags);}
-
-	printf("  %f ms\n\n", t*1000.);
-
-	finally:
-	if(bmpbuf) free(bmpbuf);
-}
-
-void dotest(int w, int h, int ps, int subsamp, char *basefilename)
-{
-	tjhandle hnd=NULL, dhnd=NULL;  unsigned char *jpegbuf=NULL;
-	unsigned long size;
-
-	if((jpegbuf=(unsigned char *)malloc(TJBUFSIZE(w, h))) == NULL)
-	{
-		puts("ERROR: Could not allocate buffer.");  bailout();
-	}
-
-	if((hnd=tjInitCompress())==NULL)
-		{printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());  bailout();}
-	if((dhnd=tjInitDecompress())==NULL)
-		{printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr());  bailout();}
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, 0);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, 0);
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
-
-	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
-	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
-
-	if(ps==4)
-	{
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
-
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
-
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
-
-		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
-		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
-	}
-
-	finally:
-	if(hnd) tjDestroy(hnd);
-	if(dhnd) tjDestroy(dhnd);
-
-	if(jpegbuf) free(jpegbuf);
-}
-
-#define MAXLENGTH 2048
-
-void dotest1(void)
-{
-	int i, j, i2;  unsigned char *bmpbuf=NULL, *jpgbuf=NULL;
-	tjhandle hnd=NULL;  unsigned long size;
-	if((hnd=tjInitCompress())==NULL)
-		{printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());  bailout();}
-	printf("Buffer size regression test\n");
-	for(j=1; j<48; j++)
-	{
-		for(i=1; i<(j==1?MAXLENGTH:48); i++)
-		{
-			if(i%100==0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", i, j);
-			if((bmpbuf=(unsigned char *)malloc(i*j*4))==NULL
-			|| (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(i, j)))==NULL)
-			{
-				printf("Memory allocation failure\n");  bailout();
-			}
-			memset(bmpbuf, 0, i*j*4);
-			for(i2=0; i2<i*j; i2++)
-			{
-				bmpbuf[i2*4]=pixels[i2%9][2];
-				bmpbuf[i2*4+1]=pixels[i2%9][1];
-				bmpbuf[i2*2+2]=pixels[i2%9][0];
-			}
-			_catch(tjCompress(hnd, bmpbuf, i, i*4, j, 4,
-				jpgbuf, &size, TJ_444, 100, TJ_BGR));
-			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
-
-			if((bmpbuf=(unsigned char *)malloc(j*i*4))==NULL
-			|| (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(j, i)))==NULL)
-			{
-				printf("Memory allocation failure\n");  bailout();
-			}
-			for(i2=0; i2<j*i*4; i2++)
-			{
-				if(i2%2==0) bmpbuf[i2]=0xFF;
-				else bmpbuf[i2]=0;
-			}
-			_catch(tjCompress(hnd, bmpbuf, j, j*4, i, 4,
-				jpgbuf, &size, TJ_444, 100, TJ_BGR));
-			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
-		}
-	}
-	printf("Done.      \n");
-
-	finally:
-	if(bmpbuf) free(bmpbuf);  if(jpgbuf) free(jpgbuf);
-	if(hnd) tjDestroy(hnd);
-}
-
-int main(int argc, char *argv[])
-{
-	dotest(35, 41, 3, TJ_444, "test");
-	dotest(35, 41, 4, TJ_444, "test");
-	dotest(35, 41, 3, TJ_GRAYSCALE, "test");
-	dotest(35, 41, 4, TJ_GRAYSCALE, "test");
-	dotest1();
-
-	return exitstatus;
-}
diff --git a/jpgtest.cxx b/jpgtest.cxx
deleted file mode 100644
index b1c5e1a..0000000
--- a/jpgtest.cxx
+++ /dev/null
@@ -1,392 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005, 2006 Sun Microsystems, Inc.
- * Copyright (C)2009 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "./bmp.h"
-#include "./rrutil.h"
-#include "./rrtimer.h"
-#include "./turbojpeg.h"
-/* If f returns -1, print the failing expression and tjGetErrorStr(), then jump to the enclosing function's bailout label. */
-#define _catch(f) {if((f)==-1) {printf("Error in %s:\n%s\n", #f, tjGetErrorStr());  goto bailout;}}
-/* Command-line switches, then tables indexed by BMPPIXELFORMAT (entry order as named in _pfname) and by subsampling option. */
-int forcemmx=0, forcesse=0, forcesse2=0, forcesse3=0, fastupsample=0;
-const int _ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4};
-const int _flags[BMPPIXELFORMATS]={0, 0, TJ_BGR, TJ_BGR,
-	TJ_BGR|TJ_ALPHAFIRST, TJ_ALPHAFIRST};
-const int _rindex[BMPPIXELFORMATS]={0, 0, 2, 2, 3, 1};
-const int _gindex[BMPPIXELFORMATS]={1, 1, 1, 1, 2, 2};
-const int _bindex[BMPPIXELFORMATS]={2, 2, 0, 0, 1, 3};
-const char *_pfname[]={"RGB", "RGBA", "BGR", "BGRA", "ABGR", "ARGB"};
-const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
-const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
-/* Print val using a printf format derived from log10(val) and figs (the requested figure count); val is fed to log10(), so pass a positive value. */
-void printsigfig(double val, int figs)
-{
-	char format[80];
-	double _l=log10(val);  int l;
-	if(_l<0.)
-	{
-		l=(int)fabs(_l);
-		sprintf(format, "%%%d.%df", figs+l+2, figs+l);
-	}
-	else
-	{
-		l=(int)_l+1;
-		if(figs<=l) sprintf(format, "%%.0f");
-		else sprintf(format, "%%%d.%df", figs+1, figs-l);
-	}	
-	printf(format, val);
-}
-/* Benchmark compression and decompression of srcbuf (w x h pixels, format pf) at one subsampling/quality setting, per tile size when dotile is set; writes the reference JPEG, the decompressed image and an error image named after filename. */
-void dotest(unsigned char *srcbuf, int w, int h, BMPPIXELFORMAT pf, int bu,
-	int jpegsub, int qual, char *filename, int dotile, int useppm, int quiet)
-{
-	char tempstr[1024];
-	FILE *outfile;  tjhandle hnd;
-	unsigned char **jpegbuf=NULL, *rgbbuf=NULL;
-	rrtimer timer; double elapsed;
-	int jpgbufsize=0, i, j, tilesizex, tilesizey, numtilesx, numtilesy, ITER;
-	unsigned long *comptilesize=NULL;
-	int flags=(forcemmx?TJ_FORCEMMX:0)|(forcesse?TJ_FORCESSE:0)
-		|(forcesse2?TJ_FORCESSE2:0)|(forcesse3?TJ_FORCESSE3:0)
-		|(fastupsample?TJ_FASTUPSAMPLE:0);
-	int ps=_ps[pf];
-	int pitch=w*ps;
-	/* Add the pixel-format and row-order flags to the switches selected on the command line. */
-	flags |= _flags[pf];
-	if(bu) flags |= TJ_BOTTOMUP;
-	/* Working image buffer: compressed from, then decompressed into. */
-	if((rgbbuf=(unsigned char *)malloc(pitch*h)) == NULL)
-	{
-		puts("ERROR: Could not allocate image buffer.");
-		exit(1);
-	}
-
-	if(!quiet) printf("\n>>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", _pfname[pf],
-		bu?"Bottom-up":"Top-down", _subnamel[jpegsub], qual);
-	if(dotile) {tilesizex=tilesizey=4;}  else {tilesizex=w;  tilesizey=h;}
-	/* One pass per tile size: the size doubles each pass and is clamped to the image, so the final pass covers the full image. */
-	do
-	{
-		tilesizex*=2;  if(tilesizex>w) tilesizex=w;
-		tilesizey*=2;  if(tilesizey>h) tilesizey=h;
-		numtilesx=(w+tilesizex-1)/tilesizex;
-		numtilesy=(h+tilesizey-1)/tilesizey;
-		if((comptilesize=(unsigned long *)malloc(sizeof(unsigned long)*numtilesx*numtilesy)) == NULL
-		|| (jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)*numtilesx*numtilesy)) == NULL)
-		{
-			puts("ERROR: Could not allocate image buffers.");
-			goto bailout;
-		}
-		memset(jpegbuf, 0, sizeof(unsigned char *)*numtilesx*numtilesy);
-		for(i=0; i<numtilesx*numtilesy; i++)
-		{
-			if((jpegbuf[i]=(unsigned char *)malloc(TJBUFSIZE(tilesizex, tilesizey))) == NULL)
-			{
-				puts("ERROR: Could not allocate image buffers.");
-				goto bailout;
-			}
-		}
-
-		// Compression test: one untimed call on the first tile, then whole-image passes repeated until timer.elapsed() reaches 5
-		if(quiet) printf("%s\t%s\t%s\t%d\t",  _pfname[pf], bu?"BU":"TD",
-			_subnamel[jpegsub], qual);
-		for(i=0; i<h; i++) memcpy(&rgbbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
-		if((hnd=tjInitCompress())==NULL)
-		{
-			printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());
-			goto bailout;
-		}
-		_catch(tjCompress(hnd, rgbbuf, tilesizex, pitch, tilesizey, ps,
-			jpegbuf[0], &comptilesize[0], jpegsub, qual, flags));
-		ITER=0;
-		timer.start();
-		do
-		{
-			jpgbufsize=0;  int tilen=0;
-			for(i=0; i<h; i+=tilesizey)
-			{
-				for(j=0; j<w; j+=tilesizex)
-				{
-					int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
-					_catch(tjCompress(hnd, &rgbbuf[pitch*i+j*ps], tempw, pitch,
-						temph, ps, jpegbuf[tilen], &comptilesize[tilen], jpegsub, qual,
-						flags));
-					jpgbufsize+=comptilesize[tilen];
-					tilen++;
-				}
-			}
-			ITER++;
-		} while((elapsed=timer.elapsed())<5.);
-		_catch(tjDestroy(hnd));
-		if(quiet)
-		{
-			if(tilesizex==w && tilesizey==h) printf("Full     \t");
-			else printf("%-4d %-4d\t", tilesizex, tilesizey);
-			printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
-			printf("\t");
-			printsigfig((double)(w*h*ps)/(double)jpgbufsize, 4);
-			printf("\t");
-		}
-		else
-		{
-			if(tilesizex==w && tilesizey==h) printf("\nFull image\n");
-			else printf("\nTile size: %d x %d\n", tilesizex, tilesizey);
-			printf("C--> Frame rate:           %f fps\n", (double)ITER/elapsed);
-			printf("     Output image size:    %d bytes\n", jpgbufsize);
-			printf("     Compression ratio:    %f:1\n",
-				(double)(w*h*ps)/(double)jpgbufsize);
-			printf("     Source throughput:    %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)ITER/elapsed);
-			printf("     Output bit stream:    %f Megabits/sec\n",
-				(double)jpgbufsize*8./1000000.*(double)ITER/elapsed);
-		}
-		if(tilesizex==w && tilesizey==h)
-		{
-			sprintf(tempstr, "%s_%sQ%d.jpg", filename, _subnames[jpegsub], qual);
-			if((outfile=fopen(tempstr, "wb"))==NULL)
-			{
-				puts("ERROR: Could not open reference image");
-				exit(1);
-			}
-			if(fwrite(jpegbuf[0], jpgbufsize, 1, outfile)!=1)
-			{
-				puts("ERROR: Could not write reference image");
-				exit(1);
-			}
-			fclose(outfile);
-			if(!quiet) printf("Reference image written to %s\n", tempstr);
-		}
-
-		// Decompression test: same structure as above, decoding every tile back into rgbbuf
-		memset(rgbbuf, 127, pitch*h);  // Grey image means decompressor did nothing
-		if((hnd=tjInitDecompress())==NULL)
-		{
-			printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr());
-			goto bailout;
-		}
-		_catch(tjDecompress(hnd, jpegbuf[0], jpgbufsize, rgbbuf, tilesizex, pitch,
-			tilesizey, ps, flags));
-		ITER=0;
-		timer.start();
-		do
-		{
-			int tilen=0;
-			for(i=0; i<h; i+=tilesizey)
-			{
-				for(j=0; j<w; j+=tilesizex)
-				{
-					int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
-					_catch(tjDecompress(hnd, jpegbuf[tilen], comptilesize[tilen],
-						&rgbbuf[pitch*i+ps*j], tempw, pitch, temph, ps, flags));
-					tilen++;
-				}
-			}
-			ITER++;
-		}	while((elapsed=timer.elapsed())<5.);
-		_catch(tjDestroy(hnd));
-		if(quiet)
-		{
-			printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
-			printf("\n");
-		}
-		else
-		{
-			printf("D--> Frame rate:           %f fps\n", (double)ITER/elapsed);
-			printf("     Dest. throughput:     %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)ITER/elapsed);
-		}
-		if(tilesizex==w && tilesizey==h)
-			sprintf(tempstr, "%s_%sQ%d_full.%s", filename, _subnames[jpegsub], qual,
-				useppm?"ppm":"bmp");
-		else sprintf(tempstr, "%s_%sQ%d_%dx%d.%s", filename, _subnames[jpegsub],
-			qual, tilesizex, tilesizey, useppm?"ppm":"bmp");
-		if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
-		{
-			printf("ERROR saving bitmap: %s\n", bmpgeterr());
-			goto bailout;
-		}
-		sprintf(strrchr(tempstr, '.'), "-err.%s", useppm?"ppm":"bmp");
-		if(!quiet)
-			printf("Computing compression error and saving to %s.\n", tempstr);
-		if(jpegsub==TJ_GRAYSCALE)
-		{
-			for(j=0; j<h; j++)
-			{
-				for(i=0; i<w*ps; i+=ps)
-				{
-					int y=(int)((double)srcbuf[w*ps*j+i+_rindex[pf]]*0.299
-						+ (double)srcbuf[w*ps*j+i+_gindex[pf]]*0.587
-						+ (double)srcbuf[w*ps*j+i+_bindex[pf]]*0.114 + 0.5);
-					if(y>255) y=255;  if(y<0) y=0;
-					rgbbuf[pitch*j+i+_rindex[pf]]=abs(rgbbuf[pitch*j+i+_rindex[pf]]-y);
-					rgbbuf[pitch*j+i+_gindex[pf]]=abs(rgbbuf[pitch*j+i+_gindex[pf]]-y);
-					rgbbuf[pitch*j+i+_bindex[pf]]=abs(rgbbuf[pitch*j+i+_bindex[pf]]-y);
-				}
-			}
-		}		
-		else
-		{
-			for(j=0; j<h; j++) for(i=0; i<w*ps; i++)
-				rgbbuf[pitch*j+i]=abs(rgbbuf[pitch*j+i]-srcbuf[w*ps*j+i]);
-		}
-		if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
-		{
-			printf("ERROR saving bitmap: %s\n", bmpgeterr());
-			goto bailout;
-		}
-
-		// Cleanup: free this pass's per-tile buffers before moving to the next tile size
-		if(jpegbuf)
-		{
-			for(i=0; i<numtilesx*numtilesy; i++)
-				{if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;}
-			free(jpegbuf);  jpegbuf=NULL;
-		}
-		if(comptilesize) {free(comptilesize);  comptilesize=NULL;}
-	} while(tilesizex<w || tilesizey<h);
-
-	if(rgbbuf) {free(rgbbuf);  rgbbuf=NULL;}
-	return;
-	/* Error exit: frees the buffers.  NOTE(review): hnd is not destroyed on this path -- confirm whether that leak matters. */
-	bailout:
-	if(jpegbuf)
-	{
-		for(i=0; i<numtilesx*numtilesy; i++)
-			{if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;}
-		free(jpegbuf);  jpegbuf=NULL;
-	}
-	if(comptilesize) {free(comptilesize);  comptilesize=NULL;}
-	if(rgbbuf) {free(rgbbuf);  rgbbuf=NULL;}
-	return;
-}
-
-/* Benchmark driver: parses <Inputfile> <Quality[-HiQuality]> plus option switches, loads the image, then calls dotest() for every quality and subsampling mode. */
-int main(int argc, char *argv[])
-{
-	unsigned char *bmpbuf=NULL;  int w, h, i, useppm=0;
-	int qual, dotile=0, quiet=0, hiqual=-1;  char *temp;
-	BMPPIXELFORMAT pf=BMP_BGR;
-	int bu=0;
-
-	printf("\n");
-
-	if(argc<3)
-	{
-		printf("USAGE: %s <Inputfile (BMP|PPM)> <%% Quality>\n\n", argv[0]);
-		printf("       [-tile]\n");
-		printf("       Test performance of the codec when the image is encoded\n");
-		printf("       as separate tiles of varying sizes.\n\n");
-		printf("       [-forcemmx] [-forcesse] [-forcesse2] [-forcesse3]\n");
-		printf("       Force MMX, SSE, or SSE2 code paths in Intel codec\n\n");
-		printf("       [-rgb | -bgr | -rgba | -bgra | -abgr | -argb]\n");
-		printf("       Test the specified color conversion path in the codec (default: BGR)\n\n");
-		printf("       [-fastupsample]\n");
-		printf("       Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
-		printf("       YUV decoding in libjpeg decompressor\n\n");
-		printf("       [-quiet]\n");
-		printf("       Output in tabular rather than verbose format\n\n");
-		printf("       NOTE: If the quality is specified as a range, i.e. 90-100, a separate\n");
-		printf("       test will be performed for all quality values in the range.\n");
-		exit(1);
-	}
-	if((qual=atoi(argv[2]))<1 || qual>100)
-	{
-		puts("ERROR: Quality must be between 1 and 100.");
-		exit(1);
-	}
-	if((temp=strchr(argv[2], '-'))!=NULL && strlen(temp)>1
-		&& sscanf(&temp[1], "%d", &hiqual)==1 && hiqual>qual && hiqual>=1
-		&& hiqual<=100) {}
-	else hiqual=qual;
-	/* Remaining arguments are option switches matched with stricmp(); -bottomup is accepted here although the usage text does not list it. */
-	if(argc>3)
-	{
-		for(i=3; i<argc; i++)
-		{
-			if(!stricmp(argv[i], "-tile")) dotile=1;
-			if(!stricmp(argv[i], "-forcesse3"))
-			{
-				printf("Using SSE3 code\n");
-				forcesse3=1;
-			}
-			if(!stricmp(argv[i], "-forcesse2"))
-			{
-				printf("Using SSE2 code\n");
-				forcesse2=1;
-			}
-			if(!stricmp(argv[i], "-forcesse"))
-			{
-				printf("Using SSE code\n");
-				forcesse=1;
-			}
-			if(!stricmp(argv[i], "-forcemmx"))
-			{
-				printf("Using MMX code\n");
-				forcemmx=1;
-			}
-			if(!stricmp(argv[i], "-fastupsample"))
-			{
-				printf("Using fast upsampling code\n");
-				fastupsample=1;
-			}
-			if(!stricmp(argv[i], "-rgb")) pf=BMP_RGB;
-			if(!stricmp(argv[i], "-rgba")) pf=BMP_RGBA;
-			if(!stricmp(argv[i], "-bgr")) pf=BMP_BGR;
-			if(!stricmp(argv[i], "-bgra")) pf=BMP_BGRA;
-			if(!stricmp(argv[i], "-abgr")) pf=BMP_ABGR;
-			if(!stricmp(argv[i], "-argb")) pf=BMP_ARGB;
-			if(!stricmp(argv[i], "-bottomup")) bu=1;
-			if(!stricmp(argv[i], "-quiet")) quiet=1;
-		}
-	}
-
-	if(loadbmp(argv[1], &bmpbuf, &w, &h, pf, 1, bu)==-1)
-	{
-		printf("ERROR loading bitmap: %s\n", bmpgeterr());  exit(1);
-	}
-	/* Strip the extension from argv[1] in place; what remains is passed to dotest() as the base name for output files. */
-	temp=strrchr(argv[1], '.');
-	if(temp!=NULL)
-	{
-		if(!stricmp(temp, ".ppm")) useppm=1;
-		*temp='\0';
-	}
-
-	if(quiet)
-	{
-		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap\tBitmap\tJPEG\tJPEG\tTile Size\tCompr\tCompr\tDecomp\n");
-		printf("Format\tOrder\tFormat\tQual\t X    Y  \tPerf \tRatio\tPerf\n\n");
-	}
-	/* For each subsampling mode in turn, run every quality from hiqual down to qual. */
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_GRAYSCALE, i, argv[1], dotile, useppm, quiet);
-	if(quiet) printf("\n");
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_420, i, argv[1], dotile, useppm, quiet);
-	if(quiet) printf("\n");
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_422, i, argv[1], dotile, useppm, quiet);
-	if(quiet) printf("\n");
-	for(i=hiqual; i>=qual; i--)
-		dotest(bmpbuf, w, h, pf, bu, TJ_444, i, argv[1], dotile, useppm, quiet);
-
-	if(bmpbuf) free(bmpbuf);
-	return 0;
-}
diff --git a/jquant1.c b/jquant1.c
index aa2c59a..e781481 100644
--- a/jquant1.c
+++ b/jquant1.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, D. R. Commander
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009, 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains 1-pass color quantization (color mapping) routines.
  * These routines provide mapping to a fixed color map using equally spaced
@@ -70,9 +71,9 @@
  * table in both directions.
  */
 
-#define ODITHER_SIZE  16	/* dimension of dither matrix */
+#define ODITHER_SIZE  16        /* dimension of dither matrix */
 /* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */
-#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE)	/* # cells in matrix */
+#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE)       /* # cells in matrix */
 #define ODITHER_MASK  (ODITHER_SIZE-1) /* mask for wrapping around counters */
 
 typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE];
@@ -107,8 +108,8 @@
  * Errors are accumulated into the array fserrors[], at a resolution of
  * 1/16th of a pixel count.  The error at a given pixel is propagated
  * to its not-yet-processed neighbors using the standard F-S fractions,
- *		...	(here)	7/16
- *		3/16	5/16	1/16
+ *              ...     (here)  7/16
+ *              3/16    5/16    1/16
  * We work left-to-right on even rows, right-to-left on odd rows.
  *
  * We can get away with a single array (holding one row's worth of errors)
@@ -121,52 +122,49 @@
  * The fserrors[] array is indexed [component#][position].
  * We provide (#columns + 2) entries per component; the extra entry at each
  * end saves us from special-casing the first and last pixels.
- *
- * Note: on a wide image, we might not have enough room in a PC's near data
- * segment to hold the error array; so it is allocated with alloc_large.
  */
 
 #if BITS_IN_JSAMPLE == 8
-typedef INT16 FSERROR;		/* 16 bits should be enough */
-typedef int LOCFSERROR;		/* use 'int' for calculation temps */
+typedef INT16 FSERROR;          /* 16 bits should be enough */
+typedef int LOCFSERROR;         /* use 'int' for calculation temps */
 #else
-typedef INT32 FSERROR;		/* may need more than 16 bits */
-typedef INT32 LOCFSERROR;	/* be sure calculation temps are big enough */
+typedef JLONG FSERROR;          /* may need more than 16 bits */
+typedef JLONG LOCFSERROR;       /* be sure calculation temps are big enough */
 #endif
 
-typedef FSERROR FAR *FSERRPTR;	/* pointer to error array (in FAR storage!) */
+typedef FSERROR *FSERRPTR;  /* pointer to error array */
 
 
 /* Private subobject */
 
-#define MAX_Q_COMPS 4		/* max components I can handle */
+#define MAX_Q_COMPS 4           /* max components I can handle */
 
 typedef struct {
   struct jpeg_color_quantizer pub; /* public fields */
 
   /* Initially allocated colormap is saved here */
-  JSAMPARRAY sv_colormap;	/* The color map as a 2-D pixel array */
-  int sv_actual;		/* number of entries in use */
+  JSAMPARRAY sv_colormap;       /* The color map as a 2-D pixel array */
+  int sv_actual;                /* number of entries in use */
 
-  JSAMPARRAY colorindex;	/* Precomputed mapping for speed */
+  JSAMPARRAY colorindex;        /* Precomputed mapping for speed */
   /* colorindex[i][j] = index of color closest to pixel value j in component i,
    * premultiplied as described above.  Since colormap indexes must fit into
    * JSAMPLEs, the entries of this array will too.
    */
-  boolean is_padded;		/* is the colorindex padded for odither? */
+  boolean is_padded;            /* is the colorindex padded for odither? */
 
-  int Ncolors[MAX_Q_COMPS];	/* # of values alloced to each component */
+  int Ncolors[MAX_Q_COMPS];     /* # of values alloced to each component */
 
   /* Variables for ordered dithering */
-  int row_index;		/* cur row's vertical index in dither matrix */
+  int row_index;                /* cur row's vertical index in dither matrix */
   ODITHER_MATRIX_PTR odither[MAX_Q_COMPS]; /* one dither array per component */
 
   /* Variables for Floyd-Steinberg dithering */
   FSERRPTR fserrors[MAX_Q_COMPS]; /* accumulated errors */
-  boolean on_odd_row;		/* flag to remember which row we are on */
+  boolean on_odd_row;           /* flag to remember which row we are on */
 } my_cquantizer;
 
-typedef my_cquantizer * my_cquantize_ptr;
+typedef my_cquantizer *my_cquantize_ptr;
 
 
 /*
@@ -205,11 +203,11 @@
   iroot = 1;
   do {
     iroot++;
-    temp = iroot;		/* set temp = iroot ** nc */
+    temp = iroot;               /* set temp = iroot ** nc */
     for (i = 1; i < nc; i++)
       temp *= iroot;
   } while (temp <= (long) max_colors); /* repeat till iroot exceeds root */
-  iroot--;			/* now iroot = floor(root) */
+  iroot--;                      /* now iroot = floor(root) */
 
   /* Must have at least 2 color values per component */
   if (iroot < 2)
@@ -233,10 +231,10 @@
       j = (cinfo->out_color_space == JCS_RGB ? RGB_order[i] : i);
       /* calculate new total_colors if Ncolors[j] is incremented */
       temp = total_colors / Ncolors[j];
-      temp *= Ncolors[j]+1;	/* done in long arith to avoid oflo */
+      temp *= Ncolors[j]+1;     /* done in long arith to avoid oflo */
       if (temp > (long) max_colors)
-	break;			/* won't fit, done with this pass */
-      Ncolors[j]++;		/* OK, apply the increment */
+        break;                  /* won't fit, done with this pass */
+      Ncolors[j]++;             /* OK, apply the increment */
       total_colors = (int) temp;
       changed = TRUE;
     }
@@ -256,7 +254,7 @@
    * (Forcing the upper and lower values to the limits ensures that
    * dithering can't produce a color outside the selected gamut.)
    */
-  return (int) (((INT32) j * MAXJSAMPLE + maxj/2) / maxj);
+  return (int) (((JLONG) j * MAXJSAMPLE + maxj/2) / maxj);
 }
 
 
@@ -266,7 +264,7 @@
 /* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */
 {
   /* Breakpoints are halfway between values returned by output_value */
-  return (int) (((INT32) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
+  return (int) (((JLONG) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
 }
 
 
@@ -278,8 +276,8 @@
 create_colormap (j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
-  JSAMPARRAY colormap;		/* Created colormap */
-  int total_colors;		/* Number of distinct output colors */
+  JSAMPARRAY colormap;          /* Created colormap */
+  int total_colors;             /* Number of distinct output colors */
   int i,j,k, nci, blksize, blkdist, ptr, val;
 
   /* Select number of colors for each component */
@@ -288,8 +286,8 @@
   /* Report selected color counts */
   if (cinfo->out_color_components == 3)
     TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS,
-	     total_colors, cquantize->Ncolors[0],
-	     cquantize->Ncolors[1], cquantize->Ncolors[2]);
+             total_colors, cquantize->Ncolors[0],
+             cquantize->Ncolors[1], cquantize->Ncolors[2]);
   else
     TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, total_colors);
 
@@ -314,12 +312,12 @@
       val = output_value(cinfo, i, j, nci-1);
       /* Fill in all colormap entries that have this value of this component */
       for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) {
-	/* fill in blksize entries beginning at ptr */
-	for (k = 0; k < blksize; k++)
-	  colormap[i][ptr+k] = (JSAMPLE) val;
+        /* fill in blksize entries beginning at ptr */
+        for (k = 0; k < blksize; k++)
+          colormap[i][ptr+k] = (JSAMPLE) val;
       }
     }
-    blkdist = blksize;		/* blksize of this color is blkdist of next */
+    blkdist = blksize;          /* blksize of this color is blkdist of next */
   }
 
   /* Save the colormap in private storage,
@@ -377,16 +375,16 @@
     val = 0;
     k = largest_input_value(cinfo, i, 0, nci-1);
     for (j = 0; j <= MAXJSAMPLE; j++) {
-      while (j > k)		/* advance val if past boundary */
-	k = largest_input_value(cinfo, i, ++val, nci-1);
+      while (j > k)             /* advance val if past boundary */
+        k = largest_input_value(cinfo, i, ++val, nci-1);
       /* premultiply so that no multiplication needed in main processing */
       indexptr[j] = (JSAMPLE) (val * blksize);
     }
     /* Pad at both ends if necessary */
     if (pad)
       for (j = 1; j <= MAXJSAMPLE; j++) {
-	indexptr[-j] = indexptr[0];
-	indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE];
+        indexptr[-j] = indexptr[0];
+        indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE];
       }
   }
 }
@@ -402,21 +400,21 @@
 {
   ODITHER_MATRIX_PTR odither;
   int j,k;
-  INT32 num,den;
+  JLONG num,den;
 
   odither = (ODITHER_MATRIX_PTR)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(ODITHER_MATRIX));
+                                sizeof(ODITHER_MATRIX));
   /* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1).
    * Hence the dither value for the matrix cell with fill order f
    * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
    * On 16-bit-int machine, be careful to avoid overflow.
    */
-  den = 2 * ODITHER_CELLS * ((INT32) (ncolors - 1));
+  den = 2 * ODITHER_CELLS * ((JLONG) (ncolors - 1));
   for (j = 0; j < ODITHER_SIZE; j++) {
     for (k = 0; k < ODITHER_SIZE; k++) {
-      num = ((INT32) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
-	    * MAXJSAMPLE;
+      num = ((JLONG) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
+            * MAXJSAMPLE;
       /* Ensure round towards zero despite C's lack of consistency
        * about rounding negative values in integer division...
        */
@@ -429,7 +427,7 @@
 
 /*
  * Create the ordered-dither tables.
- * Components having the same number of representative colors may 
+ * Components having the same number of representative colors may
  * share a dither table.
  */
 
@@ -442,14 +440,14 @@
 
   for (i = 0; i < cinfo->out_color_components; i++) {
     nci = cquantize->Ncolors[i]; /* # of distinct values for this color */
-    odither = NULL;		/* search for matching prior component */
+    odither = NULL;             /* search for matching prior component */
     for (j = 0; j < i; j++) {
       if (nci == cquantize->Ncolors[j]) {
-	odither = cquantize->odither[j];
-	break;
+        odither = cquantize->odither[j];
+        break;
       }
     }
-    if (odither == NULL)	/* need a new table? */
+    if (odither == NULL)        /* need a new table? */
       odither = make_odither_array(cinfo, nci);
     cquantize->odither[i] = odither;
   }
@@ -462,7 +460,7 @@
 
 METHODDEF(void)
 color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-		JSAMPARRAY output_buf, int num_rows)
+                JSAMPARRAY output_buf, int num_rows)
 /* General case, no dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
@@ -480,7 +478,7 @@
     for (col = width; col > 0; col--) {
       pixcode = 0;
       for (ci = 0; ci < nc; ci++) {
-	pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+        pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
       }
       *ptrout++ = (JSAMPLE) pixcode;
     }
@@ -490,7 +488,7 @@
 
 METHODDEF(void)
 color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-		 JSAMPARRAY output_buf, int num_rows)
+                 JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, no dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
@@ -518,15 +516,15 @@
 
 METHODDEF(void)
 quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-		     JSAMPARRAY output_buf, int num_rows)
+                     JSAMPARRAY output_buf, int num_rows)
 /* General case, with ordered dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
   register JSAMPROW input_ptr;
   register JSAMPROW output_ptr;
   JSAMPROW colorindex_ci;
-  int * dither;			/* points to active row of dither matrix */
-  int row_index, col_index;	/* current indexes into dither matrix */
+  int *dither;                  /* points to active row of dither matrix */
+  int row_index, col_index;     /* current indexes into dither matrix */
   int nc = cinfo->out_color_components;
   int ci;
   int row;
@@ -535,8 +533,7 @@
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void FAR *) output_buf[row],
-	      (size_t) (width * SIZEOF(JSAMPLE)));
+    jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
     row_index = cquantize->row_index;
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
@@ -546,17 +543,17 @@
       col_index = 0;
 
       for (col = width; col > 0; col--) {
-	/* Form pixel value + dither, range-limit to 0..MAXJSAMPLE,
-	 * select output value, accumulate into output code for this pixel.
-	 * Range-limiting need not be done explicitly, as we have extended
-	 * the colorindex table to produce the right answers for out-of-range
-	 * inputs.  The maximum dither is +- MAXJSAMPLE; this sets the
-	 * required amount of padding.
-	 */
-	*output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]];
-	input_ptr += nc;
-	output_ptr++;
-	col_index = (col_index + 1) & ODITHER_MASK;
+        /* Form pixel value + dither, range-limit to 0..MAXJSAMPLE,
+         * select output value, accumulate into output code for this pixel.
+         * Range-limiting need not be done explicitly, as we have extended
+         * the colorindex table to produce the right answers for out-of-range
+         * inputs.  The maximum dither is +- MAXJSAMPLE; this sets the
+         * required amount of padding.
+         */
+        *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]];
+        input_ptr += nc;
+        output_ptr++;
+        col_index = (col_index + 1) & ODITHER_MASK;
       }
     }
     /* Advance row index for next row */
@@ -568,7 +565,7 @@
 
 METHODDEF(void)
 quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-		      JSAMPARRAY output_buf, int num_rows)
+                      JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, with ordered dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
@@ -578,10 +575,10 @@
   JSAMPROW colorindex0 = cquantize->colorindex[0];
   JSAMPROW colorindex1 = cquantize->colorindex[1];
   JSAMPROW colorindex2 = cquantize->colorindex[2];
-  int * dither0;		/* points to active row of dither matrix */
-  int * dither1;
-  int * dither2;
-  int row_index, col_index;	/* current indexes into dither matrix */
+  int *dither0;                 /* points to active row of dither matrix */
+  int *dither1;
+  int *dither2;
+  int row_index, col_index;     /* current indexes into dither matrix */
   int row;
   JDIMENSION col;
   JDIMENSION width = cinfo->output_width;
@@ -597,11 +594,11 @@
 
     for (col = width; col > 0; col--) {
       pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) +
-					dither0[col_index]]);
+                                        dither0[col_index]]);
       pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) +
-					dither1[col_index]]);
+                                        dither1[col_index]]);
       pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) +
-					dither2[col_index]]);
+                                        dither2[col_index]]);
       *output_ptr++ = (JSAMPLE) pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
@@ -613,24 +610,24 @@
 
 METHODDEF(void)
 quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-		    JSAMPARRAY output_buf, int num_rows)
+                    JSAMPARRAY output_buf, int num_rows)
 /* General case, with Floyd-Steinberg dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
-  register LOCFSERROR cur;	/* current error or pixel value */
-  LOCFSERROR belowerr;		/* error for pixel below cur */
-  LOCFSERROR bpreverr;		/* error for below/prev col */
-  LOCFSERROR bnexterr;		/* error for below/next col */
+  register LOCFSERROR cur;      /* current error or pixel value */
+  LOCFSERROR belowerr;          /* error for pixel below cur */
+  LOCFSERROR bpreverr;          /* error for below/prev col */
+  LOCFSERROR bnexterr;          /* error for below/next col */
   LOCFSERROR delta;
-  register FSERRPTR errorptr;	/* => fserrors[] at column before current */
+  register FSERRPTR errorptr;   /* => fserrors[] at column before current */
   register JSAMPROW input_ptr;
   register JSAMPROW output_ptr;
   JSAMPROW colorindex_ci;
   JSAMPROW colormap_ci;
   int pixcode;
   int nc = cinfo->out_color_components;
-  int dir;			/* 1 for left-to-right, -1 for right-to-left */
-  int dirnc;			/* dir * nc */
+  int dir;                      /* 1 for left-to-right, -1 for right-to-left */
+  int dirnc;                    /* dir * nc */
   int ci;
   int row;
   JDIMENSION col;
@@ -640,23 +637,22 @@
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void FAR *) output_buf[row],
-	      (size_t) (width * SIZEOF(JSAMPLE)));
+    jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
       output_ptr = output_buf[row];
       if (cquantize->on_odd_row) {
-	/* work right to left in this row */
-	input_ptr += (width-1) * nc; /* so point to rightmost pixel */
-	output_ptr += width-1;
-	dir = -1;
-	dirnc = -nc;
-	errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */
+        /* work right to left in this row */
+        input_ptr += (width-1) * nc; /* so point to rightmost pixel */
+        output_ptr += width-1;
+        dir = -1;
+        dirnc = -nc;
+        errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */
       } else {
-	/* work left to right in this row */
-	dir = 1;
-	dirnc = nc;
-	errorptr = cquantize->fserrors[ci]; /* => entry before first column */
+        /* work left to right in this row */
+        dir = 1;
+        dirnc = nc;
+        errorptr = cquantize->fserrors[ci]; /* => entry before first column */
       }
       colorindex_ci = cquantize->colorindex[ci];
       colormap_ci = cquantize->sv_colormap[ci];
@@ -666,47 +662,47 @@
       belowerr = bpreverr = 0;
 
       for (col = width; col > 0; col--) {
-	/* cur holds the error propagated from the previous pixel on the
-	 * current line.  Add the error propagated from the previous line
-	 * to form the complete error correction term for this pixel, and
-	 * round the error term (which is expressed * 16) to an integer.
-	 * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct
-	 * for either sign of the error value.
-	 * Note: errorptr points to *previous* column's array entry.
-	 */
-	cur = RIGHT_SHIFT(cur + errorptr[dir] + 8, 4);
-	/* Form pixel value + error, and range-limit to 0..MAXJSAMPLE.
-	 * The maximum error is +- MAXJSAMPLE; this sets the required size
-	 * of the range_limit array.
-	 */
-	cur += GETJSAMPLE(*input_ptr);
-	cur = GETJSAMPLE(range_limit[cur]);
-	/* Select output value, accumulate into output code for this pixel */
-	pixcode = GETJSAMPLE(colorindex_ci[cur]);
-	*output_ptr += (JSAMPLE) pixcode;
-	/* Compute actual representation error at this pixel */
-	/* Note: we can do this even though we don't have the final */
-	/* pixel code, because the colormap is orthogonal. */
-	cur -= GETJSAMPLE(colormap_ci[pixcode]);
-	/* Compute error fractions to be propagated to adjacent pixels.
-	 * Add these into the running sums, and simultaneously shift the
-	 * next-line error sums left by 1 column.
-	 */
-	bnexterr = cur;
-	delta = cur * 2;
-	cur += delta;		/* form error * 3 */
-	errorptr[0] = (FSERROR) (bpreverr + cur);
-	cur += delta;		/* form error * 5 */
-	bpreverr = belowerr + cur;
-	belowerr = bnexterr;
-	cur += delta;		/* form error * 7 */
-	/* At this point cur contains the 7/16 error value to be propagated
-	 * to the next pixel on the current line, and all the errors for the
-	 * next line have been shifted over. We are therefore ready to move on.
-	 */
-	input_ptr += dirnc;	/* advance input ptr to next column */
-	output_ptr += dir;	/* advance output ptr to next column */
-	errorptr += dir;	/* advance errorptr to current column */
+        /* cur holds the error propagated from the previous pixel on the
+         * current line.  Add the error propagated from the previous line
+         * to form the complete error correction term for this pixel, and
+         * round the error term (which is expressed * 16) to an integer.
+         * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct
+         * for either sign of the error value.
+         * Note: errorptr points to *previous* column's array entry.
+         */
+        cur = RIGHT_SHIFT(cur + errorptr[dir] + 8, 4);
+        /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE.
+         * The maximum error is +- MAXJSAMPLE; this sets the required size
+         * of the range_limit array.
+         */
+        cur += GETJSAMPLE(*input_ptr);
+        cur = GETJSAMPLE(range_limit[cur]);
+        /* Select output value, accumulate into output code for this pixel */
+        pixcode = GETJSAMPLE(colorindex_ci[cur]);
+        *output_ptr += (JSAMPLE) pixcode;
+        /* Compute actual representation error at this pixel */
+        /* Note: we can do this even though we don't have the final */
+        /* pixel code, because the colormap is orthogonal. */
+        cur -= GETJSAMPLE(colormap_ci[pixcode]);
+        /* Compute error fractions to be propagated to adjacent pixels.
+         * Add these into the running sums, and simultaneously shift the
+         * next-line error sums left by 1 column.
+         */
+        bnexterr = cur;
+        delta = cur * 2;
+        cur += delta;           /* form error * 3 */
+        errorptr[0] = (FSERROR) (bpreverr + cur);
+        cur += delta;           /* form error * 5 */
+        bpreverr = belowerr + cur;
+        belowerr = bnexterr;
+        cur += delta;           /* form error * 7 */
+        /* At this point cur contains the 7/16 error value to be propagated
+         * to the next pixel on the current line, and all the errors for the
+         * next line have been shifted over. We are therefore ready to move on.
+         */
+        input_ptr += dirnc;     /* advance input ptr to next column */
+        output_ptr += dir;      /* advance output ptr to next column */
+        errorptr += dir;        /* advance errorptr to current column */
       }
       /* Post-loop cleanup: we must unload the final error value into the
        * final fserrors[] entry.  Note we need not unload belowerr because
@@ -730,7 +726,7 @@
   size_t arraysize;
   int i;
 
-  arraysize = (size_t) ((cinfo->output_width + 2) * SIZEOF(FSERROR));
+  arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
   for (i = 0; i < cinfo->out_color_components; i++) {
     cquantize->fserrors[i] = (FSERRPTR)
       (*cinfo->mem->alloc_large)((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
@@ -766,7 +762,7 @@
       cquantize->pub.color_quantize = quantize3_ord_dither;
     else
       cquantize->pub.color_quantize = quantize_ord_dither;
-    cquantize->row_index = 0;	/* initialize state for ordered dither */
+    cquantize->row_index = 0;   /* initialize state for ordered dither */
     /* If user changed to ordered dither from another mode,
      * we must recreate the color index table with padding.
      * This will cost extra space, but probably isn't very likely.
@@ -784,9 +780,9 @@
     if (cquantize->fserrors[0] == NULL)
       alloc_fs_workspace(cinfo);
     /* Initialize the propagated errors to zero. */
-    arraysize = (size_t) ((cinfo->output_width + 2) * SIZEOF(FSERROR));
+    arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
     for (i = 0; i < cinfo->out_color_components; i++)
-      jzero_far((void FAR *) cquantize->fserrors[i], arraysize);
+      jzero_far((void *) cquantize->fserrors[i], arraysize);
     break;
   default:
     ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -829,13 +825,13 @@
 
   cquantize = (my_cquantize_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_cquantizer));
+                                sizeof(my_cquantizer));
   cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
   cquantize->pub.start_pass = start_pass_1_quant;
   cquantize->pub.finish_pass = finish_pass_1_quant;
   cquantize->pub.new_color_map = new_color_map_1_quant;
   cquantize->fserrors[0] = NULL; /* Flag FS workspace not allocated */
-  cquantize->odither[0] = NULL;	/* Also flag odither arrays not allocated */
+  cquantize->odither[0] = NULL; /* Also flag odither arrays not allocated */
 
   /* Make sure my internal arrays won't overflow */
   if (cinfo->out_color_components > MAX_Q_COMPS)
@@ -849,10 +845,10 @@
   create_colorindex(cinfo);
 
   /* Allocate Floyd-Steinberg workspace now if requested.
-   * We do this now since it is FAR storage and may affect the memory
-   * manager's space calculations.  If the user changes to FS dither
-   * mode in a later pass, we will allocate the space then, and will
-   * possibly overrun the max_memory_to_use setting.
+   * We do this now since it may affect the memory manager's space
+   * calculations.  If the user changes to FS dither mode in a later pass, we
+   * will allocate the space then, and will possibly overrun the
+   * max_memory_to_use setting.
    */
   if (cinfo->dither_mode == JDITHER_FS)
     alloc_fs_workspace(cinfo);
diff --git a/jquant2.c b/jquant2.c
index 9b060e5..cfbd0f1 100644
--- a/jquant2.c
+++ b/jquant2.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2009, 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains 2-pass color quantization (color mapping) routines.
  * These routines provide selection of a custom color map for an image,
@@ -43,7 +44,7 @@
  * color space, and repeatedly splits the "largest" remaining box until we
  * have as many boxes as desired colors.  Then the mean color in each
  * remaining box becomes one of the possible output colors.
- * 
+ *
  * The second pass over the image maps each input pixel to the closest output
  * color (optionally after applying a Floyd-Steinberg dithering correction).
  * This mapping is logically trivial, but making it go fast enough requires
@@ -72,9 +73,9 @@
  * probably need to change these scale factors.
  */
 
-#define R_SCALE 2		/* scale R distances by this much */
-#define G_SCALE 3		/* scale G distances by this much */
-#define B_SCALE 1		/* and B by this much */
+#define R_SCALE 2               /* scale R distances by this much */
+#define G_SCALE 3               /* scale G distances by this much */
+#define B_SCALE 1               /* and B by this much */
 
 static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
 #define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
@@ -102,9 +103,7 @@
  * machines, we can't just allocate the histogram in one chunk.  Instead
  * of a true 3-D array, we use a row of pointers to 2-D arrays.  Each
  * pointer corresponds to a C0 value (typically 2^5 = 32 pointers) and
- * each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.  Note that
- * on 80x86 machines, the pointer row is in near memory but the actual
- * arrays are in far memory (same arrangement as we use for image arrays).
+ * each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.
  */
 
 #define MAXNUMCOLORS  (MAXJSAMPLE+1) /* maximum size of colormap */
@@ -112,9 +111,9 @@
 /* These will do the right thing for either R,G,B or B,G,R color order,
  * but you may not like the results for other color orders.
  */
-#define HIST_C0_BITS  5		/* bits of precision in R/B histogram */
-#define HIST_C1_BITS  6		/* bits of precision in G histogram */
-#define HIST_C2_BITS  5		/* bits of precision in B/R histogram */
+#define HIST_C0_BITS  5         /* bits of precision in R/B histogram */
+#define HIST_C1_BITS  6         /* bits of precision in G histogram */
+#define HIST_C2_BITS  5         /* bits of precision in B/R histogram */
 
 /* Number of elements along histogram axes. */
 #define HIST_C0_ELEMS  (1<<HIST_C0_BITS)
@@ -127,13 +126,13 @@
 #define C2_SHIFT  (BITS_IN_JSAMPLE-HIST_C2_BITS)
 
 
-typedef UINT16 histcell;	/* histogram cell; prefer an unsigned type */
+typedef UINT16 histcell;        /* histogram cell; prefer an unsigned type */
 
-typedef histcell FAR * histptr;	/* for pointers to histogram cells */
+typedef histcell *histptr; /* for pointers to histogram cells */
 
 typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */
-typedef hist1d FAR * hist2d;	/* type for the 2nd-level pointers */
-typedef hist2d * hist3d;	/* type for top-level pointer */
+typedef hist1d *hist2d;         /* type for the 2nd-level pointers */
+typedef hist2d *hist3d;         /* type for top-level pointer */
 
 
 /* Declarations for Floyd-Steinberg dithering.
@@ -141,8 +140,8 @@
  * Errors are accumulated into the array fserrors[], at a resolution of
  * 1/16th of a pixel count.  The error at a given pixel is propagated
  * to its not-yet-processed neighbors using the standard F-S fractions,
- *		...	(here)	7/16
- *		3/16	5/16	1/16
+ *              ...     (here)  7/16
+ *              3/16    5/16    1/16
  * We work left-to-right on even rows, right-to-left on odd rows.
  *
  * We can get away with a single array (holding one row's worth of errors)
@@ -155,20 +154,17 @@
  * The fserrors[] array has (#columns + 2) entries; the extra entry at
  * each end saves us from special-casing the first and last pixels.
  * Each entry is three values long, one value for each color component.
- *
- * Note: on a wide image, we might not have enough room in a PC's near data
- * segment to hold the error array; so it is allocated with alloc_large.
  */
 
 #if BITS_IN_JSAMPLE == 8
-typedef INT16 FSERROR;		/* 16 bits should be enough */
-typedef int LOCFSERROR;		/* use 'int' for calculation temps */
+typedef INT16 FSERROR;          /* 16 bits should be enough */
+typedef int LOCFSERROR;         /* use 'int' for calculation temps */
 #else
-typedef INT32 FSERROR;		/* may need more than 16 bits */
-typedef INT32 LOCFSERROR;	/* be sure calculation temps are big enough */
+typedef JLONG FSERROR;          /* may need more than 16 bits */
+typedef JLONG LOCFSERROR;       /* be sure calculation temps are big enough */
 #endif
 
-typedef FSERROR FAR *FSERRPTR;	/* pointer to error array (in FAR storage!) */
+typedef FSERROR *FSERRPTR;      /* pointer to error array */
 
 
 /* Private subobject */
@@ -177,21 +173,21 @@
   struct jpeg_color_quantizer pub; /* public fields */
 
   /* Space for the eventually created colormap is stashed here */
-  JSAMPARRAY sv_colormap;	/* colormap allocated at init time */
-  int desired;			/* desired # of colors = size of colormap */
+  JSAMPARRAY sv_colormap;       /* colormap allocated at init time */
+  int desired;                  /* desired # of colors = size of colormap */
 
   /* Variables for accumulating image statistics */
-  hist3d histogram;		/* pointer to the histogram */
+  hist3d histogram;             /* pointer to the histogram */
 
-  boolean needs_zeroed;		/* TRUE if next pass must zero histogram */
+  boolean needs_zeroed;         /* TRUE if next pass must zero histogram */
 
   /* Variables for Floyd-Steinberg dithering */
-  FSERRPTR fserrors;		/* accumulated errors */
-  boolean on_odd_row;		/* flag to remember which row we are on */
-  int * error_limiter;		/* table for clamping the applied error */
+  FSERRPTR fserrors;            /* accumulated errors */
+  boolean on_odd_row;           /* flag to remember which row we are on */
+  int *error_limiter;           /* table for clamping the applied error */
 } my_cquantizer;
 
-typedef my_cquantizer * my_cquantize_ptr;
+typedef my_cquantizer *my_cquantize_ptr;
 
 
 /*
@@ -205,7 +201,7 @@
 
 METHODDEF(void)
 prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-		  JSAMPARRAY output_buf, int num_rows)
+                  JSAMPARRAY output_buf, int num_rows)
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
   register JSAMPROW ptr;
@@ -220,11 +216,11 @@
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the histogram */
       histp = & histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
-			 [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
-			 [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+                         [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
+                         [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
       /* increment, check for overflow and undo increment if so. */
       if (++(*histp) <= 0)
-	(*histp)--;
+        (*histp)--;
       ptr += 3;
     }
   }
@@ -244,12 +240,12 @@
   int c1min, c1max;
   int c2min, c2max;
   /* The volume (actually 2-norm) of the box */
-  INT32 volume;
+  JLONG volume;
   /* The number of nonzero histogram cells within this box */
   long colorcount;
 } box;
 
-typedef box * boxptr;
+typedef box *boxptr;
 
 
 LOCAL(boxptr)
@@ -261,7 +257,7 @@
   register int i;
   register long maxc = 0;
   boxptr which = NULL;
-  
+
   for (i = 0, boxp = boxlist; i < numboxes; i++, boxp++) {
     if (boxp->colorcount > maxc && boxp->volume > 0) {
       which = boxp;
@@ -279,9 +275,9 @@
 {
   register boxptr boxp;
   register int i;
-  register INT32 maxv = 0;
+  register JLONG maxv = 0;
   boxptr which = NULL;
-  
+
   for (i = 0, boxp = boxlist; i < numboxes; i++, boxp++) {
     if (boxp->volume > maxv) {
       which = boxp;
@@ -302,77 +298,77 @@
   histptr histp;
   int c0,c1,c2;
   int c0min,c0max,c1min,c1max,c2min,c2max;
-  INT32 dist0,dist1,dist2;
+  JLONG dist0,dist1,dist2;
   long ccount;
-  
+
   c0min = boxp->c0min;  c0max = boxp->c0max;
   c1min = boxp->c1min;  c1max = boxp->c1max;
   c2min = boxp->c2min;  c2max = boxp->c2max;
-  
+
   if (c0max > c0min)
     for (c0 = c0min; c0 <= c0max; c0++)
       for (c1 = c1min; c1 <= c1max; c1++) {
-	histp = & histogram[c0][c1][c2min];
-	for (c2 = c2min; c2 <= c2max; c2++)
-	  if (*histp++ != 0) {
-	    boxp->c0min = c0min = c0;
-	    goto have_c0min;
-	  }
+        histp = & histogram[c0][c1][c2min];
+        for (c2 = c2min; c2 <= c2max; c2++)
+          if (*histp++ != 0) {
+            boxp->c0min = c0min = c0;
+            goto have_c0min;
+          }
       }
  have_c0min:
   if (c0max > c0min)
     for (c0 = c0max; c0 >= c0min; c0--)
       for (c1 = c1min; c1 <= c1max; c1++) {
-	histp = & histogram[c0][c1][c2min];
-	for (c2 = c2min; c2 <= c2max; c2++)
-	  if (*histp++ != 0) {
-	    boxp->c0max = c0max = c0;
-	    goto have_c0max;
-	  }
+        histp = & histogram[c0][c1][c2min];
+        for (c2 = c2min; c2 <= c2max; c2++)
+          if (*histp++ != 0) {
+            boxp->c0max = c0max = c0;
+            goto have_c0max;
+          }
       }
  have_c0max:
   if (c1max > c1min)
     for (c1 = c1min; c1 <= c1max; c1++)
       for (c0 = c0min; c0 <= c0max; c0++) {
-	histp = & histogram[c0][c1][c2min];
-	for (c2 = c2min; c2 <= c2max; c2++)
-	  if (*histp++ != 0) {
-	    boxp->c1min = c1min = c1;
-	    goto have_c1min;
-	  }
+        histp = & histogram[c0][c1][c2min];
+        for (c2 = c2min; c2 <= c2max; c2++)
+          if (*histp++ != 0) {
+            boxp->c1min = c1min = c1;
+            goto have_c1min;
+          }
       }
  have_c1min:
   if (c1max > c1min)
     for (c1 = c1max; c1 >= c1min; c1--)
       for (c0 = c0min; c0 <= c0max; c0++) {
-	histp = & histogram[c0][c1][c2min];
-	for (c2 = c2min; c2 <= c2max; c2++)
-	  if (*histp++ != 0) {
-	    boxp->c1max = c1max = c1;
-	    goto have_c1max;
-	  }
+        histp = & histogram[c0][c1][c2min];
+        for (c2 = c2min; c2 <= c2max; c2++)
+          if (*histp++ != 0) {
+            boxp->c1max = c1max = c1;
+            goto have_c1max;
+          }
       }
  have_c1max:
   if (c2max > c2min)
     for (c2 = c2min; c2 <= c2max; c2++)
       for (c0 = c0min; c0 <= c0max; c0++) {
-	histp = & histogram[c0][c1min][c2];
-	for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
-	  if (*histp != 0) {
-	    boxp->c2min = c2min = c2;
-	    goto have_c2min;
-	  }
+        histp = & histogram[c0][c1min][c2];
+        for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
+          if (*histp != 0) {
+            boxp->c2min = c2min = c2;
+            goto have_c2min;
+          }
       }
  have_c2min:
   if (c2max > c2min)
     for (c2 = c2max; c2 >= c2min; c2--)
       for (c0 = c0min; c0 <= c0max; c0++) {
-	histp = & histogram[c0][c1min][c2];
-	for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
-	  if (*histp != 0) {
-	    boxp->c2max = c2max = c2;
-	    goto have_c2max;
-	  }
+        histp = & histogram[c0][c1min][c2];
+        for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
+          if (*histp != 0) {
+            boxp->c2max = c2max = c2;
+            goto have_c2max;
+          }
       }
  have_c2max:
 
@@ -388,16 +384,16 @@
   dist1 = ((c1max - c1min) << C1_SHIFT) * C1_SCALE;
   dist2 = ((c2max - c2min) << C2_SHIFT) * C2_SCALE;
   boxp->volume = dist0*dist0 + dist1*dist1 + dist2*dist2;
-  
+
   /* Now scan remaining volume of box and compute population */
   ccount = 0;
   for (c0 = c0min; c0 <= c0max; c0++)
     for (c1 = c1min; c1 <= c1max; c1++) {
       histp = & histogram[c0][c1][c2min];
       for (c2 = c2min; c2 <= c2max; c2++, histp++)
-	if (*histp != 0) {
-	  ccount++;
-	}
+        if (*histp != 0) {
+          ccount++;
+        }
     }
   boxp->colorcount = ccount;
 }
@@ -405,7 +401,7 @@
 
 LOCAL(int)
 median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
-	    int desired_colors)
+            int desired_colors)
 /* Repeatedly select and split the largest box until we have enough boxes */
 {
   int n,lb;
@@ -421,9 +417,9 @@
     } else {
       b1 = find_biggest_volume(boxlist, numboxes);
     }
-    if (b1 == NULL)		/* no splittable boxes left! */
+    if (b1 == NULL)             /* no splittable boxes left! */
       break;
-    b2 = &boxlist[numboxes];	/* where new box will go */
+    b2 = &boxlist[numboxes];    /* where new box will go */
     /* Copy the color bounds to the new box. */
     b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max;
     b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min;
@@ -495,24 +491,24 @@
   long c0total = 0;
   long c1total = 0;
   long c2total = 0;
-  
+
   c0min = boxp->c0min;  c0max = boxp->c0max;
   c1min = boxp->c1min;  c1max = boxp->c1max;
   c2min = boxp->c2min;  c2max = boxp->c2max;
-  
+
   for (c0 = c0min; c0 <= c0max; c0++)
     for (c1 = c1min; c1 <= c1max; c1++) {
       histp = & histogram[c0][c1][c2min];
       for (c2 = c2min; c2 <= c2max; c2++) {
-	if ((count = *histp++) != 0) {
-	  total += count;
-	  c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count;
-	  c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count;
-	  c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count;
-	}
+        if ((count = *histp++) != 0) {
+          total += count;
+          c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count;
+          c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count;
+          c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count;
+        }
       }
     }
-  
+
   cinfo->colormap[0][icolor] = (JSAMPLE) ((c0total + (total>>1)) / total);
   cinfo->colormap[1][icolor] = (JSAMPLE) ((c1total + (total>>1)) / total);
   cinfo->colormap[2][icolor] = (JSAMPLE) ((c2total + (total>>1)) / total);
@@ -529,7 +525,7 @@
 
   /* Allocate workspace for box list */
   boxlist = (boxptr) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, desired_colors * SIZEOF(box));
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
   /* Initialize one box containing whole space */
   numboxes = 1;
   boxlist[0].c0min = 0;
@@ -576,7 +572,7 @@
  * distance from every colormap entry to every histogram cell.  Unfortunately,
  * it needs a work array to hold the best-distance-so-far for each histogram
  * cell (because the inner loop has to be over cells, not colormap entries).
- * The work array elements have to be INT32s, so the work array would need
+ * The work array elements have to be JLONGs, so the work array would need
  * 256Kb at our recommended precision.  This is not feasible in DOS machines.
  *
  * To get around these problems, we apply Thomas' method to compute the
@@ -628,7 +624,7 @@
 
 LOCAL(int)
 find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-		    JSAMPLE colorlist[])
+                    JSAMPLE colorlist[])
 /* Locate the colormap entries close enough to an update box to be candidates
  * for the nearest entry to some cell(s) in the update box.  The update box
  * is specified by the center coordinates of its first cell.  The number of
@@ -642,8 +638,8 @@
   int maxc0, maxc1, maxc2;
   int centerc0, centerc1, centerc2;
   int i, x, ncolors;
-  INT32 minmaxdist, min_dist, max_dist, tdist;
-  INT32 mindist[MAXNUMCOLORS];	/* min distance to colormap entry i */
+  JLONG minmaxdist, min_dist, max_dist, tdist;
+  JLONG mindist[MAXNUMCOLORS];  /* min distance to colormap entry i */
 
   /* Compute true coordinates of update box's upper corner and center.
    * Actually we compute the coordinates of the center of the upper-corner
@@ -685,11 +681,11 @@
       /* within cell range so no contribution to min_dist */
       min_dist = 0;
       if (x <= centerc0) {
-	tdist = (x - maxc0) * C0_SCALE;
-	max_dist = tdist*tdist;
+        tdist = (x - maxc0) * C0_SCALE;
+        max_dist = tdist*tdist;
       } else {
-	tdist = (x - minc0) * C0_SCALE;
-	max_dist = tdist*tdist;
+        tdist = (x - minc0) * C0_SCALE;
+        max_dist = tdist*tdist;
       }
     }
 
@@ -707,11 +703,11 @@
     } else {
       /* within cell range so no contribution to min_dist */
       if (x <= centerc1) {
-	tdist = (x - maxc1) * C1_SCALE;
-	max_dist += tdist*tdist;
+        tdist = (x - maxc1) * C1_SCALE;
+        max_dist += tdist*tdist;
       } else {
-	tdist = (x - minc1) * C1_SCALE;
-	max_dist += tdist*tdist;
+        tdist = (x - minc1) * C1_SCALE;
+        max_dist += tdist*tdist;
       }
     }
 
@@ -729,15 +725,15 @@
     } else {
       /* within cell range so no contribution to min_dist */
       if (x <= centerc2) {
-	tdist = (x - maxc2) * C2_SCALE;
-	max_dist += tdist*tdist;
+        tdist = (x - maxc2) * C2_SCALE;
+        max_dist += tdist*tdist;
       } else {
-	tdist = (x - minc2) * C2_SCALE;
-	max_dist += tdist*tdist;
+        tdist = (x - minc2) * C2_SCALE;
+        max_dist += tdist*tdist;
       }
     }
 
-    mindist[i] = min_dist;	/* save away the results */
+    mindist[i] = min_dist;      /* save away the results */
     if (max_dist < minmaxdist)
       minmaxdist = max_dist;
   }
@@ -757,7 +753,7 @@
 
 LOCAL(void)
 find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-		  int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
+                  int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
 /* Find the closest colormap entry for each cell in the update box,
  * given the list of candidate colors prepared by find_nearby_colors.
  * Return the indexes of the closest entries in the bestcolor[] array.
@@ -767,31 +763,31 @@
 {
   int ic0, ic1, ic2;
   int i, icolor;
-  register INT32 * bptr;	/* pointer into bestdist[] array */
-  JSAMPLE * cptr;		/* pointer into bestcolor[] array */
-  INT32 dist0, dist1;		/* initial distance values */
-  register INT32 dist2;		/* current distance in inner loop */
-  INT32 xx0, xx1;		/* distance increments */
-  register INT32 xx2;
-  INT32 inc0, inc1, inc2;	/* initial values for increments */
+  register JLONG *bptr;         /* pointer into bestdist[] array */
+  JSAMPLE *cptr;                /* pointer into bestcolor[] array */
+  JLONG dist0, dist1;           /* initial distance values */
+  register JLONG dist2;         /* current distance in inner loop */
+  JLONG xx0, xx1;               /* distance increments */
+  register JLONG xx2;
+  JLONG inc0, inc1, inc2;       /* initial values for increments */
   /* This array holds the distance to the nearest-so-far color for each cell */
-  INT32 bestdist[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
+  JLONG bestdist[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
 
   /* Initialize best-distance for each cell of the update box */
   bptr = bestdist;
   for (i = BOX_C0_ELEMS*BOX_C1_ELEMS*BOX_C2_ELEMS-1; i >= 0; i--)
     *bptr++ = 0x7FFFFFFFL;
-  
+
   /* For each color selected by find_nearby_colors,
    * compute its distance to the center of each cell in the box.
    * If that's less than best-so-far, update best distance and color number.
    */
-  
+
   /* Nominal steps between cell centers ("x" in Thomas article) */
 #define STEP_C0  ((1 << C0_SHIFT) * C0_SCALE)
 #define STEP_C1  ((1 << C1_SHIFT) * C1_SCALE)
 #define STEP_C2  ((1 << C2_SHIFT) * C2_SCALE)
-  
+
   for (i = 0; i < numcolors; i++) {
     icolor = GETJSAMPLE(colorlist[i]);
     /* Compute (square of) distance from minc0/c1/c2 to this color */
@@ -813,20 +809,20 @@
       dist1 = dist0;
       xx1 = inc1;
       for (ic1 = BOX_C1_ELEMS-1; ic1 >= 0; ic1--) {
-	dist2 = dist1;
-	xx2 = inc2;
-	for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) {
-	  if (dist2 < *bptr) {
-	    *bptr = dist2;
-	    *cptr = (JSAMPLE) icolor;
-	  }
-	  dist2 += xx2;
-	  xx2 += 2 * STEP_C2 * STEP_C2;
-	  bptr++;
-	  cptr++;
-	}
-	dist1 += xx1;
-	xx1 += 2 * STEP_C1 * STEP_C1;
+        dist2 = dist1;
+        xx2 = inc2;
+        for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) {
+          if (dist2 < *bptr) {
+            *bptr = dist2;
+            *cptr = (JSAMPLE) icolor;
+          }
+          dist2 += xx2;
+          xx2 += 2 * STEP_C2 * STEP_C2;
+          bptr++;
+          cptr++;
+        }
+        dist1 += xx1;
+        xx1 += 2 * STEP_C1 * STEP_C1;
       }
       dist0 += xx0;
       xx0 += 2 * STEP_C0 * STEP_C0;
@@ -843,13 +839,13 @@
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
-  int minc0, minc1, minc2;	/* lower left corner of update box */
+  int minc0, minc1, minc2;      /* lower left corner of update box */
   int ic0, ic1, ic2;
-  register JSAMPLE * cptr;	/* pointer into bestcolor[] array */
-  register histptr cachep;	/* pointer into main cache array */
+  register JSAMPLE *cptr;       /* pointer into bestcolor[] array */
+  register histptr cachep;      /* pointer into main cache array */
   /* This array lists the candidate colormap indexes. */
   JSAMPLE colorlist[MAXNUMCOLORS];
-  int numcolors;		/* number of candidate colors */
+  int numcolors;                /* number of candidate colors */
   /* This array holds the actually closest colormap index for each cell. */
   JSAMPLE bestcolor[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
 
@@ -865,7 +861,7 @@
   minc0 = (c0 << BOX_C0_SHIFT) + ((1 << C0_SHIFT) >> 1);
   minc1 = (c1 << BOX_C1_SHIFT) + ((1 << C1_SHIFT) >> 1);
   minc2 = (c2 << BOX_C2_SHIFT) + ((1 << C2_SHIFT) >> 1);
-  
+
   /* Determine which colormap entries are close enough to be candidates
    * for the nearest entry to some cell in the update box.
    */
@@ -873,10 +869,10 @@
 
   /* Determine the actually nearest colors. */
   find_best_colors(cinfo, minc0, minc1, minc2, numcolors, colorlist,
-		   bestcolor);
+                   bestcolor);
 
   /* Save the best color numbers (plus 1) in the main cache array */
-  c0 <<= BOX_C0_LOG;		/* convert ID back to base cell indexes */
+  c0 <<= BOX_C0_LOG;            /* convert ID back to base cell indexes */
   c1 <<= BOX_C1_LOG;
   c2 <<= BOX_C2_LOG;
   cptr = bestcolor;
@@ -884,7 +880,7 @@
     for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
       cachep = & histogram[c0+ic0][c1+ic1][c2];
       for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
-	*cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1);
+        *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1);
       }
     }
   }
@@ -897,7 +893,7 @@
 
 METHODDEF(void)
 pass2_no_dither (j_decompress_ptr cinfo,
-		 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+                 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
 /* This version performs no dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
@@ -921,7 +917,7 @@
       /* If we have not seen this color before, find nearest colormap entry */
       /* and update the cache */
       if (*cachep == 0)
-	fill_inverse_cmap(cinfo, c0,c1,c2);
+        fill_inverse_cmap(cinfo, c0,c1,c2);
       /* Now emit the colormap index for this cell */
       *outptr++ = (JSAMPLE) (*cachep - 1);
     }
@@ -931,20 +927,20 @@
 
 METHODDEF(void)
 pass2_fs_dither (j_decompress_ptr cinfo,
-		 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+                 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
 /* This version performs Floyd-Steinberg dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
-  register LOCFSERROR cur0, cur1, cur2;	/* current error or pixel value */
+  register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */
   LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */
   LOCFSERROR bpreverr0, bpreverr1, bpreverr2; /* error for below/prev col */
-  register FSERRPTR errorptr;	/* => fserrors[] at column before current */
-  JSAMPROW inptr;		/* => current input pixel */
-  JSAMPROW outptr;		/* => current output pixel */
+  register FSERRPTR errorptr;   /* => fserrors[] at column before current */
+  JSAMPROW inptr;               /* => current input pixel */
+  JSAMPROW outptr;              /* => current output pixel */
   histptr cachep;
-  int dir;			/* +1 or -1 depending on direction */
-  int dir3;			/* 3*dir, for advancing inptr & errorptr */
+  int dir;                      /* +1 or -1 depending on direction */
+  int dir3;                     /* 3*dir, for advancing inptr & errorptr */
   int row;
   JDIMENSION col;
   JDIMENSION width = cinfo->output_width;
@@ -960,7 +956,7 @@
     outptr = output_buf[row];
     if (cquantize->on_odd_row) {
       /* work right to left in this row */
-      inptr += (width-1) * 3;	/* so point to rightmost pixel */
+      inptr += (width-1) * 3;   /* so point to rightmost pixel */
       outptr += width-1;
       dir = -1;
       dir3 = -3;
@@ -1012,53 +1008,44 @@
       /* If we have not seen this color before, find nearest colormap */
       /* entry and update the cache */
       if (*cachep == 0)
-	fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT);
+        fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT);
       /* Now emit the colormap index for this cell */
       { register int pixcode = *cachep - 1;
-	*outptr = (JSAMPLE) pixcode;
-	/* Compute representation error for this pixel */
-	cur0 -= GETJSAMPLE(colormap0[pixcode]);
-	cur1 -= GETJSAMPLE(colormap1[pixcode]);
-	cur2 -= GETJSAMPLE(colormap2[pixcode]);
+        *outptr = (JSAMPLE) pixcode;
+        /* Compute representation error for this pixel */
+        cur0 -= GETJSAMPLE(colormap0[pixcode]);
+        cur1 -= GETJSAMPLE(colormap1[pixcode]);
+        cur2 -= GETJSAMPLE(colormap2[pixcode]);
       }
       /* Compute error fractions to be propagated to adjacent pixels.
        * Add these into the running sums, and simultaneously shift the
        * next-line error sums left by 1 column.
        */
-      { register LOCFSERROR bnexterr, delta;
+      { register LOCFSERROR bnexterr;
 
-	bnexterr = cur0;	/* Process component 0 */
-	delta = cur0 * 2;
-	cur0 += delta;		/* form error * 3 */
-	errorptr[0] = (FSERROR) (bpreverr0 + cur0);
-	cur0 += delta;		/* form error * 5 */
-	bpreverr0 = belowerr0 + cur0;
-	belowerr0 = bnexterr;
-	cur0 += delta;		/* form error * 7 */
-	bnexterr = cur1;	/* Process component 1 */
-	delta = cur1 * 2;
-	cur1 += delta;		/* form error * 3 */
-	errorptr[1] = (FSERROR) (bpreverr1 + cur1);
-	cur1 += delta;		/* form error * 5 */
-	bpreverr1 = belowerr1 + cur1;
-	belowerr1 = bnexterr;
-	cur1 += delta;		/* form error * 7 */
-	bnexterr = cur2;	/* Process component 2 */
-	delta = cur2 * 2;
-	cur2 += delta;		/* form error * 3 */
-	errorptr[2] = (FSERROR) (bpreverr2 + cur2);
-	cur2 += delta;		/* form error * 5 */
-	bpreverr2 = belowerr2 + cur2;
-	belowerr2 = bnexterr;
-	cur2 += delta;		/* form error * 7 */
+        bnexterr = cur0;        /* Process component 0 */
+        errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3);
+        bpreverr0 = belowerr0 + cur0 * 5;
+        belowerr0 = bnexterr;
+        cur0 *= 7;
+        bnexterr = cur1;        /* Process component 1 */
+        errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3);
+        bpreverr1 = belowerr1 + cur1 * 5;
+        belowerr1 = bnexterr;
+        cur1 *= 7;
+        bnexterr = cur2;        /* Process component 2 */
+        errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3);
+        bpreverr2 = belowerr2 + cur2 * 5;
+        belowerr2 = bnexterr;
+        cur2 *= 7;
       }
       /* At this point curN contains the 7/16 error value to be propagated
        * to the next pixel on the current line, and all the errors for the
        * next line have been shifted over.  We are therefore ready to move on.
        */
-      inptr += dir3;		/* Advance pixel pointers to next column */
+      inptr += dir3;            /* Advance pixel pointers to next column */
       outptr += dir;
-      errorptr += dir3;		/* advance errorptr to current column */
+      errorptr += dir3;         /* advance errorptr to current column */
     }
     /* Post-loop cleanup: we must unload the final error values into the
      * final fserrors[] entry.  Note we need not unload belowerrN because
@@ -1093,12 +1080,12 @@
 /* Allocate and fill in the error_limiter table */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
-  int * table;
+  int *table;
   int in, out;
 
   table = (int *) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * SIZEOF(int));
-  table += MAXJSAMPLE;		/* so can index -MAXJSAMPLE .. +MAXJSAMPLE */
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * sizeof(int));
+  table += MAXJSAMPLE;          /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */
   cquantize->error_limiter = table;
 
 #define STEPSIZE ((MAXJSAMPLE+1)/16)
@@ -1181,16 +1168,16 @@
 
     if (cinfo->dither_mode == JDITHER_FS) {
       size_t arraysize = (size_t) ((cinfo->output_width + 2) *
-				   (3 * SIZEOF(FSERROR)));
+                                   (3 * sizeof(FSERROR)));
       /* Allocate Floyd-Steinberg workspace if we didn't already. */
       if (cquantize->fserrors == NULL)
-	cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
-	  ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+        cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
       /* Initialize the propagated errors to zero. */
-      jzero_far((void FAR *) cquantize->fserrors, arraysize);
+      jzero_far((void *) cquantize->fserrors, arraysize);
       /* Make the error-limit table if we didn't already. */
       if (cquantize->error_limiter == NULL)
-	init_error_limit(cinfo);
+        init_error_limit(cinfo);
       cquantize->on_odd_row = FALSE;
     }
 
@@ -1198,8 +1185,8 @@
   /* Zero the histogram or inverse color map, if necessary */
   if (cquantize->needs_zeroed) {
     for (i = 0; i < HIST_C0_ELEMS; i++) {
-      jzero_far((void FAR *) histogram[i],
-		HIST_C1_ELEMS*HIST_C2_ELEMS * SIZEOF(histcell));
+      jzero_far((void *) histogram[i],
+                HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
     }
     cquantize->needs_zeroed = FALSE;
   }
@@ -1232,11 +1219,11 @@
 
   cquantize = (my_cquantize_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_cquantizer));
+                                sizeof(my_cquantizer));
   cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
   cquantize->pub.start_pass = start_pass_2_quant;
   cquantize->pub.new_color_map = new_color_map_2_quant;
-  cquantize->fserrors = NULL;	/* flag optional arrays not allocated */
+  cquantize->fserrors = NULL;   /* flag optional arrays not allocated */
   cquantize->error_limiter = NULL;
 
   /* Make sure jdmaster didn't give me a case I can't handle */
@@ -1245,17 +1232,17 @@
 
   /* Allocate the histogram/inverse colormap storage */
   cquantize->histogram = (hist3d) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * SIZEOF(hist2d));
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
   for (i = 0; i < HIST_C0_ELEMS; i++) {
     cquantize->histogram[i] = (hist2d) (*cinfo->mem->alloc_large)
       ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       HIST_C1_ELEMS*HIST_C2_ELEMS * SIZEOF(histcell));
+       HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
   }
   cquantize->needs_zeroed = TRUE; /* histogram is garbage now */
 
   /* Allocate storage for the completed colormap, if required.
-   * We do this now since it is FAR storage and may affect
-   * the memory manager's space calculations.
+   * We do this now since it may affect the memory manager's space
+   * calculations.
    */
   if (cinfo->enable_2pass_quant) {
     /* Make sure color count is acceptable */
@@ -1278,14 +1265,15 @@
     cinfo->dither_mode = JDITHER_FS;
 
   /* Allocate Floyd-Steinberg workspace if necessary.
-   * This isn't really needed until pass 2, but again it is FAR storage.
-   * Although we will cope with a later change in dither_mode,
-   * we do not promise to honor max_memory_to_use if dither_mode changes.
+   * This isn't really needed until pass 2, but again it may affect the memory
+   * manager's space calculations.  Although we will cope with a later change
+   * in dither_mode, we do not promise to honor max_memory_to_use if
+   * dither_mode changes.
    */
   if (cinfo->dither_mode == JDITHER_FS) {
     cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
       ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (size_t) ((cinfo->output_width + 2) * (3 * SIZEOF(FSERROR))));
+       (size_t) ((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
     /* Might as well create the error-limiting table too. */
     init_error_limit(cinfo);
   }
diff --git a/jsimd.h b/jsimd.h
index 282bff2..f2e2484 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -3,100 +3,91 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2011, 2014 D. R. Commander
- * 
+ * Copyright 2015 Matthieu Darbois
+ *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
  *
  */
 
-/* Short forms of external names for systems with brain-damaged linkers. */
+#include "jchuff.h"             /* Declarations shared with jcphuff.c */
 
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jsimd_can_rgb_ycc                 jSCanRgbYcc
-#define jsimd_can_rgb_gray                jSCanRgbGry
-#define jsimd_can_ycc_rgb                 jSCanYccRgb
-#define jsimd_rgb_ycc_convert             jSRgbYccConv
-#define jsimd_rgb_gray_convert            jSRgbGryConv
-#define jsimd_ycc_rgb_convert             jSYccRgbConv
-#define jsimd_can_h2v2_downsample         jSCanH2V2Down
-#define jsimd_can_h2v1_downsample         jSCanH2V1Down
-#define jsimd_h2v2_downsample             jSH2V2Down
-#define jsimd_h2v1_downsample             jSH2V1Down
-#define jsimd_can_h2v2_upsample           jSCanH2V2Up
-#define jsimd_can_h2v1_upsample           jSCanH2V1Up
-#define jsimd_h2v2_upsample               jSH2V2Up
-#define jsimd_h2v1_upsample               jSH2V1Up
-#define jsimd_can_h2v2_fancy_upsample     jSCanH2V2FUp
-#define jsimd_can_h2v1_fancy_upsample     jSCanH2V1FUp
-#define jsimd_h2v2_fancy_upsample         jSH2V2FUp
-#define jsimd_h2v1_fancy_upsample         jSH2V1FUp
-#define jsimd_can_h2v2_merged_upsample    jSCanH2V2MUp
-#define jsimd_can_h2v1_merged_upsample    jSCanH2V1MUp
-#define jsimd_h2v2_merged_upsample        jSH2V2MUp
-#define jsimd_h2v1_merged_upsample        jSH2V1MUp
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-EXTERN(int) jsimd_can_rgb_ycc JPP((void));
-EXTERN(int) jsimd_can_rgb_gray JPP((void));
-EXTERN(int) jsimd_can_ycc_rgb JPP((void));
-EXTERN(int) jsimd_can_ycc_rgb565 JPP((void));
+EXTERN(int) jsimd_can_rgb_ycc (void);
+EXTERN(int) jsimd_can_rgb_gray (void);
+EXTERN(int) jsimd_can_ycc_rgb (void);
+EXTERN(int) jsimd_can_ycc_rgb565 (void);
+EXTERN(int) jsimd_c_can_null_convert (void);
 
 EXTERN(void) jsimd_rgb_ycc_convert
-        JPP((j_compress_ptr cinfo,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_rgb_gray_convert
-        JPP((j_compress_ptr cinfo,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_ycc_rgb_convert
-        JPP((j_decompress_ptr cinfo,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_rgb565_convert
-        JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_c_null_convert
+        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 
-EXTERN(int) jsimd_can_h2v2_downsample JPP((void));
-EXTERN(int) jsimd_can_h2v1_downsample JPP((void));
+EXTERN(int) jsimd_can_h2v2_downsample (void);
+EXTERN(int) jsimd_can_h2v1_downsample (void);
 
 EXTERN(void) jsimd_h2v2_downsample
-        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY output_data));
-EXTERN(void) jsimd_h2v1_downsample
-        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY output_data));
+        (j_compress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
 
-EXTERN(int) jsimd_can_h2v2_upsample JPP((void));
-EXTERN(int) jsimd_can_h2v1_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v2_smooth_downsample (void);
+
+EXTERN(void) jsimd_h2v2_smooth_downsample
+        (j_compress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample
+        (j_compress_ptr cinfo, jpeg_component_info *compptr,
+        JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(int) jsimd_can_h2v2_upsample (void);
+EXTERN(int) jsimd_can_h2v1_upsample (void);
+EXTERN(int) jsimd_can_int_upsample (void);
 
 EXTERN(void) jsimd_h2v2_upsample
-        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v1_upsample
-        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_int_upsample
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
-EXTERN(int) jsimd_can_h2v2_fancy_upsample JPP((void));
-EXTERN(int) jsimd_can_h2v1_fancy_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v2_fancy_upsample (void);
+EXTERN(int) jsimd_can_h2v1_fancy_upsample (void);
 
 EXTERN(void) jsimd_h2v2_fancy_upsample
-        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v1_fancy_upsample
-        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
-EXTERN(int) jsimd_can_h2v2_merged_upsample JPP((void));
-EXTERN(int) jsimd_can_h2v1_merged_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v2_merged_upsample (void);
+EXTERN(int) jsimd_can_h2v1_merged_upsample (void);
 
 EXTERN(void) jsimd_h2v2_merged_upsample
-        JPP((j_decompress_ptr cinfo,
-             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-             JSAMPARRAY output_buf));
+        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_merged_upsample
-        JPP((j_decompress_ptr cinfo,
-             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-             JSAMPARRAY output_buf));
+        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 
+EXTERN(int) jsimd_can_huff_encode_one_block (void);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/jsimd_none.c b/jsimd_none.c
index 5418e80..90dc965 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -3,7 +3,8 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright 2009-2011, 2014 D. R. Commander
- * 
+ * Copyright 2015 Matthieu Darbois
+ *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
@@ -42,6 +43,12 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_c_can_null_convert (void)
+{
+  return 0;     /* constant 0; presumably means "unsupported" (confirm) */
+}
+
 GLOBAL(void)
 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -70,6 +77,13 @@
 {
 }
 
+GLOBAL(void)
+jsimd_c_null_convert (j_compress_ptr cinfo,
+                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                      JDIMENSION output_row, int num_rows)
+{  /* empty body: does nothing with its arguments */
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_downsample (void)
 {
@@ -82,14 +96,27 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample (void)
+{
+  return 0;     /* constant 0; presumably means "unsupported" (confirm) */
+}
+
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
@@ -106,19 +133,31 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_int_upsample (void)
+{
+  return 0;     /* constant 0; presumably means "unsupported" (confirm) */
+}
+
+GLOBAL(void)
+jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* empty body: does nothing with its arguments */
+}
+
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -136,17 +175,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -192,13 +231,13 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -221,17 +260,17 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
 }
 
@@ -248,14 +287,14 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -271,20 +310,46 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_idct_6x6 (void)
+{
+  return 0;     /* constant 0; presumably means "unsupported" (confirm) */
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12 (void)
+{
+  return 0;     /* constant 0; presumably means "unsupported" (confirm) */
+}
+
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
 }
 
+GLOBAL(void)
+jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{  /* empty body: does nothing with its arguments */
+}
+
+GLOBAL(void)
+jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{  /* empty body: does nothing with its arguments */
+}
+
 GLOBAL(int)
 jsimd_can_idct_islow (void)
 {
@@ -304,23 +369,36 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
 }
 
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  return 0;     /* constant 0; presumably means "unsupported" (confirm) */
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return NULL;  /* nothing is encoded; buffer is left unwritten */
+}
diff --git a/jsimddct.h b/jsimddct.h
index a1c7440..b19ab48 100644
--- a/jsimddct.h
+++ b/jsimddct.h
@@ -2,101 +2,73 @@
  * jsimddct.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * 
+ *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
  *
  */
 
-/* Short forms of external names for systems with brain-damaged linkers. */
+EXTERN(int) jsimd_can_convsamp (void);
+EXTERN(int) jsimd_can_convsamp_float (void);
 
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jsimd_can_convsamp                jSCanConv
-#define jsimd_can_convsamp_float          jSCanConvF
-#define jsimd_convsamp                    jSConv
-#define jsimd_convsamp_float              jSConvF
-#define jsimd_can_fdct_islow              jSCanFDCTIS
-#define jsimd_can_fdct_ifast              jSCanFDCTIF
-#define jsimd_can_fdct_float              jSCanFDCTFl
-#define jsimd_fdct_islow                  jSFDCTIS
-#define jsimd_fdct_ifast                  jSFDCTIF
-#define jsimd_fdct_float                  jSFDCTFl
-#define jsimd_can_quantize                jSCanQuant
-#define jsimd_can_quantize_float          jSCanQuantF
-#define jsimd_quantize                    jSQuant
-#define jsimd_quantize_float              jSQuantF
-#define jsimd_can_idct_2x2                jSCanIDCT22
-#define jsimd_can_idct_4x4                jSCanIDCT44
-#define jsimd_idct_2x2                    jSIDCT22
-#define jsimd_idct_4x4                    jSIDCT44
-#define jsimd_can_idct_islow              jSCanIDCTIS
-#define jsimd_can_idct_ifast              jSCanIDCTIF
-#define jsimd_can_idct_float              jSCanIDCTFl
-#define jsimd_idct_islow                  jSIDCTIS
-#define jsimd_idct_ifast                  jSIDCTIF
-#define jsimd_idct_float                  jSIDCTFl
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
+EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                             DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data,
+                                   JDIMENSION start_col,
+                                   FAST_FLOAT *workspace);
 
-EXTERN(int) jsimd_can_convsamp JPP((void));
-EXTERN(int) jsimd_can_convsamp_float JPP((void));
+EXTERN(int) jsimd_can_fdct_islow (void);
+EXTERN(int) jsimd_can_fdct_ifast (void);
+EXTERN(int) jsimd_can_fdct_float (void);
 
-EXTERN(void) jsimd_convsamp JPP((JSAMPARRAY sample_data,
-                                 JDIMENSION start_col,
-                                 DCTELEM * workspace));
-EXTERN(void) jsimd_convsamp_float JPP((JSAMPARRAY sample_data,
-                                       JDIMENSION start_col,
-                                       FAST_FLOAT * workspace));
+EXTERN(void) jsimd_fdct_islow (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast (DCTELEM *data);
+EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data);
 
-EXTERN(int) jsimd_can_fdct_islow JPP((void));
-EXTERN(int) jsimd_can_fdct_ifast JPP((void));
-EXTERN(int) jsimd_can_fdct_float JPP((void));
+EXTERN(int) jsimd_can_quantize (void);
+EXTERN(int) jsimd_can_quantize_float (void);
 
-EXTERN(void) jsimd_fdct_islow JPP((DCTELEM * data));
-EXTERN(void) jsimd_fdct_ifast JPP((DCTELEM * data));
-EXTERN(void) jsimd_fdct_float JPP((FAST_FLOAT * data));
+EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                             DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                                   FAST_FLOAT *workspace);
 
-EXTERN(int) jsimd_can_quantize JPP((void));
-EXTERN(int) jsimd_can_quantize_float JPP((void));
+EXTERN(int) jsimd_can_idct_2x2 (void);
+EXTERN(int) jsimd_can_idct_4x4 (void);
+EXTERN(int) jsimd_can_idct_6x6 (void);
+EXTERN(int) jsimd_can_idct_12x12 (void);
 
-EXTERN(void) jsimd_quantize JPP((JCOEFPTR coef_block,
-                                 DCTELEM * divisors,
-                                 DCTELEM * workspace));
-EXTERN(void) jsimd_quantize_float JPP((JCOEFPTR coef_block,
-                                       FAST_FLOAT * divisors,
-                                       FAST_FLOAT * workspace));
+EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                             JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                             JDIMENSION output_col);
+EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                             JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                               JDIMENSION output_col);
 
-EXTERN(int) jsimd_can_idct_2x2 JPP((void));
-EXTERN(int) jsimd_can_idct_4x4 JPP((void));
+EXTERN(int) jsimd_can_idct_islow (void);
+EXTERN(int) jsimd_can_idct_ifast (void);
+EXTERN(int) jsimd_can_idct_float (void);
 
-EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo,
-                                 jpeg_component_info * compptr,
-                                 JCOEFPTR coef_block,
-                                 JSAMPARRAY output_buf,
-                                 JDIMENSION output_col));
-EXTERN(void) jsimd_idct_4x4 JPP((j_decompress_ptr cinfo,
-                                 jpeg_component_info * compptr,
-                                 JCOEFPTR coef_block,
-                                 JSAMPARRAY output_buf,
-                                 JDIMENSION output_col));
-
-EXTERN(int) jsimd_can_idct_islow JPP((void));
-EXTERN(int) jsimd_can_idct_ifast JPP((void));
-EXTERN(int) jsimd_can_idct_float JPP((void));
-
-EXTERN(void) jsimd_idct_islow JPP((j_decompress_ptr cinfo,
-                                   jpeg_component_info * compptr,
-                                   JCOEFPTR coef_block,
-                                   JSAMPARRAY output_buf,
-                                   JDIMENSION output_col));
-EXTERN(void) jsimd_idct_ifast JPP((j_decompress_ptr cinfo,
-                                   jpeg_component_info * compptr,
-                                   JCOEFPTR coef_block,
-                                   JSAMPARRAY output_buf,
-                                   JDIMENSION output_col));
-EXTERN(void) jsimd_idct_float JPP((j_decompress_ptr cinfo,
-                                   jpeg_component_info * compptr,
-                                   JCOEFPTR coef_block,
-                                   JSAMPARRAY output_buf,
-                                   JDIMENSION output_col));
-
+EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                               JDIMENSION output_col);
+EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                               JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                               JDIMENSION output_col);
diff --git a/jstdhuff.c b/jstdhuff.c
new file mode 100644
index 0000000..e202e8e
--- /dev/null
+++ b/jstdhuff.c
@@ -0,0 +1,135 @@
+/*
+ * jstdhuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2013, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to set the default Huffman tables, if they are
+ * not already set.
+ */
+
+/*
+ * Huffman table setup routines
+ */
+
+LOCAL(void)
+add_huff_table (j_common_ptr cinfo,
+                JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
+/* Define a Huffman table from bits[]/val[], unless *htblptr is already set */
+{
+  int nsymbols, len;
+
+  if (*htblptr == NULL)
+    *htblptr = jpeg_alloc_huff_table(cinfo);
+  else
+    return;                     /* slot already holds a table; keep it */
+
+  /* Copy the number-of-symbols-of-each-code-length counts */
+  MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+
+  /* Validate the counts (sum of bits[1..16] must be 1..256).  We do this
+   * here mainly so we can copy the right number of symbols from the val[]
+   * array, without risking marching off the end of memory.  jchuff.c will
+   * do a more thorough test later.  Unused huffval[] entries are zeroed. */
+  nsymbols = 0;
+  for (len = 1; len <= 16; len++)
+    nsymbols += bits[len];      /* bits[0] is not counted */
+  if (nsymbols < 1 || nsymbols > 256)
+    ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
+
+  MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
+  MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8));
+
+  /* Initialize sent_table FALSE so table will be written to JPEG file. */
+  (*htblptr)->sent_table = FALSE;
+}
+
+
+LOCAL(void)
+std_huff_tables (j_common_ptr cinfo)
+/* Set up the standard Huffman tables (cf. JPEG standard section K.3) in any
+ * unset slot 0/1.  IMPORTANT: these are only valid for 8-bit data precision! */
+{
+  JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs;
+
+  static const UINT8 bits_dc_luminance[17] =
+    { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
+  static const UINT8 val_dc_luminance[] =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+
+  static const UINT8 bits_dc_chrominance[17] =
+    { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
+  static const UINT8 val_dc_chrominance[] =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+
+  static const UINT8 bits_ac_luminance[17] =
+    { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
+  static const UINT8 val_ac_luminance[] =
+    { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+      0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+      0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+      0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+      0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+      0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+      0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+      0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+      0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+      0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+      0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+      0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+      0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+      0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+      0xf9, 0xfa };
+
+  static const UINT8 bits_ac_chrominance[17] =
+    { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
+  static const UINT8 val_ac_chrominance[] =
+    { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+      0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+      0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+      0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+      0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+      0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+      0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+      0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+      0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+      0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+      0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+      0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+      0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+      0xf9, 0xfa };
+
+  if (cinfo->is_decompressor) {  /* cinfo may be either object type */
+    dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs;
+    ac_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->ac_huff_tbl_ptrs;
+  } else {
+    dc_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->dc_huff_tbl_ptrs;
+    ac_huff_tbl_ptrs = ((j_compress_ptr)cinfo)->ac_huff_tbl_ptrs;
+  }
+
+  add_huff_table(cinfo, &dc_huff_tbl_ptrs[0], bits_dc_luminance,
+                 val_dc_luminance);
+  add_huff_table(cinfo, &ac_huff_tbl_ptrs[0], bits_ac_luminance,
+                 val_ac_luminance);
+  add_huff_table(cinfo, &dc_huff_tbl_ptrs[1], bits_dc_chrominance,
+                 val_dc_chrominance);
+  add_huff_table(cinfo, &ac_huff_tbl_ptrs[1], bits_ac_chrominance,
+                 val_ac_chrominance);
+}
diff --git a/jutils.c b/jutils.c
index d18a955..f9d3502 100644
--- a/jutils.c
+++ b/jutils.c
@@ -1,9 +1,12 @@
 /*
  * jutils.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code
+ * relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains tables and miscellaneous utility routines needed
  * for both compression and decompression.
@@ -21,7 +24,7 @@
  * of a DCT block read in natural order (left to right, top to bottom).
  */
 
-#if 0				/* This table is not actually needed in v6a */
+#if 0                           /* This table is not actually needed in v6a */
 
 const int jpeg_zigzag_order[DCTSIZE2] = {
    0,  1,  5,  6, 14, 15, 27, 28,
@@ -87,30 +90,10 @@
 }
 
 
-/* On normal machines we can apply MEMCOPY() and MEMZERO() to sample arrays
- * and coefficient-block arrays.  This won't work on 80x86 because the arrays
- * are FAR and we're assuming a small-pointer memory model.  However, some
- * DOS compilers provide far-pointer versions of memcpy() and memset() even
- * in the small-model libraries.  These will be used if USE_FMEM is defined.
- * Otherwise, the routines below do it the hard way.  (The performance cost
- * is not all that great, because these routines aren't very heavily used.)
- */
-
-#ifndef NEED_FAR_POINTERS	/* normal case, same as regular macros */
-#define FMEMCOPY(dest,src,size)	MEMCOPY(dest,src,size)
-#define FMEMZERO(target,size)	MEMZERO(target,size)
-#else				/* 80x86 case, define if we can */
-#ifdef USE_FMEM
-#define FMEMCOPY(dest,src,size)	_fmemcpy((void FAR *)(dest), (const void FAR *)(src), (size_t)(size))
-#define FMEMZERO(target,size)	_fmemset((void FAR *)(target), 0, (size_t)(size))
-#endif
-#endif
-
-
 GLOBAL(void)
 jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
-		   JSAMPARRAY output_array, int dest_row,
-		   int num_rows, JDIMENSION num_cols)
+                   JSAMPARRAY output_array, int dest_row,
+                   int num_rows, JDIMENSION num_cols)
 /* Copy some rows of samples from one place to another.
  * num_rows rows are copied from input_array[source_row++]
  * to output_array[dest_row++]; these areas may overlap for duplication.
@@ -118,11 +101,7 @@
  */
 {
   register JSAMPROW inptr, outptr;
-#ifdef FMEMCOPY
-  register size_t count = (size_t) (num_cols * SIZEOF(JSAMPLE));
-#else
-  register JDIMENSION count;
-#endif
+  register size_t count = (size_t) (num_cols * sizeof(JSAMPLE));
   register int row;
 
   input_array += source_row;
@@ -131,49 +110,24 @@
   for (row = num_rows; row > 0; row--) {
     inptr = *input_array++;
     outptr = *output_array++;
-#ifdef FMEMCOPY
-    FMEMCOPY(outptr, inptr, count);
-#else
-    for (count = num_cols; count > 0; count--)
-      *outptr++ = *inptr++;	/* needn't bother with GETJSAMPLE() here */
-#endif
+    MEMCOPY(outptr, inptr, count);
   }
 }
 
 
 GLOBAL(void)
 jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
-		 JDIMENSION num_blocks)
+                 JDIMENSION num_blocks)
 /* Copy a row of coefficient blocks from one place to another. */
 {
-#ifdef FMEMCOPY
-  FMEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * SIZEOF(JCOEF)));
-#else
-  register JCOEFPTR inptr, outptr;
-  register long count;
-
-  inptr = (JCOEFPTR) input_row;
-  outptr = (JCOEFPTR) output_row;
-  for (count = (long) num_blocks * DCTSIZE2; count > 0; count--) {
-    *outptr++ = *inptr++;
-  }
-#endif
+  MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
 }
 
 
 GLOBAL(void)
-jzero_far (void FAR * target, size_t bytestozero)
-/* Zero out a chunk of FAR memory. */
+jzero_far (void *target, size_t bytestozero)
+/* Zero out a chunk of memory. */
 /* This might be sample-array data, block-array data, or alloc_large data. */
 {
-#ifdef FMEMZERO
-  FMEMZERO(target, bytestozero);
-#else
-  register char FAR * ptr = (char FAR *) target;
-  register size_t count;
-
-  for (count = bytestozero; count > 0; count--) {
-    *ptr++ = 0;
-  }
-#endif
+  MEMZERO(target, bytestozero);
 }
diff --git a/jversion.h b/jversion.h
index c37651b..6ce663d 100644
--- a/jversion.h
+++ b/jversion.h
@@ -3,9 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
- * Modifications:
- * Copyright (C) 2010, 2012-2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2012-2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains software version identification.
  */
@@ -13,20 +14,36 @@
 
 #if JPEG_LIB_VERSION >= 80
 
-#define JVERSION	"8d  15-Jan-2012"
+#define JVERSION        "8d  15-Jan-2012"
 
 #elif JPEG_LIB_VERSION >= 70
 
-#define JVERSION	"7  27-Jun-2009"
+#define JVERSION        "7  27-Jun-2009"
 
 #else
 
-#define JVERSION	"6b  27-Mar-1998"
+#define JVERSION        "6b  27-Mar-1998"
 
 #endif
 
-#define JCOPYRIGHT	"Copyright (C) 1991-2012 Thomas G. Lane, Guido Vollbeding\n" \
-			"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-			"Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
-			"Copyright (C) 2009-2014 D. R. Commander\n" \
-			"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)"
+/*
+ * NOTE: It is our convention to place the authors in the following order:
+ * - libjpeg-turbo authors (2009-) in descending order of the date of their
+ *   most recent contribution to the project, then in ascending order of the
+ *   date of their first contribution to the project
+ * - Upstream authors in descending order of the date of the first inclusion of
+ *   their code
+ */
+
+#define JCOPYRIGHT      "Copyright (C) 2009-2016 D. R. Commander\n" \
+                        "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+                        "Copyright (C) 2015-2016 Matthieu Darbois\n" \
+                        "Copyright (C) 2015 Google, Inc.\n" \
+                        "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
+                        "Copyright (C) 2013 Linaro Limited\n" \
+                        "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+                        "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+                        "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
+                        "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding"
+
+#define JCOPYRIGHT_SHORT "Copyright (C) 1991-2016 The libjpeg-turbo Project and many others"
diff --git a/libjpeg.gyp b/libjpeg.gyp
index 5754c00..5620479 100644
--- a/libjpeg.gyp
+++ b/libjpeg.gyp
@@ -102,59 +102,61 @@
         [ 'target_arch=="ia32"', {
           'sources': [
             'simd/jsimd_i386.c',
-            'simd/jccolmmx.asm',
-            'simd/jccolss2.asm',
-            'simd/jcgrammx.asm',
-            'simd/jcgrass2.asm',
-            'simd/jcqnt3dn.asm',
-            'simd/jcqntmmx.asm',
-            'simd/jcqnts2f.asm',
-            'simd/jcqnts2i.asm',
-            'simd/jcqntsse.asm',
-            'simd/jcsammmx.asm',
-            'simd/jcsamss2.asm',
-            'simd/jdcolmmx.asm',
-            'simd/jdcolss2.asm',
-            'simd/jdmermmx.asm',
-            'simd/jdmerss2.asm',
-            'simd/jdsammmx.asm',
-            'simd/jdsamss2.asm',
-            'simd/jf3dnflt.asm',
-            'simd/jfmmxfst.asm',
-            'simd/jfmmxint.asm',
-            'simd/jfss2fst.asm',
-            'simd/jfss2int.asm',
-            'simd/jfsseflt.asm',
-            'simd/ji3dnflt.asm',
-            'simd/jimmxfst.asm',
-            'simd/jimmxint.asm',
-            'simd/jimmxred.asm',
-            'simd/jiss2flt.asm',
-            'simd/jiss2fst.asm',
-            'simd/jiss2int.asm',
-            'simd/jiss2red.asm',
-            'simd/jisseflt.asm',
+            'simd/jccolor-mmx.asm', 
+            'simd/jccolor-sse2.asm',
+            'simd/jcgray-mmx.asm',  
+            'simd/jcgray-sse2.asm',
+            'simd/jchuff-sse2.asm', 
+            'simd/jcsample-mmx.asm',
+            'simd/jcsample-sse2.asm',
+            'simd/jdcolor-mmx.asm', 
+            'simd/jdcolor-sse2.asm',
+            'simd/jdmerge-mmx.asm', 
+            'simd/jdmerge-sse2.asm',
+            'simd/jdsample-mmx.asm',
+            'simd/jdsample-sse2.asm',
+            'simd/jfdctflt-3dn.asm',
+            'simd/jfdctflt-sse.asm',
+            'simd/jfdctfst-mmx.asm',
+            'simd/jfdctfst-sse2.asm',
+            'simd/jfdctint-mmx.asm',
+            'simd/jfdctint-sse2.asm',
+            'simd/jidctflt-3dn.asm',
+            'simd/jidctflt-sse2.asm',
+            'simd/jidctflt-sse.asm',
+            'simd/jidctfst-mmx.asm',
+            'simd/jidctfst-sse2.asm',
+            'simd/jidctint-mmx.asm',
+            'simd/jidctint-sse2.asm',
+            'simd/jidctred-mmx.asm',
+            'simd/jidctred-sse2.asm',
+            'simd/jquant-3dn.asm',  
+            'simd/jquantf-sse2.asm',
+            'simd/jquanti-sse2.asm',
+            'simd/jquant-mmx.asm',  
+            'simd/jquant-sse.asm',  
             'simd/jsimdcpu.asm',
           ],
         }],
         [ 'target_arch=="x64" and msan!=1', {
           'sources': [
             'simd/jsimd_x86_64.c',
-            'simd/jccolss2-64.asm',
-            'simd/jcgrass2-64.asm',
-            'simd/jcqnts2f-64.asm',
-            'simd/jcqnts2i-64.asm',
-            'simd/jcsamss2-64.asm',
-            'simd/jdcolss2-64.asm',
-            'simd/jdmerss2-64.asm',
-            'simd/jdsamss2-64.asm',
-            'simd/jfss2fst-64.asm',
-            'simd/jfss2int-64.asm',
-            'simd/jfsseflt-64.asm',
-            'simd/jiss2flt-64.asm',
-            'simd/jiss2fst-64.asm',
-            'simd/jiss2int-64.asm',
-            'simd/jiss2red-64.asm',
+            'simd/jccolor-sse2-64.asm',
+            'simd/jcgray-sse2-64.asm',
+            'simd/jchuff-sse2-64.asm',
+            'simd/jcsample-sse2-64.asm',
+            'simd/jdcolor-sse2-64.asm',
+            'simd/jdmerge-sse2-64.asm',
+            'simd/jdsample-sse2-64.asm',
+            'simd/jfdctflt-sse-64.asm',
+            'simd/jfdctfst-sse2-64.asm',
+            'simd/jfdctint-sse2-64.asm',
+            'simd/jidctflt-sse2-64.asm',
+            'simd/jidctfst-sse2-64.asm',
+            'simd/jidctint-sse2-64.asm',
+            'simd/jidctred-sse2-64.asm',
+            'simd/jquantf-sse2-64.asm',
+            'simd/jquanti-sse2-64.asm',
           ],
         }],
         # MemorySanitizer doesn't support assembly code, so keep it disabled in
@@ -208,7 +210,6 @@
                   '-D__x86__',
                   '-DWIN32',
                   '-DMSVC',
-                  '-Iwin/'
                 ],
               }, {
                 'yasm_format': '-fwin64',
@@ -216,7 +217,6 @@
                   '-D__x86_64__',
                   '-DWIN64',
                   '-DMSVC',
-                  '-Iwin/'
                 ],
               }],
             ],
@@ -234,14 +234,12 @@
                 'yasm_flags': [
                   '-D__x86__',
                   '-DMACHO',
-                  '-Imac/'
                 ],
               }, {
                 'yasm_format': '-fmacho64',
                 'yasm_flags': [
                   '-D__x86_64__',
                   '-DMACHO',
-                  '-Imac/'
                 ],
               }],
             ],
@@ -267,14 +265,12 @@
                 'yasm_flags': [
                   '-D__x86__',
                   '-DELF',
-                  '-Ilinux/'
                 ],
               }, {
                 'yasm_format': '-felf64',
                 'yasm_flags': [
                   '-D__x86_64__',
                   '-DELF',
-                  '-Ilinux/'
                 ],
               }],
             ],
diff --git a/libjpeg.map.in b/libjpeg.map.in
new file mode 100644
index 0000000..b4480d8
--- /dev/null
+++ b/libjpeg.map.in
@@ -0,0 +1,11 @@
+LIBJPEGTURBO_@JPEG_LIB_VERSION_DECIMAL@ {
+  @MEM_SRCDST_FUNCTIONS@
+  local:
+    jsimd_*;
+    jconst_*;
+};
+
+LIBJPEG_@JPEG_LIB_VERSION_DECIMAL@ {
+  global:
+    *;
+};
diff --git a/libjpeg.txt b/libjpeg.txt
new file mode 100644
index 0000000..71d37c6
--- /dev/null
+++ b/libjpeg.txt
@@ -0,0 +1,3105 @@
+USING THE IJG JPEG LIBRARY
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1994-2013, Thomas G. Lane, Guido Vollbeding.
+libjpeg-turbo Modifications:
+Copyright (C) 2010, 2014-2016, D. R. Commander.
+Copyright (C) 2015, Google, Inc.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+This file describes how to use the IJG JPEG library within an application
+program.  Read it if you want to write a program that uses the library.
+
+The file example.c provides heavily commented skeleton code for calling the
+JPEG library.  Also see jpeglib.h (the include file to be used by application
+programs) for full details about data structures and function parameter lists.
+The library source code, of course, is the ultimate reference.
+
+Note that there have been *major* changes from the application interface
+presented by IJG version 4 and earlier versions.  The old design had several
+inherent limitations, and it had accumulated a lot of cruft as we added
+features while trying to minimize application-interface changes.  We have
+sacrificed backward compatibility in the version 5 rewrite, but we think the
+improvements justify this.
+
+
+TABLE OF CONTENTS
+-----------------
+
+Overview:
+        Functions provided by the library
+        Outline of typical usage
+Basic library usage:
+        Data formats
+        Compression details
+        Decompression details
+        Mechanics of usage: include files, linking, etc
+Advanced features:
+        Compression parameter selection
+        Decompression parameter selection
+        Special color spaces
+        Error handling
+        Compressed data handling (source and destination managers)
+        I/O suspension
+        Progressive JPEG support
+        Buffered-image mode
+        Abbreviated datastreams and multiple images
+        Special markers
+        Raw (downsampled) image data
+        Really raw data: DCT coefficients
+        Progress monitoring
+        Memory management
+        Memory usage
+        Library compile-time options
+        Portability considerations
+
+You should read at least the overview and basic usage sections before trying
+to program with the library.  The sections on advanced features can be read
+if and when you need them.
+
+
+OVERVIEW
+========
+
+Functions provided by the library
+---------------------------------
+
+The IJG JPEG library provides C code to read and write JPEG-compressed image
+files.  The surrounding application program receives or supplies image data a
+scanline at a time, using a straightforward uncompressed image format.  All
+details of color conversion and other preprocessing/postprocessing can be
+handled by the library.
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  The application indirectly selects use of this code
+by specifying the format in which it wishes to supply or receive image data.
+For example, if colormapped output is requested, then the decompression
+library automatically invokes color quantization.
+
+A wide range of quality vs. speed tradeoffs is possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+A word about functions *not* provided by the library.  We handle a subset of
+the ISO JPEG standard; most baseline, extended-sequential, and progressive
+JPEG processes are supported.  (Our subset includes all features now in common
+use.)  Unsupported ISO options include:
+        * Hierarchical storage
+        * Lossless JPEG
+        * DNL marker
+        * Nonintegral subsampling ratios
+We support both 8- and 12-bit data precision, but this is a compile-time
+choice rather than a run-time choice; hence it is difficult to use both
+precisions in a single application.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, this library is
+used by the free LIBTIFF library to support JPEG compression in TIFF.)
+
+
+Outline of typical usage
+------------------------
+
+The rough outline of a JPEG compression operation is:
+
+        Allocate and initialize a JPEG compression object
+        Specify the destination for the compressed data (eg, a file)
+        Set parameters for compression, including image size & colorspace
+        jpeg_start_compress(...);
+        while (scan lines remain to be written)
+                jpeg_write_scanlines(...);
+        jpeg_finish_compress(...);
+        Release the JPEG compression object
+
+A JPEG compression object holds parameters and working state for the JPEG
+library.  We make creation/destruction of the object separate from starting
+or finishing compression of an image; the same object can be re-used for a
+series of image compression operations.  This makes it easy to re-use the
+same parameter settings for a sequence of images.  Re-use of a JPEG object
+also has important implications for processing abbreviated JPEG datastreams,
+as discussed later.
+
+The image data to be compressed is supplied to jpeg_write_scanlines() from
+in-memory buffers.  If the application is doing file-to-file compression,
+reading image data from the source file is the application's responsibility.
+The library emits compressed data by calling a "data destination manager",
+which typically will write the data into a file; but the application can
+provide its own destination manager to do something else.
+
+Similarly, the rough outline of a JPEG decompression operation is:
+
+        Allocate and initialize a JPEG decompression object
+        Specify the source of the compressed data (eg, a file)
+        Call jpeg_read_header() to obtain image info
+        Set parameters for decompression
+        jpeg_start_decompress(...);
+        while (scan lines remain to be read)
+                jpeg_read_scanlines(...);
+        jpeg_finish_decompress(...);
+        Release the JPEG decompression object
+
+This is comparable to the compression outline except that reading the
+datastream header is a separate step.  This is helpful because information
+about the image's size, colorspace, etc is available when the application
+selects decompression parameters.  For example, the application can choose an
+output scaling ratio that will fit the image into the available screen size.
+
+The decompression library obtains compressed data by calling a data source
+manager, which typically will read the data from a file; but other behaviors
+can be obtained with a custom source manager.  Decompressed data is delivered
+into in-memory buffers passed to jpeg_read_scanlines().
+
+It is possible to abort an incomplete compression or decompression operation
+by calling jpeg_abort(); or, if you do not need to retain the JPEG object,
+simply release it by calling jpeg_destroy().
+
+JPEG compression and decompression objects are two separate struct types.
+However, they share some common fields, and certain routines such as
+jpeg_destroy() can work on either type of object.
+
+The JPEG library has no static variables: all state is in the compression
+or decompression object.  Therefore it is possible to process multiple
+compression and decompression operations concurrently, using multiple JPEG
+objects.
+
+Both compression and decompression can be done in an incremental memory-to-
+memory fashion, if suitable source/destination managers are used.  See the
+section on "I/O suspension" for more details.
+
+
+BASIC LIBRARY USAGE
+===================
+
+Data formats
+------------
+
+Before diving into procedural details, it is helpful to understand the
+image data format that the JPEG library expects or returns.
+
+The standard input image format is a rectangular array of pixels, with each
+pixel having the same number of "component" or "sample" values (color
+channels).  You must specify how many components there are and the colorspace
+interpretation of the components.  Most applications will use RGB data
+(three components per pixel) or grayscale data (one component per pixel).
+PLEASE NOTE THAT RGB DATA IS THREE SAMPLES PER PIXEL, GRAYSCALE ONLY ONE.
+A remarkable number of people manage to miss this, only to find that their
+programs don't work with grayscale JPEG files.
+
+There is no provision for colormapped input.  JPEG files are always full-color
+or full grayscale (or sometimes another colorspace such as CMYK).  You can
+feed in a colormapped image by expanding it to full-color format.  However,
+JPEG often doesn't work very well with source data that has been colormapped,
+because of dithering noise.  This is discussed in more detail in the JPEG FAQ
+and the other references mentioned in the README.ijg file.
+
+Pixels are stored by scanlines, with each scanline running from left to
+right.  The component values for each pixel are adjacent in the row; for
+example, R,G,B,R,G,B,R,G,B,... for 24-bit RGB color.  Each scanline is an
+array of data type JSAMPLE --- which is typically "unsigned char", unless
+you've changed jmorecfg.h.  (You can also change the RGB pixel layout, say
+to B,G,R order, by modifying jmorecfg.h.  But see the restrictions listed in
+that file before doing so.)
+
+A 2-D array of pixels is formed by making a list of pointers to the starts of
+scanlines; so the scanlines need not be physically adjacent in memory.  Even
+if you process just one scanline at a time, you must make a one-element
+pointer array to conform to this structure.  Pointers to JSAMPLE rows are of
+type JSAMPROW, and the pointer to the pointer array is of type JSAMPARRAY.
+
+The library accepts or supplies one or more complete scanlines per call.
+It is not possible to process part of a row at a time.  Scanlines are always
+processed top-to-bottom.  You can process an entire image in one call if you
+have it all in memory, but usually it's simplest to process one scanline at
+a time.
+
+For best results, source data values should have the precision specified by
+BITS_IN_JSAMPLE (normally 8 bits).  For instance, if you choose to compress
+data that's only 6 bits/channel, you should left-justify each value in a
+byte before passing it to the compressor.  If you need to compress data
+that has more than 8 bits/channel, compile with BITS_IN_JSAMPLE = 12.
+(See "Library compile-time options", later.)
+
+
+The data format returned by the decompressor is the same in all details,
+except that colormapped output is supported.  (Again, a JPEG file is never
+colormapped.  But you can ask the decompressor to perform on-the-fly color
+quantization to deliver colormapped output.)  If you request colormapped
+output then the returned data array contains a single JSAMPLE per pixel;
+its value is an index into a color map.  The color map is represented as
+a 2-D JSAMPARRAY in which each row holds the values of one color component,
+that is, colormap[i][j] is the value of the i'th color component for pixel
+value (map index) j.  Note that since the colormap indexes are stored in
+JSAMPLEs, the maximum number of colors is limited by the size of JSAMPLE
+(ie, at most 256 colors for an 8-bit JPEG library).
+
+
+Compression details
+-------------------
+
+Here we revisit the JPEG compression outline given in the overview.
+
+1. Allocate and initialize a JPEG compression object.
+
+A JPEG compression object is a "struct jpeg_compress_struct".  (It also has
+a bunch of subsidiary structures which are allocated via malloc(), but the
+application doesn't control those directly.)  This struct can be just a local
+variable in the calling routine, if a single routine is going to execute the
+whole JPEG compression sequence.  Otherwise it can be static or allocated
+from malloc().
+
+You will also need a structure representing a JPEG error handler.  The part
+of this that the library cares about is a "struct jpeg_error_mgr".  If you
+are providing your own error handler, you'll typically want to embed the
+jpeg_error_mgr struct in a larger structure; this is discussed later under
+"Error handling".  For now we'll assume you are just using the default error
+handler.  The default error handler will print JPEG error/warning messages
+on stderr, and it will call exit() if a fatal error occurs.
+
+You must initialize the error handler structure, store a pointer to it into
+the JPEG object's "err" field, and then call jpeg_create_compress() to
+initialize the rest of the JPEG object.
+
+Typical code for this step, if you are using the default error handler, is
+
+        struct jpeg_compress_struct cinfo;
+        struct jpeg_error_mgr jerr;
+        ...
+        cinfo.err = jpeg_std_error(&jerr);
+        jpeg_create_compress(&cinfo);
+
+jpeg_create_compress allocates a small amount of memory, so it could fail
+if you are out of memory.  In that case it will exit via the error handler;
+that's why the error handler must be initialized first.
+
+
+2. Specify the destination for the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library delivers compressed data to a
+"data destination" module.  The library includes one data destination
+module which knows how to write to a stdio stream.  You can use your own
+destination module if you want to do something else, as discussed later.
+
+If you use the standard destination module, you must open the target stdio
+stream beforehand.  Typical code for this step looks like:
+
+        FILE *outfile;
+        ...
+        if ((outfile = fopen(filename, "wb")) == NULL) {
+            fprintf(stderr, "can't open %s\n", filename);
+            exit(1);
+        }
+        jpeg_stdio_dest(&cinfo, outfile);
+
+where the last line invokes the standard destination module.
+
+WARNING: it is critical that the binary compressed data be delivered to the
+output file unchanged.  On non-Unix systems the stdio library may perform
+newline translation or otherwise corrupt binary data.  To suppress this
+behavior, you may need to use a "b" option to fopen (as shown above), or use
+setmode() or another routine to put the stdio stream in binary mode.  See
+cjpeg.c and djpeg.c for code that has been found to work on many systems.
+
+You can select the data destination after setting other parameters (step 3),
+if that's more convenient.  You may not change the destination between
+calling jpeg_start_compress() and jpeg_finish_compress().
+
+
+3. Set parameters for compression, including image size & colorspace.
+
+You must supply information about the source image by setting the following
+fields in the JPEG object (cinfo structure):
+
+        image_width             Width of image, in pixels
+        image_height            Height of image, in pixels
+        input_components        Number of color channels (samples per pixel)
+        in_color_space          Color space of source image
+
+The image dimensions are, hopefully, obvious.  JPEG supports image dimensions
+of 1 to 64K pixels in either direction.  The input color space is typically
+RGB or grayscale, and input_components is 3 or 1 accordingly.  (See "Special
+color spaces", later, for more info.)  The in_color_space field must be
+assigned one of the J_COLOR_SPACE enum constants, typically JCS_RGB or
+JCS_GRAYSCALE.
+
+JPEG has a large number of compression parameters that determine how the
+image is encoded.  Most applications don't need or want to know about all
+these parameters.  You can set all the parameters to reasonable defaults by
+calling jpeg_set_defaults(); then, if there are particular values you want
+to change, you can do so after that.  The "Compression parameter selection"
+section tells about all the parameters.
+
+You must set in_color_space correctly before calling jpeg_set_defaults(),
+because the defaults depend on the source image colorspace.  However the
+other three source image parameters need not be valid until you call
+jpeg_start_compress().  There's no harm in calling jpeg_set_defaults() more
+than once, if that happens to be convenient.
+
+Typical code for a 24-bit RGB source image is
+
+        cinfo.image_width = Width;      /* image width and height, in pixels */
+        cinfo.image_height = Height;
+        cinfo.input_components = 3;     /* # of color components per pixel */
+        cinfo.in_color_space = JCS_RGB; /* colorspace of input image */
+
+        jpeg_set_defaults(&cinfo);
+        /* Make optional parameter settings here */
+
+
+4. jpeg_start_compress(...);
+
+After you have established the data destination and set all the necessary
+source image info and other parameters, call jpeg_start_compress() to begin
+a compression cycle.  This will initialize internal state, allocate working
+storage, and emit the first few bytes of the JPEG datastream header.
+
+Typical code:
+
+        jpeg_start_compress(&cinfo, TRUE);
+
+The "TRUE" parameter ensures that a complete JPEG interchange datastream
+will be written.  This is appropriate in most cases.  If you think you might
+want to use an abbreviated datastream, read the section on abbreviated
+datastreams, below.
+
+Once you have called jpeg_start_compress(), you may not alter any JPEG
+parameters or other fields of the JPEG object until you have completed
+the compression cycle.
+
+
+5. while (scan lines remain to be written)
+        jpeg_write_scanlines(...);
+
+Now write all the required image data by calling jpeg_write_scanlines()
+one or more times.  You can pass one or more scanlines in each call, up
+to the total image height.  In most applications it is convenient to pass
+just one or a few scanlines at a time.  The expected format for the passed
+data is discussed under "Data formats", above.
+
+Image data should be written in top-to-bottom scanline order.  The JPEG spec
+contains some weasel wording about how top and bottom are application-defined
+terms (a curious interpretation of the English language...) but if you want
+your files to be compatible with everyone else's, you WILL use top-to-bottom
+order.  If the source data must be read in bottom-to-top order, you can use
+the JPEG library's virtual array mechanism to invert the data efficiently.
+Examples of this can be found in the sample application cjpeg.
+
+The library maintains a count of the number of scanlines written so far
+in the next_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.next_scanline < cinfo.image_height)".
+
+Code for this step depends heavily on the way that you store the source data.
+example.c shows the following code for the case of a full-size 2-D source
+array containing 3-byte RGB pixels:
+
+        JSAMPROW row_pointer[1];        /* pointer to a single row */
+        int row_stride;                 /* physical row width in buffer */
+
+        row_stride = image_width * 3;   /* JSAMPLEs per row in image_buffer */
+
+        while (cinfo.next_scanline < cinfo.image_height) {
+            row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
+            jpeg_write_scanlines(&cinfo, row_pointer, 1);
+        }
+
+jpeg_write_scanlines() returns the number of scanlines actually written.
+This will normally be equal to the number passed in, so you can usually
+ignore the return value.  It is different in just two cases:
+  * If you try to write more scanlines than the declared image height,
+    the additional scanlines are ignored.
+  * If you use a suspending data destination manager, output buffer overrun
+    will cause the compressor to return before accepting all the passed lines.
+    This feature is discussed under "I/O suspension", below.  The normal
+    stdio destination manager will NOT cause this to happen.
+In any case, the return value is the same as the change in the value of
+next_scanline.
+
+
+6. jpeg_finish_compress(...);
+
+After all the image data has been written, call jpeg_finish_compress() to
+complete the compression cycle.  This step is ESSENTIAL to ensure that the
+last bufferload of data is written to the data destination.
+jpeg_finish_compress() also releases working memory associated with the JPEG
+object.
+
+Typical code:
+
+        jpeg_finish_compress(&cinfo);
+
+If using the stdio destination manager, don't forget to close the output
+stdio stream (if necessary) afterwards.
+
+If you have requested a multi-pass operating mode, such as Huffman code
+optimization, jpeg_finish_compress() will perform the additional passes using
+data buffered by the first pass.  In this case jpeg_finish_compress() may take
+quite a while to complete.  With the default compression parameters, this will
+not happen.
+
+It is an error to call jpeg_finish_compress() before writing the necessary
+total number of scanlines.  If you wish to abort compression, call
+jpeg_abort() as discussed below.
+
+After completing a compression cycle, you may dispose of the JPEG object
+as discussed next, or you may use it to compress another image.  In that case
+return to step 2, 3, or 4 as appropriate.  If you do not change the
+destination manager, the new datastream will be written to the same target.
+If you do not change any JPEG parameters, the new datastream will be written
+with the same parameters as before.  Note that you can change the input image
+dimensions freely between cycles, but if you change the input colorspace, you
+should call jpeg_set_defaults() to adjust for the new colorspace; and then
+you'll need to repeat all of step 3.
+
+
+7. Release the JPEG compression object.
+
+When you are done with a JPEG compression object, destroy it by calling
+jpeg_destroy_compress().  This will free all subsidiary memory (regardless of
+the previous state of the object).  Or you can call jpeg_destroy(), which
+works for either compression or decompression objects --- this may be more
+convenient if you are sharing code between compression and decompression
+cases.  (Actually, these routines are equivalent except for the declared type
+of the passed pointer.  To avoid gripes from ANSI C compilers, jpeg_destroy()
+should be passed a j_common_ptr.)
+
+If you allocated the jpeg_compress_struct structure from malloc(), freeing
+it is your responsibility --- jpeg_destroy() won't.  Ditto for the error
+handler structure.
+
+Typical code:
+
+        jpeg_destroy_compress(&cinfo);
+
+
+8. Aborting.
+
+If you decide to abort a compression cycle before finishing, you can clean up
+in either of two ways:
+
+* If you don't need the JPEG object any more, just call
+  jpeg_destroy_compress() or jpeg_destroy() to release memory.  This is
+  legitimate at any point after calling jpeg_create_compress() --- in fact,
+  it's safe even if jpeg_create_compress() fails.
+
+* If you want to re-use the JPEG object, call jpeg_abort_compress(), or call
+  jpeg_abort() which works on both compression and decompression objects.
+  This will return the object to an idle state, releasing any working memory.
+  jpeg_abort() is allowed at any time after successful object creation.
+
+Note that cleaning up the data destination, if required, is your
+responsibility; neither of these routines will call term_destination().
+(See "Compressed data handling", below, for more about that.)
+
+jpeg_destroy() and jpeg_abort() are the only safe calls to make on a JPEG
+object that has reported an error by calling error_exit (see "Error handling"
+for more info).  The internal state of such an object is likely to be out of
+whack.  Either of these two routines will return the object to a known state.
+
+
+Decompression details
+---------------------
+
+Here we revisit the JPEG decompression outline given in the overview.
+
+1. Allocate and initialize a JPEG decompression object.
+
+This is just like initialization for compression, as discussed above,
+except that the object is a "struct jpeg_decompress_struct" and you
+call jpeg_create_decompress().  Error handling is exactly the same.
+
+Typical code:
+
+        struct jpeg_decompress_struct cinfo;
+        struct jpeg_error_mgr jerr;
+        ...
+        cinfo.err = jpeg_std_error(&jerr);
+        jpeg_create_decompress(&cinfo);
+
+(Both here and in the IJG code, we usually use variable name "cinfo" for
+both compression and decompression objects.)
+
+
+2. Specify the source of the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library reads compressed data from a "data
+source" module.  The library includes one data source module which knows how
+to read from a stdio stream.  You can use your own source module if you want
+to do something else, as discussed later.
+
+If you use the standard source module, you must open the source stdio stream
+beforehand.  Typical code for this step looks like:
+
+        FILE *infile;
+        ...
+        if ((infile = fopen(filename, "rb")) == NULL) {
+            fprintf(stderr, "can't open %s\n", filename);
+            exit(1);
+        }
+        jpeg_stdio_src(&cinfo, infile);
+
+where the last line invokes the standard source module.
+
+WARNING: it is critical that the binary compressed data be read unchanged.
+On non-Unix systems the stdio library may perform newline translation or
+otherwise corrupt binary data.  To suppress this behavior, you may need to use
+a "b" option to fopen (as shown above), or use setmode() or another routine to
+put the stdio stream in binary mode.  See cjpeg.c and djpeg.c for code that
+has been found to work on many systems.
+
+You may not change the data source between calling jpeg_read_header() and
+jpeg_finish_decompress().  If you wish to read a series of JPEG images from
+a single source file, you should repeat the jpeg_read_header() to
+jpeg_finish_decompress() sequence without reinitializing either the JPEG
+object or the data source module; this prevents buffered input data from
+being discarded.
+
+
+3. Call jpeg_read_header() to obtain image info.
+
+Typical code for this step is just
+
+        jpeg_read_header(&cinfo, TRUE);
+
+This will read the source datastream header markers, up to the beginning
+of the compressed data proper.  On return, the image dimensions and other
+info have been stored in the JPEG object.  The application may wish to
+consult this information before selecting decompression parameters.
+
+More complex code is necessary if
+  * A suspending data source is used --- in that case jpeg_read_header()
+    may return before it has read all the header data.  See "I/O suspension",
+    below.  The normal stdio source manager will NOT cause this to happen.
+  * Abbreviated JPEG files are to be processed --- see the section on
+    abbreviated datastreams.  Standard applications that deal only in
+    interchange JPEG files need not be concerned with this case either.
+
+It is permissible to stop at this point if you just wanted to find out the
+image dimensions and other header info for a JPEG file.  In that case,
+call jpeg_destroy() when you are done with the JPEG object, or call
+jpeg_abort() to return it to an idle state before selecting a new data
+source and reading another header.
+
+
+4. Set parameters for decompression.
+
+jpeg_read_header() sets appropriate default decompression parameters based on
+the properties of the image (in particular, its colorspace).  However, you
+may well want to alter these defaults before beginning the decompression.
+For example, the default is to produce full color output from a color file.
+If you want colormapped output you must ask for it.  Other options allow the
+returned image to be scaled and allow various speed/quality tradeoffs to be
+selected.  "Decompression parameter selection", below, gives details.
+
+If the defaults are appropriate, nothing need be done at this step.
+
+Note that all default values are set by each call to jpeg_read_header().
+If you reuse a decompression object, you cannot expect your parameter
+settings to be preserved across cycles, as you can for compression.
+You must set desired parameter values each time.
+
+
+5. jpeg_start_decompress(...);
+
+Once the parameter values are satisfactory, call jpeg_start_decompress() to
+begin decompression.  This will initialize internal state, allocate working
+memory, and prepare for returning data.
+
+Typical code is just
+
+        jpeg_start_decompress(&cinfo);
+
+If you have requested a multi-pass operating mode, such as 2-pass color
+quantization, jpeg_start_decompress() will do everything needed before data
+output can begin.  In this case jpeg_start_decompress() may take quite a while
+to complete.  With a single-scan (non-progressive) JPEG file and default
+decompression parameters, this will not happen; jpeg_start_decompress() will
+return quickly.
+
+After this call, the final output image dimensions, including any requested
+scaling, are available in the JPEG object; so is the selected colormap, if
+colormapped output has been requested.  Useful fields include
+
+        output_width            image width and height, as scaled
+        output_height
+        out_color_components    # of color components in out_color_space
+        output_components       # of color components returned per pixel
+        colormap                the selected colormap, if any
+        actual_number_of_colors         number of entries in colormap
+
+output_components is 1 (a colormap index) when quantizing colors; otherwise it
+equals out_color_components.  It is the number of JSAMPLE values that will be
+emitted per pixel in the output arrays.
+
+Typically you will need to allocate data buffers to hold the incoming image.
+You will need output_width * output_components JSAMPLEs per scanline in your
+output buffer, and a total of output_height scanlines will be returned.
+
+Note: if you are using the JPEG library's internal memory manager to allocate
+data buffers (as djpeg does), then the manager's protocol requires that you
+request large buffers *before* calling jpeg_start_decompress().  This is a
+little tricky since the output_XXX fields are not normally valid then.  You
+can make them valid by calling jpeg_calc_output_dimensions() after setting the
+relevant parameters (scaling, output color space, and quantization flag).
+
+
+6. while (scan lines remain to be read)
+        jpeg_read_scanlines(...);
+
+Now you can read the decompressed image data by calling jpeg_read_scanlines()
+one or more times.  At each call, you pass in the maximum number of scanlines
+to be read (ie, the height of your working buffer); jpeg_read_scanlines()
+will return up to that many lines.  The return value is the number of lines
+actually read.  The format of the returned data is discussed under "Data
+formats", above.  Don't forget that grayscale and color JPEGs will return
+different data formats!
+
+Image data is returned in top-to-bottom scanline order.  If you must write
+out the image in bottom-to-top order, you can use the JPEG library's virtual
+array mechanism to invert the data efficiently.  Examples of this can be
+found in the sample application djpeg.
+
+The library maintains a count of the number of scanlines returned so far
+in the output_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.output_scanline < cinfo.output_height)".  (Note that the test
+should NOT be against image_height, unless you never use scaling.  The
+image_height field is the height of the original unscaled image.)
+The return value always equals the change in the value of output_scanline.
+
+If you don't use a suspending data source, it is safe to assume that
+jpeg_read_scanlines() reads at least one scanline per call, until the
+bottom of the image has been reached.
+
+If you use a buffer larger than one scanline, it is NOT safe to assume that
+jpeg_read_scanlines() fills it.  (The current implementation returns only a
+few scanlines per call, no matter how large a buffer you pass.)  So you must
+always provide a loop that calls jpeg_read_scanlines() repeatedly until the
+whole image has been read.
+
+
+7. jpeg_finish_decompress(...);
+
+After all the image data has been read, call jpeg_finish_decompress() to
+complete the decompression cycle.  This causes working memory associated
+with the JPEG object to be released.
+
+Typical code:
+
+        jpeg_finish_decompress(&cinfo);
+
+If using the stdio source manager, don't forget to close the source stdio
+stream if necessary.
+
+It is an error to call jpeg_finish_decompress() before reading the correct
+total number of scanlines.  If you wish to abort decompression, call
+jpeg_abort() as discussed below.
+
+After completing a decompression cycle, you may dispose of the JPEG object as
+discussed next, or you may use it to decompress another image.  In that case
+return to step 2 or 3 as appropriate.  If you do not change the source
+manager, the next image will be read from the same source.
+
+
+8. Release the JPEG decompression object.
+
+When you are done with a JPEG decompression object, destroy it by calling
+jpeg_destroy_decompress() or jpeg_destroy().  The previous discussion of
+destroying compression objects applies here too.
+
+Typical code:
+
+        jpeg_destroy_decompress(&cinfo);
+
+
+9. Aborting.
+
+You can abort a decompression cycle by calling jpeg_destroy_decompress() or
+jpeg_destroy() if you don't need the JPEG object any more, or
+jpeg_abort_decompress() or jpeg_abort() if you want to reuse the object.
+The previous discussion of aborting compression cycles applies here too.
+
+
+Partial image decompression
+---------------------------
+
+Partial image decompression is convenient for performance-critical applications
+that wish to view only a portion of a large JPEG image without decompressing
+the whole thing.  It is also useful in memory-constrained environments (such as
+on mobile devices).  This library provides the following functions to support
+partial image decompression:
+
+1. Skipping rows when decompressing
+
+        jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+
+This function provides application programmers with the ability to skip over
+multiple rows in the JPEG image.
+
+Suspending data sources are not supported by this function.  Calling
+jpeg_skip_scanlines() with a suspending data source will result in undefined
+behavior.
+
+jpeg_skip_scanlines() will not allow skipping past the bottom of the image.  If
+the value of num_lines is large enough to skip past the bottom of the image,
+then the function will skip to the end of the image instead.
+
+If the value of num_lines is valid, then jpeg_skip_scanlines() will always
+skip all of the input rows requested.  There is no need to inspect the return
+value of the function in that case.
+
+Best results will be achieved by calling jpeg_skip_scanlines() for large chunks
+of rows.  The function should be viewed as a way to quickly jump to a
+particular vertical offset in the JPEG image in order to decode a subset of the
+image.  Used in this manner, it will provide significant performance
+improvements.
+
+Calling jpeg_skip_scanlines() for small values of num_lines has several
+potential drawbacks:
+    1) JPEG decompression occurs in blocks, so if jpeg_skip_scanlines() is
+       called from the middle of a decompression block, then it is likely that
+       much of the decompression work has already been done for the first
+       couple of rows that need to be skipped.
+    2) When this function returns, it must leave the decompressor in a state
+       such that it is ready to read the next line.  This may involve
+       decompressing a block that must be partially skipped.
+These issues are especially tricky for cases in which upsampling requires
+context rows.  In the worst case, jpeg_skip_scanlines() will perform similarly
+to jpeg_read_scanlines() (since it will actually call jpeg_read_scanlines().)
+
+2. Decompressing partial scanlines
+
+        jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                            JDIMENSION *width)
+
+This function provides application programmers with the ability to decompress
+only a portion of each row in the JPEG image.  It must be called after
+jpeg_start_decompress() and before any calls to jpeg_read_scanlines() or
+jpeg_skip_scanlines().
+
+If xoffset and width do not form a valid subset of the image row, then this
+function will generate an error.  Note that if the output image is scaled, then
+xoffset and width are relative to the scaled image dimensions.
+
+xoffset and width are passed by reference because xoffset must fall on an iMCU
+boundary.  If it doesn't, then it will be moved left to the nearest iMCU
+boundary, and width will be increased accordingly.  If the calling program does
+not like the adjusted values of xoffset and width, then it can call
+jpeg_crop_scanline() again with new values (for instance, if it wants to move
+xoffset to the nearest iMCU boundary to the right instead of to the left.)
+
+After calling this function, cinfo->output_width will be set to the adjusted
+width.  This value should be used when allocating an output buffer to pass to
+jpeg_read_scanlines().
+
+The output image from a partial-width decompression will be identical to the
+corresponding image region from a full decode, with one exception:  The "fancy"
+(smooth) h2v2 (4:2:0) and h2v1 (4:2:2) upsampling algorithms fill in the
+missing chroma components by averaging the chroma components from neighboring
+pixels, except on the right and left edges of the image (where there are no
+neighboring pixels.)  When performing a partial-width decompression, these
+"fancy" upsampling algorithms may treat the left and right edges of the partial
+image region as if they are the left and right edges of the image, meaning that
+the upsampling algorithm may be simplified.  The result is that the pixels on
+the left or right edge of the partial image may not be exactly identical to the
+corresponding pixels in the original image.
+
+
+Mechanics of usage: include files, linking, etc
+-----------------------------------------------
+
+Applications using the JPEG library should include the header file jpeglib.h
+to obtain declarations of data types and routines.  Before including
+jpeglib.h, include system headers that define at least the typedefs FILE and
+size_t.  On ANSI-conforming systems, including <stdio.h> is sufficient; on
+older Unix systems, you may need <sys/types.h> to define size_t.
+
+If the application needs to refer to individual JPEG library error codes, also
+include jerror.h to define those symbols.
+
+jpeglib.h indirectly includes the files jconfig.h and jmorecfg.h.  If you are
+installing the JPEG header files in a system directory, you will want to
+install all four files: jpeglib.h, jerror.h, jconfig.h, jmorecfg.h.
+
+The most convenient way to include the JPEG code into your executable program
+is to prepare a library file ("libjpeg.a", or a corresponding name on non-Unix
+machines) and reference it at your link step.  If you use only half of the
+library (only compression or only decompression), only that much code will be
+included from the library, unless your linker is hopelessly brain-damaged.
+The supplied makefiles build libjpeg.a automatically (see install.txt).
+
+While you can build the JPEG library as a shared library if the whim strikes
+you, we don't really recommend it.  The trouble with shared libraries is that
+at some point you'll probably try to substitute a new version of the library
+without recompiling the calling applications.  That generally doesn't work
+because the parameter struct declarations usually change with each new
+version.  In other words, the library's API is *not* guaranteed binary
+compatible across versions; we only try to ensure source-code compatibility.
+(In hindsight, it might have been smarter to hide the parameter structs from
+applications and introduce a ton of access functions instead.  Too late now,
+however.)
+
+It may be worth pointing out that the core JPEG library does not actually
+require the stdio library: only the default source/destination managers and
+error handler need it.  You can use the library in a stdio-less environment
+if you replace those modules and use jmemnobs.c (or another memory manager of
+your own devising).  More info about the minimum system library requirements
+may be found in jinclude.h.
+
+
+ADVANCED FEATURES
+=================
+
+Compression parameter selection
+-------------------------------
+
+This section describes all the optional parameters you can set for JPEG
+compression, as well as the "helper" routines provided to assist in this
+task.  Proper setting of some parameters requires detailed understanding
+of the JPEG standard; if you don't know what a parameter is for, it's best
+not to mess with it!  See REFERENCES in the README.ijg file for pointers to
+more info about JPEG.
+
+It's a good idea to call jpeg_set_defaults() first, even if you plan to set
+all the parameters; that way your code is more likely to work with future JPEG
+libraries that have additional parameters.  For the same reason, we recommend
+you use a helper routine where one is provided, in preference to twiddling
+cinfo fields directly.
+
+The helper routines are:
+
+jpeg_set_defaults (j_compress_ptr cinfo)
+        This routine sets all JPEG parameters to reasonable defaults, using
+        only the input image's color space (field in_color_space, which must
+        already be set in cinfo).  Many applications will only need to use
+        this routine and perhaps jpeg_set_quality().
+
+jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+        Sets the JPEG file's colorspace (field jpeg_color_space) as specified,
+        and sets other color-space-dependent parameters appropriately.  See
+        "Special color spaces", below, before using this.  A large number of
+        parameters, including all per-component parameters, are set by this
+        routine; if you want to twiddle individual parameters you should call
+        jpeg_set_colorspace() before rather than after.
+
+jpeg_default_colorspace (j_compress_ptr cinfo)
+        Selects an appropriate JPEG colorspace based on cinfo->in_color_space,
+        and calls jpeg_set_colorspace().  This is actually a subroutine of
+        jpeg_set_defaults().  It's broken out in case you want to change
+        just the colorspace-dependent JPEG parameters.
+
+jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+        Constructs JPEG quantization tables appropriate for the indicated
+        quality setting.  The quality value is expressed on the 0..100 scale
+        recommended by IJG (cjpeg's "-quality" switch uses this routine).
+        Note that the exact mapping from quality values to tables may change
+        in future IJG releases as more is learned about DCT quantization.
+        If the force_baseline parameter is TRUE, then the quantization table
+        entries are constrained to the range 1..255 for full JPEG baseline
+        compatibility.  In the current implementation, this only makes a
+        difference for quality settings below 25, and it effectively prevents
+        very small/low quality files from being generated.  The IJG decoder
+        is capable of reading the non-baseline files generated at low quality
+        settings when force_baseline is FALSE, but other decoders may not be.
+
+jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
+                         boolean force_baseline)
+        Same as jpeg_set_quality() except that the generated tables are the
+        sample tables given in the JPEG spec section K.1, multiplied by the
+        specified scale factor (which is expressed as a percentage; thus
+        scale_factor = 100 reproduces the spec's tables).  Note that larger
+        scale factors give lower quality.  This entry point is useful for
+        conforming to the Adobe PostScript DCT conventions, but we do not
+        recommend linear scaling as a user-visible quality scale otherwise.
+        force_baseline again constrains the computed table entries to 1..255.
+
+int jpeg_quality_scaling (int quality)
+        Converts a value on the IJG-recommended quality scale to a linear
+        scaling percentage.  Note that this routine may change or go away
+        in future releases --- IJG may choose to adopt a scaling method that
+        can't be expressed as a simple scalar multiplier, in which case the
+        premise of this routine collapses.  Caveat user.
+
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+        [libjpeg v7+ API/ABI emulation only]
+        Set default quantization tables with linear q_scale_factor[] values
+        (see below).
+
+jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
+                      const unsigned int *basic_table,
+                      int scale_factor, boolean force_baseline)
+        Allows an arbitrary quantization table to be created.  which_tbl
+        indicates which table slot to fill.  basic_table points to an array
+        of 64 unsigned ints given in normal array order.  These values are
+        multiplied by scale_factor/100 and then clamped to the range 1..65535
+        (or to 1..255 if force_baseline is TRUE).
+        CAUTION: prior to library version 6a, jpeg_add_quant_table expected
+        the basic table to be given in JPEG zigzag order.  If you need to
+        write code that works with either older or newer versions of this
+        routine, you must check the library version number.  Something like
+        "#if JPEG_LIB_VERSION >= 61" is the right test.
+
+jpeg_simple_progression (j_compress_ptr cinfo)
+        Generates a default scan script for writing a progressive-JPEG file.
+        This is the recommended method of creating a progressive file,
+        unless you want to make a custom scan sequence.  You must ensure that
+        the JPEG color space is set correctly before calling this routine.
+
+
+Compression parameters (cinfo fields) include:
+
+boolean arith_code
+	If TRUE, use arithmetic coding.
+	If FALSE, use Huffman coding.
+
+J_DCT_METHOD dct_method
+        Selects the algorithm used for the DCT step.  Choices are:
+                JDCT_ISLOW: slow but accurate integer algorithm
+                JDCT_IFAST: faster, less accurate integer method
+                JDCT_FLOAT: floating-point method
+                JDCT_DEFAULT: default method (normally JDCT_ISLOW)
+                JDCT_FASTEST: fastest method (normally JDCT_IFAST)
+        In libjpeg-turbo, JDCT_IFAST is generally about 5-15% faster than
+        JDCT_ISLOW when using the x86/x86-64 SIMD extensions (results may vary
+        with other SIMD implementations, or when using libjpeg-turbo without
+        SIMD extensions.)  For quality levels of 90 and below, there should be
+        little or no perceptible difference between the two algorithms.  For
+        quality levels above 90, however, the difference between JDCT_IFAST and
+        JDCT_ISLOW becomes more pronounced.  With quality=97, for instance,
+        JDCT_IFAST incurs generally about a 1-3 dB loss (in PSNR) relative to
+        JDCT_ISLOW, but this can be larger for some images.  Do not use
+        JDCT_IFAST with quality levels above 97.  The algorithm often
+        degenerates at quality=98 and above and can actually produce a more
+        lossy image than if lower quality levels had been used.  Also, in
+        libjpeg-turbo, JDCT_IFAST is not fully accelerated for quality levels
+        above 97, so it will be slower than JDCT_ISLOW.  JDCT_FLOAT is mainly a
+        legacy feature.  It does not produce significantly more accurate
+        results than the ISLOW method, and it is much slower.  The FLOAT method
+        may also give different results on different machines due to varying
+        roundoff behavior, whereas the integer methods should give the same
+        results on all machines.
+
+J_COLOR_SPACE jpeg_color_space
+int num_components
+        The JPEG color space and corresponding number of components; see
+        "Special color spaces", below, for more info.  We recommend using
+        jpeg_set_colorspace() if you want to change these.
+
+boolean optimize_coding
+        TRUE causes the compressor to compute optimal Huffman coding tables
+        for the image.  This requires an extra pass over the data and
+        therefore costs a good deal of space and time.  The default is
+        FALSE, which tells the compressor to use the supplied or default
+        Huffman tables.  In most cases optimal tables save only a few percent
+        of file size compared to the default tables.  Note that when this is
+        TRUE, you need not supply Huffman tables at all, and any you do
+        supply will be overwritten.
+
+unsigned int restart_interval
+int restart_in_rows
+        To emit restart markers in the JPEG file, set one of these nonzero.
+        Set restart_interval to specify the exact interval in MCU blocks.
+        Set restart_in_rows to specify the interval in MCU rows.  (If
+        restart_in_rows is not 0, then restart_interval is set after the
+        image width in MCUs is computed.)  Defaults are zero (no restarts).
+        One restart marker per MCU row is often a good choice.
+        NOTE: the overhead of restart markers is higher in grayscale JPEG
+        files than in color files, and MUCH higher in progressive JPEGs.
+        If you use restarts, you may want to use larger intervals in those
+        cases.
+
+const jpeg_scan_info *scan_info
+int num_scans
+        By default, scan_info is NULL; this causes the compressor to write a
+        single-scan sequential JPEG file.  If not NULL, scan_info points to
+        an array of scan definition records of length num_scans.  The
+        compressor will then write a JPEG file having one scan for each scan
+        definition record.  This is used to generate noninterleaved or
+        progressive JPEG files.  The library checks that the scan array
+        defines a valid JPEG scan sequence.  (jpeg_simple_progression creates
+        a suitable scan definition array for progressive JPEG.)  This is
+        discussed further under "Progressive JPEG support".
+
+int smoothing_factor
+        If non-zero, the input image is smoothed; the value should range from
+        1 (minimal smoothing) to 100 (maximum smoothing).  Consult jcsample.c
+        for details of the smoothing algorithm.  The default is zero.
+
+boolean write_JFIF_header
+        If TRUE, a JFIF APP0 marker is emitted.  jpeg_set_defaults() and
+        jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space
+        (i.e., YCbCr or grayscale) is selected, otherwise FALSE.
+
+UINT8 JFIF_major_version
+UINT8 JFIF_minor_version
+        The version number to be written into the JFIF marker.
+        jpeg_set_defaults() initializes the version to 1.01 (major=minor=1).
+        You should set it to 1.02 (major=1, minor=2) if you plan to write
+        any JFIF 1.02 extension markers.
+
+UINT8 density_unit
+UINT16 X_density
+UINT16 Y_density
+        The resolution information to be written into the JFIF marker;
+        not used otherwise.  density_unit may be 0 for unknown,
+        1 for dots/inch, or 2 for dots/cm.  The default values are 0,1,1
+        indicating square pixels of unknown size.
+
+boolean write_Adobe_marker
+        If TRUE, an Adobe APP14 marker is emitted.  jpeg_set_defaults() and
+        jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK,
+        or YCCK is selected, otherwise FALSE.  It is generally a bad idea
+        to set both write_JFIF_header and write_Adobe_marker.  In fact,
+        you probably shouldn't change the default settings at all --- the
+        default behavior ensures that the JPEG file's color space can be
+        recognized by the decoder.
+
+JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]
+        Pointers to coefficient quantization tables, one per table slot,
+        or NULL if no table is defined for a slot.  Usually these should
+        be set via one of the above helper routines; jpeg_add_quant_table()
+        is general enough to define any quantization table.  The other
+        routines will set up table slot 0 for luminance quality and table
+        slot 1 for chrominance.
+
+int q_scale_factor[NUM_QUANT_TBLS]
+        [libjpeg v7+ API/ABI emulation only]
+        Linear quantization scaling factors (0-100, default 100)
+        for use with jpeg_default_qtables().
+        See rdswitch.c and cjpeg.c for an example of usage.
+        Note that the q_scale_factor[] values use "linear" scales, so JPEG
+        quality levels chosen by the user must be converted to these scales
+        using jpeg_quality_scaling().  Here is an example that corresponds to
+        cjpeg -quality 90,70:
+
+                jpeg_set_defaults(cinfo);
+
+                /* Set luminance quality 90. */
+                cinfo->q_scale_factor[0] = jpeg_quality_scaling(90);
+                /* Set chrominance quality 70. */
+                cinfo->q_scale_factor[1] = jpeg_quality_scaling(70);
+
+                jpeg_default_qtables(cinfo, force_baseline);
+
+        CAUTION: Setting separate quality levels for chrominance and luminance
+        is useful mainly when chrominance subsampling is disabled.  2x2
+        is mainly only useful if chrominance subsampling is disabled.  2x2
+        chrominance subsampling (AKA "4:2:0") is the default, but you can
+        explicitly disable subsampling as follows:
+
+                cinfo->comp_info[0].v_samp_factor = 1;
+                cinfo->comp_info[0].h_samp_factor = 1;
+
+JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
+JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
+        Pointers to Huffman coding tables, one per table slot, or NULL if
+        no table is defined for a slot.  Slots 0 and 1 are filled with the
+        JPEG sample tables by jpeg_set_defaults().  If you need to allocate
+        more table structures, jpeg_alloc_huff_table() may be used.
+        Note that optimal Huffman tables can be computed for an image
+        by setting optimize_coding, as discussed above; there's seldom
+        any need to mess with providing your own Huffman tables.
+
+
+[libjpeg v7+ API/ABI emulation only]
+The actual dimensions of the JPEG image that will be written to the file are
+given by the following fields.  These are computed from the input image
+dimensions and the compression parameters by jpeg_start_compress().  You can
+also call jpeg_calc_jpeg_dimensions() to obtain the values that will result
+from the current parameter settings.  This can be useful if you are trying
+to pick a scaling ratio that will get close to a desired target size.
+
+JDIMENSION jpeg_width           Actual dimensions of output image.
+JDIMENSION jpeg_height
+
+
+Per-component parameters are stored in the struct cinfo.comp_info[i] for
+component number i.  Note that components here refer to components of the
+JPEG color space, *not* the source image color space.  A suitably large
+comp_info[] array is allocated by jpeg_set_defaults(); if you choose not
+to use that routine, it's up to you to allocate the array.
+
+int component_id
+        The one-byte identifier code to be recorded in the JPEG file for
+        this component.  For the standard color spaces, we recommend you
+        leave the default values alone.
+
+int h_samp_factor
+int v_samp_factor
+        Horizontal and vertical sampling factors for the component; must
+        be 1..4 according to the JPEG standard.  Note that larger sampling
+        factors indicate a higher-resolution component; many people find
+        this behavior quite unintuitive.  The default values are 2,2 for
+        luminance components and 1,1 for chrominance components, except
+        for grayscale where 1,1 is used.
+
+int quant_tbl_no
+        Quantization table number for component.  The default value is
+        0 for luminance components and 1 for chrominance components.
+
+int dc_tbl_no
+int ac_tbl_no
+        DC and AC entropy coding table numbers.  The default values are
+        0 for luminance components and 1 for chrominance components.
+
+int component_index
+        Must equal the component's index in comp_info[].  (Beginning in
+        release v6, the compressor library will fill this in automatically;
+        you don't have to.)
+
+
+Decompression parameter selection
+---------------------------------
+
+Decompression parameter selection is somewhat simpler than compression
+parameter selection, since all of the JPEG internal parameters are
+recorded in the source file and need not be supplied by the application.
+(Unless you are working with abbreviated files, in which case see
+"Abbreviated datastreams", below.)  Decompression parameters control
+the postprocessing done on the image to deliver it in a format suitable
+for the application's use.  Many of the parameters control speed/quality
+tradeoffs, in which faster decompression may be obtained at the price of
+a poorer-quality image.  The defaults select the highest quality (slowest)
+processing.
+
+The following fields in the JPEG object are set by jpeg_read_header() and
+may be useful to the application in choosing decompression parameters:
+
+JDIMENSION image_width                  Width and height of image
+JDIMENSION image_height
+int num_components                      Number of color components
+J_COLOR_SPACE jpeg_color_space          Colorspace of image
+boolean saw_JFIF_marker                 TRUE if a JFIF APP0 marker was seen
+  UINT8 JFIF_major_version              Version information from JFIF marker
+  UINT8 JFIF_minor_version
+  UINT8 density_unit                    Resolution data from JFIF marker
+  UINT16 X_density
+  UINT16 Y_density
+boolean saw_Adobe_marker                TRUE if an Adobe APP14 marker was seen
+  UINT8 Adobe_transform                 Color transform code from Adobe marker
+
+The JPEG color space, unfortunately, is something of a guess since the JPEG
+standard proper does not provide a way to record it.  In practice most files
+adhere to the JFIF or Adobe conventions, and the decoder will recognize these
+correctly.  See "Special color spaces", below, for more info.
+
+
+The decompression parameters that determine the basic properties of the
+returned image are:
+
+J_COLOR_SPACE out_color_space
+        Output color space.  jpeg_read_header() sets an appropriate default
+        based on jpeg_color_space; typically it will be RGB or grayscale.
+        The application can change this field to request output in a different
+        colorspace.  For example, set it to JCS_GRAYSCALE to get grayscale
+        output from a color file.  (This is useful for previewing: grayscale
+        output is faster than full color since the color components need not
+        be processed.)  Note that not all possible color space transforms are
+        currently implemented; you may need to extend jdcolor.c if you want an
+        unusual conversion.
+
+unsigned int scale_num, scale_denom
+        Scale the image by the fraction scale_num/scale_denom.  Default is
+        1/1, or no scaling.  Currently, the only supported scaling ratios
+        are M/8 with all M from 1 to 16, or any reduced fraction thereof (such
+        as 1/2, 3/4, etc.)  (The library design allows for arbitrary
+        scaling ratios but this is not likely to be implemented any time soon.)
+        Smaller scaling ratios permit significantly faster decoding since
+        fewer pixels need be processed and a simpler IDCT method can be used.
+
+boolean quantize_colors
+        If set TRUE, colormapped output will be delivered.  Default is FALSE,
+        meaning that full-color output will be delivered.
+
+The next three parameters are relevant only if quantize_colors is TRUE.
+
+int desired_number_of_colors
+        Maximum number of colors to use in generating a library-supplied color
+        map (the actual number of colors is returned in a different field).
+        Default 256.  Ignored when the application supplies its own color map.
+
+boolean two_pass_quantize
+        If TRUE, an extra pass over the image is made to select a custom color
+        map for the image.  This usually looks a lot better than the one-size-
+        fits-all colormap that is used otherwise.  Default is TRUE.  Ignored
+        when the application supplies its own color map.
+
+J_DITHER_MODE dither_mode
+        Selects color dithering method.  Supported values are:
+                JDITHER_NONE    no dithering: fast, very low quality
+                JDITHER_ORDERED ordered dither: moderate speed and quality
+                JDITHER_FS      Floyd-Steinberg dither: slow, high quality
+        Default is JDITHER_FS.  (At present, ordered dither is implemented
+        only in the single-pass, standard-colormap case.  If you ask for
+        ordered dither when two_pass_quantize is TRUE or when you supply
+        an external color map, you'll get F-S dithering.)
+
+When quantize_colors is TRUE, the target color map is described by the next
+two fields.  colormap is set to NULL by jpeg_read_header().  The application
+can supply a color map by setting colormap non-NULL and setting
+actual_number_of_colors to the map size.  Otherwise, jpeg_start_decompress()
+selects a suitable color map and sets these two fields itself.
+[Implementation restriction: at present, an externally supplied colormap is
+only accepted for 3-component output color spaces.]
+
+JSAMPARRAY colormap
+        The color map, represented as a 2-D pixel array of out_color_components
+        rows and actual_number_of_colors columns.  Ignored if not quantizing.
+        CAUTION: if the JPEG library creates its own colormap, the storage
+        pointed to by this field is released by jpeg_finish_decompress().
+        Copy the colormap somewhere else first, if you want to save it.
+
+int actual_number_of_colors
+        The number of colors in the color map.
+
+Additional decompression parameters that the application may set include:
+
+J_DCT_METHOD dct_method
+        Selects the algorithm used for the DCT step.  Choices are:
+                JDCT_ISLOW: slow but accurate integer algorithm
+                JDCT_IFAST: faster, less accurate integer method
+                JDCT_FLOAT: floating-point method
+                JDCT_DEFAULT: default method (normally JDCT_ISLOW)
+                JDCT_FASTEST: fastest method (normally JDCT_IFAST)
+        In libjpeg-turbo, JDCT_IFAST is generally about 5-15% faster than
+        JDCT_ISLOW when using the x86/x86-64 SIMD extensions (results may vary
+        with other SIMD implementations, or when using libjpeg-turbo without
+        SIMD extensions.)  If the JPEG image was compressed using a quality
+        level of 85 or below, then there should be little or no perceptible
+        difference between the two algorithms.  When decompressing images that
+        were compressed using quality levels above 85, however, the difference
+        between JDCT_IFAST and JDCT_ISLOW becomes more pronounced.  With images
+        compressed using quality=97, for instance, JDCT_IFAST incurs generally
+        about a 4-6 dB loss (in PSNR) relative to JDCT_ISLOW, but this can be
+        larger for some images.  If you can avoid it, do not use JDCT_IFAST
+        when decompressing images that were compressed using quality levels
+        above 97.  The algorithm often degenerates for such images and can
+        actually produce a more lossy output image than if the JPEG image had
+        been compressed using lower quality levels.  JDCT_FLOAT is mainly a
+        legacy feature.  It does not produce significantly more accurate
+        results than the ISLOW method, and it is much slower.  The FLOAT method
+        may also give different results on different machines due to varying
+        roundoff behavior, whereas the integer methods should give the same
+        results on all machines.
+
+boolean do_fancy_upsampling
+        If TRUE, do careful upsampling of chroma components.  If FALSE,
+        a faster but sloppier method is used.  Default is TRUE.  The visual
+        impact of the sloppier method is often very small.
+
+boolean do_block_smoothing
+        If TRUE, interblock smoothing is applied in early stages of decoding
+        progressive JPEG files; if FALSE, not.  Default is TRUE.  Early
+        progression stages look "fuzzy" with smoothing, "blocky" without.
+        In any case, block smoothing ceases to be applied after the first few
+        AC coefficients are known to full accuracy, so it is relevant only
+        when using buffered-image mode for progressive images.
+
+boolean enable_1pass_quant
+boolean enable_external_quant
+boolean enable_2pass_quant
+        These are significant only in buffered-image mode, which is
+        described in its own section below.
+
+
+The output image dimensions are given by the following fields.  These are
+computed from the source image dimensions and the decompression parameters
+by jpeg_start_decompress().  You can also call jpeg_calc_output_dimensions()
+to obtain the values that will result from the current parameter settings.
+This can be useful if you are trying to pick a scaling ratio that will get
+close to a desired target size.  It's also important if you are using the
+JPEG library's memory manager to allocate output buffer space, because you
+are supposed to request such buffers *before* jpeg_start_decompress().
+
+JDIMENSION output_width         Actual dimensions of output image.
+JDIMENSION output_height
+int out_color_components        Number of color components in out_color_space.
+int output_components           Number of color components returned.
+int rec_outbuf_height           Recommended height of scanline buffer.
+
+When quantizing colors, output_components is 1, indicating a single color map
+index per pixel.  Otherwise it equals out_color_components.  The output arrays
+are required to be output_width * output_components JSAMPLEs wide.
+
+rec_outbuf_height is the recommended minimum height (in scanlines) of the
+buffer passed to jpeg_read_scanlines().  If the buffer is smaller, the
+library will still work, but time will be wasted due to unnecessary data
+copying.  In high-quality modes, rec_outbuf_height is always 1, but some
+faster, lower-quality modes set it to larger values (typically 2 to 4).
+If you are going to ask for a high-speed processing mode, you may as well
+go to the trouble of honoring rec_outbuf_height so as to avoid data copying.
+(An output buffer larger than rec_outbuf_height lines is OK, but won't
+provide any material speed improvement over that height.)
+
+
+Special color spaces
+--------------------
+
+The JPEG standard itself is "color blind" and doesn't specify any particular
+color space.  It is customary to convert color data to a luminance/chrominance
+color space before compressing, since this permits greater compression.  The
+existing de-facto JPEG file format standards specify YCbCr or grayscale data
+(JFIF), or grayscale, RGB, YCbCr, CMYK, or YCCK (Adobe).  For special
+applications such as multispectral images, other color spaces can be used,
+but it must be understood that such files will be unportable.
+
+The JPEG library can handle the most common colorspace conversions (namely
+RGB <=> YCbCr and CMYK <=> YCCK).  It can also deal with data of an unknown
+color space, passing it through without conversion.  If you deal extensively
+with an unusual color space, you can easily extend the library to understand
+additional color spaces and perform appropriate conversions.
+
+For compression, the source data's color space is specified by field
+in_color_space.  This is transformed to the JPEG file's color space given
+by jpeg_color_space.  jpeg_set_defaults() chooses a reasonable JPEG color
+space depending on in_color_space, but you can override this by calling
+jpeg_set_colorspace().  Of course you must select a supported transformation.
+jccolor.c currently supports the following transformations:
+        RGB => YCbCr
+        RGB => GRAYSCALE
+        YCbCr => GRAYSCALE
+        CMYK => YCCK
+plus the null transforms: GRAYSCALE => GRAYSCALE, RGB => RGB,
+YCbCr => YCbCr, CMYK => CMYK, YCCK => YCCK, and UNKNOWN => UNKNOWN.
+
+The de-facto file format standards (JFIF and Adobe) specify APPn markers that
+indicate the color space of the JPEG file.  It is important to ensure that
+these are written correctly, or omitted if the JPEG file's color space is not
+one of the ones supported by the de-facto standards.  jpeg_set_colorspace()
+will set the compression parameters to include or omit the APPn markers
+properly, so long as it is told the truth about the JPEG color space.
+For example, if you are writing some random 3-component color space without
+conversion, don't try to fake out the library by setting in_color_space and
+jpeg_color_space to JCS_YCbCr; use JCS_UNKNOWN.  You may want to write an
+APPn marker of your own devising to identify the colorspace --- see "Special
+markers", below.
+
+When told that the color space is UNKNOWN, the library will default to using
+luminance-quality compression parameters for all color components.  You may
+well want to change these parameters.  See the source code for
+jpeg_set_colorspace(), in jcparam.c, for details.
+
+For decompression, the JPEG file's color space is given in jpeg_color_space,
+and this is transformed to the output color space out_color_space.
+jpeg_read_header's setting of jpeg_color_space can be relied on if the file
+conforms to JFIF or Adobe conventions, but otherwise it is no better than a
+guess.  If you know the JPEG file's color space for certain, you can override
+jpeg_read_header's guess by setting jpeg_color_space.  jpeg_read_header also
+selects a default output color space based on (its guess of) jpeg_color_space;
+set out_color_space to override this.  Again, you must select a supported
+transformation.  jdcolor.c currently supports
+        YCbCr => RGB
+        YCbCr => GRAYSCALE
+        RGB => GRAYSCALE
+        GRAYSCALE => RGB
+        YCCK => CMYK
+as well as the null transforms.  (Since GRAYSCALE=>RGB is provided, an
+application can force grayscale JPEGs to look like color JPEGs if it only
+wants to handle one case.)
+
+The two-pass color quantizer, jquant2.c, is specialized to handle RGB data
+(it weights distances appropriately for RGB colors).  You'll need to modify
+the code if you want to use it for non-RGB output color spaces.  Note that
+jquant2.c is used to map to an application-supplied colormap as well as for
+the normal two-pass colormap selection process.
+
+CAUTION: it appears that Adobe Photoshop writes inverted data in CMYK JPEG
+files: 0 represents 100% ink coverage, rather than 0% ink as you'd expect.
+This is arguably a bug in Photoshop, but if you need to work with Photoshop
+CMYK files, you will have to deal with it in your application.  We cannot
+"fix" this in the library by inverting the data during the CMYK<=>YCCK
+transform, because that would break other applications, notably Ghostscript.
+Photoshop versions prior to 3.0 write EPS files containing JPEG-encoded CMYK
+data in the same inverted-YCCK representation used in bare JPEG files, but
+the surrounding PostScript code performs an inversion using the PS image
+operator.  I am told that Photoshop 3.0 will write uninverted YCCK in
+EPS/JPEG files, and will omit the PS-level inversion.  (But the data
+polarity used in bare JPEG files will not change in 3.0.)  In either case,
+the JPEG library must not invert the data itself, or else Ghostscript would
+read these EPS files incorrectly.
+
+
+Error handling
+--------------
+
+When the default error handler is used, any error detected inside the JPEG
+routines will cause a message to be printed on stderr, followed by exit().
+You can supply your own error handling routines to override this behavior
+and to control the treatment of nonfatal warnings and trace/debug messages.
+The file example.c illustrates the most common case, which is to have the
+application regain control after an error rather than exiting.
+
+The JPEG library never writes any message directly; it always goes through
+the error handling routines.  Three classes of messages are recognized:
+  * Fatal errors: the library cannot continue.
+  * Warnings: the library can continue, but the data is corrupt, and a
+    damaged output image is likely to result.
+  * Trace/informational messages.  These come with a trace level indicating
+    the importance of the message; you can control the verbosity of the
+    program by adjusting the maximum trace level that will be displayed.
+
+You may, if you wish, simply replace the entire JPEG error handling module
+(jerror.c) with your own code.  However, you can avoid code duplication by
+only replacing some of the routines depending on the behavior you need.
+This is accomplished by calling jpeg_std_error() as usual, but then overriding
+some of the method pointers in the jpeg_error_mgr struct, as illustrated by
+example.c.
+
+All of the error handling routines will receive a pointer to the JPEG object
+(a j_common_ptr which points to either a jpeg_compress_struct or a
+jpeg_decompress_struct; if you need to tell which, test the is_decompressor
+field).  This struct includes a pointer to the error manager struct in its
+"err" field.  Frequently, custom error handler routines will need to access
+additional data which is not known to the JPEG library or the standard error
+handler.  The most convenient way to do this is to embed either the JPEG
+object or the jpeg_error_mgr struct in a larger structure that contains
+additional fields; then casting the passed pointer provides access to the
+additional fields.  Again, see example.c for one way to do it.  (Beginning
+with IJG version 6b, there is also a void pointer "client_data" in each
+JPEG object, which the application can also use to find related data.
+The library does not touch client_data at all.)
+
+The individual methods that you might wish to override are:
+
+error_exit (j_common_ptr cinfo)
+        Receives control for a fatal error.  Information sufficient to
+        generate the error message has been stored in cinfo->err; call
+        output_message to display it.  Control must NOT return to the caller;
+        generally this routine will exit() or longjmp() somewhere.
+        Typically you would override this routine to get rid of the exit()
+        default behavior.  Note that if you continue processing, you should
+        clean up the JPEG object with jpeg_abort() or jpeg_destroy().
+
+output_message (j_common_ptr cinfo)
+        Actual output of any JPEG message.  Override this to send messages
+        somewhere other than stderr.  Note that this method does not know
+        how to generate a message, only where to send it.
+
+format_message (j_common_ptr cinfo, char *buffer)
+        Constructs a readable error message string based on the error info
+        stored in cinfo->err.  This method is called by output_message.  Few
+        applications should need to override this method.  One possible
+        reason for doing so is to implement dynamic switching of error message
+        language.
+
+emit_message (j_common_ptr cinfo, int msg_level)
+        Decide whether or not to emit a warning or trace message; if so,
+        calls output_message.  The main reason for overriding this method
+        would be to abort on warnings.  msg_level is -1 for warnings,
+        0 and up for trace messages.
+
+Only error_exit() and emit_message() are called from the rest of the JPEG
+library; the other two are internal to the error handler.
+
+The actual message texts are stored in an array of strings which is pointed to
+by the field err->jpeg_message_table.  The messages are numbered from 0 to
+err->last_jpeg_message, and it is these code numbers that are used in the
+JPEG library code.  You could replace the message texts (for instance, with
+messages in French or German) by changing the message table pointer.  See
+jerror.h for the default texts.  CAUTION: this table will almost certainly
+change or grow from one library version to the next.
+
+It may be useful for an application to add its own message texts that are
+handled by the same mechanism.  The error handler supports a second "add-on"
+message table for this purpose.  To define an addon table, set the pointer
+err->addon_message_table and the message numbers err->first_addon_message and
+err->last_addon_message.  If you number the addon messages beginning at 1000
+or so, you won't have to worry about conflicts with the library's built-in
+messages.  See the sample applications cjpeg/djpeg for an example of using
+addon messages (the addon messages are defined in cderror.h).
+
+Actual invocation of the error handler is done via macros defined in jerror.h:
+        ERREXITn(...)   for fatal errors
+        WARNMSn(...)    for corrupt-data warnings
+        TRACEMSn(...)   for trace and informational messages.
+These macros store the message code and any additional parameters into the
+error handler struct, then invoke the error_exit() or emit_message() method.
+The variants of each macro are for varying numbers of additional parameters.
+The additional parameters are inserted into the generated message using
+standard printf() format codes.
+
+See jerror.h and jerror.c for further details.
+
+
+Compressed data handling (source and destination managers)
+----------------------------------------------------------
+
+The JPEG compression library sends its compressed data to a "destination
+manager" module.  The default destination manager just writes the data to a
+memory buffer or to a stdio stream, but you can provide your own manager to
+do something else.  Similarly, the decompression library calls a "source
+manager" to obtain the compressed data; you can provide your own source
+manager if you want the data to come from somewhere other than a memory
+buffer or a stdio stream.
+
+In both cases, compressed data is processed a bufferload at a time: the
+destination or source manager provides a work buffer, and the library invokes
+the manager only when the buffer is filled or emptied.  (You could define a
+one-character buffer to force the manager to be invoked for each byte, but
+that would be rather inefficient.)  The buffer's size and location are
+controlled by the manager, not by the library.  For example, the memory
+source manager just makes the buffer pointer and length point to the original
+data in memory.  In this case the buffer-reload procedure will be invoked
+only if the decompressor ran off the end of the datastream, which would
+indicate an erroneous datastream.
+
+The work buffer is defined as an array of datatype JOCTET, which is generally
+"char" or "unsigned char".  On a machine where char is not exactly 8 bits
+wide, you must define JOCTET as a wider data type and then modify the data
+source and destination modules to transcribe the work arrays into 8-bit units
+on external storage.
+
+A data destination manager struct contains a pointer and count defining the
+next byte to write in the work buffer and the remaining free space:
+
+        JOCTET *next_output_byte;   /* => next byte to write in buffer */
+        size_t free_in_buffer;      /* # of byte spaces remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is filled.  The manager's empty_output_buffer method must reset the pointer
+and count.  The manager is expected to remember the buffer's starting address
+and total size in private fields not visible to the library.
+
+A data destination manager provides three methods:
+
+init_destination (j_compress_ptr cinfo)
+        Initialize destination.  This is called by jpeg_start_compress()
+        before any data is actually written.  It must initialize
+        next_output_byte and free_in_buffer.  free_in_buffer must be
+        initialized to a positive value.
+
+empty_output_buffer (j_compress_ptr cinfo)
+        This is called whenever the buffer has filled (free_in_buffer
+        reaches zero).  In typical applications, it should write out the
+        *entire* buffer (use the saved start address and buffer length;
+        ignore the current state of next_output_byte and free_in_buffer).
+        Then reset the pointer & count to the start of the buffer, and
+        return TRUE indicating that the buffer has been dumped.
+        free_in_buffer must be set to a positive value when TRUE is
+        returned.  A FALSE return should only be used when I/O suspension is
+        desired (this operating mode is discussed in the next section).
+
+term_destination (j_compress_ptr cinfo)
+        Terminate destination --- called by jpeg_finish_compress() after all
+        data has been written.  In most applications, this must flush any
+        data remaining in the buffer.  Use either next_output_byte or
+        free_in_buffer to determine how much data is in the buffer.
+
+term_destination() is NOT called by jpeg_abort() or jpeg_destroy().  If you
+want the destination manager to be cleaned up during an abort, you must do it
+yourself.
+
+You will also need code to create a jpeg_destination_mgr struct, fill in its
+method pointers, and insert a pointer to the struct into the "dest" field of
+the JPEG compression object.  This can be done in-line in your setup code if
+you like, but it's probably cleaner to provide a separate routine similar to
+the jpeg_stdio_dest() or jpeg_mem_dest() routines of the supplied destination
+managers.
+
+Decompression source managers follow a parallel design, but with some
+additional features.  The source manager struct contains a pointer and count
+defining the next byte to read from the work buffer and the number of bytes
+remaining:
+
+        const JOCTET *next_input_byte;  /* => next byte to read from buffer */
+        size_t bytes_in_buffer;         /* # of bytes remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is emptied.  The manager's fill_input_buffer method must reset the pointer and
+count.  In most applications, the manager must remember the buffer's starting
+address and total size in private fields not visible to the library.
+
+A data source manager provides five methods:
+
+init_source (j_decompress_ptr cinfo)
+        Initialize source.  This is called by jpeg_read_header() before any
+        data is actually read.  Unlike init_destination(), it may leave
+        bytes_in_buffer set to 0 (in which case a fill_input_buffer() call
+        will occur immediately).
+
+fill_input_buffer (j_decompress_ptr cinfo)
+        This is called whenever bytes_in_buffer has reached zero and more
+        data is wanted.  In typical applications, it should read fresh data
+        into the buffer (ignoring the current state of next_input_byte and
+        bytes_in_buffer), reset the pointer & count to the start of the
+        buffer, and return TRUE indicating that the buffer has been reloaded.
+        It is not necessary to fill the buffer entirely, only to obtain at
+        least one more byte.  bytes_in_buffer MUST be set to a positive value
+        if TRUE is returned.  A FALSE return should only be used when I/O
+        suspension is desired (this mode is discussed in the next section).
+
+skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+        Skip num_bytes worth of data.  The buffer pointer and count should
+        be advanced over num_bytes input bytes, refilling the buffer as
+        needed.  This is used to skip over a potentially large amount of
+        uninteresting data (such as an APPn marker).  In some applications
+        it may be possible to optimize away the reading of the skipped data,
+        but it's not clear that being smart is worth much trouble; large
+        skips are uncommon.  bytes_in_buffer may be zero on return.
+        A zero or negative skip count should be treated as a no-op.
+
+resync_to_restart (j_decompress_ptr cinfo, int desired)
+        This routine is called only when the decompressor has failed to find
+        a restart (RSTn) marker where one is expected.  Its mission is to
+        find a suitable point for resuming decompression.  For most
+        applications, we recommend that you just use the default resync
+        procedure, jpeg_resync_to_restart().  However, if you are able to back
+        up in the input data stream, or if you have a priori knowledge about
+        the likely location of restart markers, you may be able to do better.
+        Read the read_restart_marker() and jpeg_resync_to_restart() routines
+        in jdmarker.c if you think you'd like to implement your own resync
+        procedure.
+
+term_source (j_decompress_ptr cinfo)
+        Terminate source --- called by jpeg_finish_decompress() after all
+        data has been read.  Often a no-op.
+
+For both fill_input_buffer() and skip_input_data(), there is no such thing
+as an EOF return.  If the end of the file has been reached, the routine has
+a choice of exiting via ERREXIT() or inserting fake data into the buffer.
+In most cases, generating a warning message and inserting a fake EOI marker
+is the best course of action --- this will allow the decompressor to output
+however much of the image is there.  In pathological cases, the decompressor
+may swallow the EOI and again demand data ... just keep feeding it fake EOIs.
+jdatasrc.c illustrates the recommended error recovery behavior.
+
+term_source() is NOT called by jpeg_abort() or jpeg_destroy().  If you want
+the source manager to be cleaned up during an abort, you must do it yourself.
+
+You will also need code to create a jpeg_source_mgr struct, fill in its method
+pointers, and insert a pointer to the struct into the "src" field of the JPEG
+decompression object.  This can be done in-line in your setup code if you
+like, but it's probably cleaner to provide a separate routine similar to the
+jpeg_stdio_src() or jpeg_mem_src() routines of the supplied source managers.
+
+For more information, consult the memory and stdio source and destination
+managers in jdatasrc.c and jdatadst.c.
+
+
+I/O suspension
+--------------
+
+Some applications need to use the JPEG library as an incremental memory-to-
+memory filter: when the compressed data buffer is filled or emptied, they want
+control to return to the outer loop, rather than expecting that the buffer can
+be emptied or reloaded within the data source/destination manager subroutine.
+The library supports this need by providing an "I/O suspension" mode, which we
+describe in this section.
+
+The I/O suspension mode is not a panacea: nothing is guaranteed about the
+maximum amount of time spent in any one call to the library, so it will not
+eliminate response-time problems in single-threaded applications.  If you
+need guaranteed response time, we suggest you "bite the bullet" and implement
+a real multi-tasking capability.
+
+To use I/O suspension, cooperation is needed between the calling application
+and the data source or destination manager; you will always need a custom
+source/destination manager.  (Please read the previous section if you haven't
+already.)  The basic idea is that the empty_output_buffer() or
+fill_input_buffer() routine is a no-op, merely returning FALSE to indicate
+that it has done nothing.  Upon seeing this, the JPEG library suspends
+operation and returns to its caller.  The surrounding application is
+responsible for emptying or refilling the work buffer before calling the
+JPEG library again.
+
+Compression suspension:
+
+For compression suspension, use an empty_output_buffer() routine that returns
+FALSE; typically it will not do anything else.  This will cause the
+compressor to return to the caller of jpeg_write_scanlines(), with the return
+value indicating that not all the supplied scanlines have been accepted.
+The application must make more room in the output buffer, adjust the output
+buffer pointer/count appropriately, and then call jpeg_write_scanlines()
+again, pointing to the first unconsumed scanline.
+
+When forced to suspend, the compressor will backtrack to a convenient stopping
+point (usually the start of the current MCU); it will regenerate some output
+data when restarted.  Therefore, although empty_output_buffer() is only
+called when the buffer is filled, you should NOT write out the entire buffer
+after a suspension.  Write only the data up to the current position of
+next_output_byte/free_in_buffer.  The data beyond that point will be
+regenerated after resumption.
+
+Because of the backtracking behavior, a good-size output buffer is essential
+for efficiency; you don't want the compressor to suspend often.  (In fact, an
+overly small buffer could lead to infinite looping, if a single MCU required
+more data than would fit in the buffer.)  We recommend a buffer of at least
+several Kbytes.  You may want to insert explicit code to ensure that you don't
+call jpeg_write_scanlines() unless there is a reasonable amount of space in
+the output buffer; in other words, flush the buffer before trying to compress
+more data.
+
+The compressor does not allow suspension while it is trying to write JPEG
+markers at the beginning and end of the file.  This means that:
+  * At the beginning of a compression operation, there must be enough free
+    space in the output buffer to hold the header markers (typically 600 or
+    so bytes).  The recommended buffer size is bigger than this anyway, so
+    this is not a problem as long as you start with an empty buffer.  However,
+    this restriction might catch you if you insert large special markers, such
+    as a JFIF thumbnail image, without flushing the buffer afterwards.
+  * When you call jpeg_finish_compress(), there must be enough space in the
+    output buffer to emit any buffered data and the final EOI marker.  In the
+    current implementation, half a dozen bytes should suffice for this, but
+    for safety's sake we recommend ensuring that at least 100 bytes are free
+    before calling jpeg_finish_compress().
+
+A more significant restriction is that jpeg_finish_compress() cannot suspend.
+This means you cannot use suspension with multi-pass operating modes, namely
+Huffman code optimization and multiple-scan output.  Those modes write the
+whole file during jpeg_finish_compress(), which will certainly result in
+buffer overrun.  (Note that this restriction applies only to compression,
+not decompression.  The decompressor supports input suspension in all of its
+operating modes.)
+
+Decompression suspension:
+
+For decompression suspension, use a fill_input_buffer() routine that simply
+returns FALSE (except perhaps during error recovery, as discussed below).
+This will cause the decompressor to return to its caller with an indication
+that suspension has occurred.  This can happen at four places:
+  * jpeg_read_header(): will return JPEG_SUSPENDED.
+  * jpeg_start_decompress(): will return FALSE, rather than its usual TRUE.
+  * jpeg_read_scanlines(): will return the number of scanlines already
+        completed (possibly 0).
+  * jpeg_finish_decompress(): will return FALSE, rather than its usual TRUE.
+The surrounding application must recognize these cases, load more data into
+the input buffer, and repeat the call.  In the case of jpeg_read_scanlines(),
+increment the passed pointers past any scanlines successfully read.
+
+Just as with compression, the decompressor will typically backtrack to a
+convenient restart point before suspending.  When fill_input_buffer() is
+called, next_input_byte/bytes_in_buffer point to the current restart point,
+which is where the decompressor will backtrack to if FALSE is returned.
+The data beyond that position must NOT be discarded if you suspend; it needs
+to be re-read upon resumption.  In most implementations, you'll need to shift
+this data down to the start of your work buffer and then load more data after
+it.  Again, this behavior means that a several-Kbyte work buffer is essential
+for decent performance; furthermore, you should load a reasonable amount of
+new data before resuming decompression.  (If you loaded, say, only one new
+byte each time around, you could waste a LOT of cycles.)
+
+The skip_input_data() source manager routine requires special care in a
+suspension scenario.  This routine is NOT granted the ability to suspend the
+decompressor; it can decrement bytes_in_buffer to zero, but no more.  If the
+requested skip distance exceeds the amount of data currently in the input
+buffer, then skip_input_data() must set bytes_in_buffer to zero and record the
+additional skip distance somewhere else.  The decompressor will immediately
+call fill_input_buffer(), which should return FALSE, which will cause a
+suspension return.  The surrounding application must then arrange to discard
+the recorded number of bytes before it resumes loading the input buffer.
+(Yes, this design is rather baroque, but it avoids complexity in the far more
+common case where a non-suspending source manager is used.)
+
+If the input data has been exhausted, we recommend that you emit a warning
+and insert dummy EOI markers just as a non-suspending data source manager
+would do.  This can be handled either in the surrounding application logic or
+within fill_input_buffer(); the latter is probably more efficient.  If
+fill_input_buffer() knows that no more data is available, it can set the
+pointer/count to point to a dummy EOI marker and then return TRUE just as
+though it had read more data in a non-suspending situation.
+
+The decompressor does not attempt to suspend within standard JPEG markers;
+instead it will backtrack to the start of the marker and reprocess the whole
+marker next time.  Hence the input buffer must be large enough to hold the
+longest standard marker in the file.  Standard JPEG markers should normally
+not exceed a few hundred bytes each (DHT tables are typically the longest).
+We recommend at least a 2K buffer for performance reasons, which is much
+larger than any correct marker is likely to be.  For robustness against
+damaged marker length counts, you may wish to insert a test in your
+application for the case that the input buffer is completely full and yet
+the decoder has suspended without consuming any data --- otherwise, if this
+situation did occur, it would lead to an endless loop.  (The library can't
+provide this test since it has no idea whether "the buffer is full", or
+even whether there is a fixed-size input buffer.)
+
+The input buffer would need to be 64K to allow for arbitrary COM or APPn
+markers, but these are handled specially: they are either saved into allocated
+memory, or skipped over by calling skip_input_data().  In the former case,
+suspension is handled correctly, and in the latter case, the problem of
+buffer overrun is placed on skip_input_data's shoulders, as explained above.
+Note that if you provide your own marker handling routine for large markers,
+you should consider how to deal with buffer overflow.
+
+Multiple-buffer management:
+
+In some applications it is desirable to store the compressed data in a linked
+list of buffer areas, so as to avoid data copying.  This can be handled by
+having empty_output_buffer() or fill_input_buffer() set the pointer and count
+to reference the next available buffer; FALSE is returned only if no more
+buffers are available.  Although seemingly straightforward, there is a
+pitfall in this approach: the backtrack that occurs when FALSE is returned
+could back up into an earlier buffer.  For example, when fill_input_buffer()
+is called, the current pointer & count indicate the backtrack restart point.
+Since fill_input_buffer() will set the pointer and count to refer to a new
+buffer, the restart position must be saved somewhere else.  Suppose a second
+call to fill_input_buffer() occurs in the same library call, and no
+additional input data is available, so fill_input_buffer() must return FALSE.
+If the JPEG library has not moved the pointer/count forward in the current
+buffer, then *the correct restart point is the saved position in the prior
+buffer*.  Prior buffers may be discarded only after the library establishes
+a restart point within a later buffer.  Similar remarks apply for output into
+a chain of buffers.
+
+The library will never attempt to backtrack over a skip_input_data() call,
+so any skipped data can be permanently discarded.  You still have to deal
+with the case of skipping not-yet-received data, however.
+
+It's much simpler to use only a single buffer; when fill_input_buffer() is
+called, move any unconsumed data (beyond the current pointer/count) down to
+the beginning of this buffer and then load new data into the remaining buffer
+space.  This approach requires a little more data copying but is far easier
+to get right.
+
+
+Progressive JPEG support
+------------------------
+
+Progressive JPEG rearranges the stored data into a series of scans of
+increasing quality.  In situations where a JPEG file is transmitted across a
+slow communications link, a decoder can generate a low-quality image very
+quickly from the first scan, then gradually improve the displayed quality as
+more scans are received.  The final image after all scans are complete is
+identical to that of a regular (sequential) JPEG file of the same quality
+setting.  Progressive JPEG files are often slightly smaller than equivalent
+sequential JPEG files, but the possibility of incremental display is the main
+reason for using progressive JPEG.
+
+The IJG encoder library generates progressive JPEG files when given a
+suitable "scan script" defining how to divide the data into scans.
+Creation of progressive JPEG files is otherwise transparent to the encoder.
+Progressive JPEG files can also be read transparently by the decoder library.
+If the decoding application simply uses the library as defined above, it
+will receive a final decoded image without any indication that the file was
+progressive.  Of course, this approach does not allow incremental display.
+To perform incremental display, an application needs to use the decoder
+library's "buffered-image" mode, in which it receives a decoded image
+multiple times.
+
+Each displayed scan requires about as much work to decode as a full JPEG
+image of the same size, so the decoder must be fairly fast in relation to the
+data transmission rate in order to make incremental display useful.  However,
+it is possible to skip displaying the image and simply add the incoming bits
+to the decoder's coefficient buffer.  This is fast because only Huffman
+decoding need be done, not IDCT, upsampling, colorspace conversion, etc.
+The IJG decoder library allows the application to switch dynamically between
+displaying the image and simply absorbing the incoming bits.  A properly
+coded application can automatically adapt the number of display passes to
+suit the time available as the image is received.  Also, a final
+higher-quality display cycle can be performed from the buffered data after
+the end of the file is reached.
+
+Progressive compression:
+
+To create a progressive JPEG file (or a multiple-scan sequential JPEG file),
+set the scan_info cinfo field to point to an array of scan descriptors, and
+perform compression as usual.  Instead of constructing your own scan list,
+you can call the jpeg_simple_progression() helper routine to create a
+recommended progression sequence; this method should be used by all
+applications that don't want to get involved in the nitty-gritty of
+progressive scan sequence design.  (If you want to provide user control of
+scan sequences, you may wish to borrow the scan script reading code found
+in rdswitch.c, so that you can read scan script files just like cjpeg's.)
+When scan_info is not NULL, the compression library will store DCT'd data
+into a buffer array as jpeg_write_scanlines() is called, and will emit all
+the requested scans during jpeg_finish_compress().  This implies that
+multiple-scan output cannot be created with a suspending data destination
+manager, since jpeg_finish_compress() does not support suspension.  We
+should also note that the compressor currently forces Huffman optimization
+mode when creating a progressive JPEG file, because the default Huffman
+tables are unsuitable for progressive files.
+
+Progressive decompression:
+
+When buffered-image mode is not used, the decoder library will read all of
+a multi-scan file during jpeg_start_decompress(), so that it can provide a
+final decoded image.  (Here "multi-scan" means either progressive or
+multi-scan sequential.)  This makes multi-scan files transparent to the
+decoding application.  However, existing applications that used suspending
+input with version 5 of the IJG library will need to be modified to check
+for a suspension return from jpeg_start_decompress().
+
+To perform incremental display, an application must use the library's
+buffered-image mode.  This is described in the next section.
+
+
+Buffered-image mode
+-------------------
+
+In buffered-image mode, the library stores the partially decoded image in a
+coefficient buffer, from which it can be read out as many times as desired.
+This mode is typically used for incremental display of progressive JPEG files,
+but it can be used with any JPEG file.  Each scan of a progressive JPEG file
+adds more data (more detail) to the buffered image.  The application can
+display in lockstep with the source file (one display pass per input scan),
+or it can allow input processing to outrun display processing.  By making
+input and display processing run independently, it is possible for the
+application to adapt progressive display to a wide range of data transmission
+rates.
+
+The basic control flow for buffered-image decoding is
+
+        jpeg_create_decompress()
+        set data source
+        jpeg_read_header()
+        set overall decompression parameters
+        cinfo.buffered_image = TRUE;    /* select buffered-image mode */
+        jpeg_start_decompress()
+        for (each output pass) {
+            adjust output decompression parameters if required
+            jpeg_start_output()         /* start a new output pass */
+            for (all scanlines in image) {
+                jpeg_read_scanlines()
+                display scanlines
+            }
+            jpeg_finish_output()        /* terminate output pass */
+        }
+        jpeg_finish_decompress()
+        jpeg_destroy_decompress()
+
+This differs from ordinary unbuffered decoding in that there is an additional
+level of looping.  The application can choose how many output passes to make
+and how to display each pass.
+
+The simplest approach to displaying progressive images is to do one display
+pass for each scan appearing in the input file.  In this case the outer loop
+condition is typically
+        while (! jpeg_input_complete(&cinfo))
+and the start-output call should read
+        jpeg_start_output(&cinfo, cinfo.input_scan_number);
+The second parameter to jpeg_start_output() indicates which scan of the input
+file is to be displayed; the scans are numbered starting at 1 for this
+purpose.  (You can use a loop counter starting at 1 if you like, but using
+the library's input scan counter is easier.)  The library automatically reads
+data as necessary to complete each requested scan, and jpeg_finish_output()
+advances to the next scan or end-of-image marker (hence input_scan_number
+will be incremented by the time control arrives back at jpeg_start_output()).
+With this technique, data is read from the input file only as needed, and
+input and output processing run in lockstep.
+
+After reading the final scan and reaching the end of the input file, the
+buffered image remains available; it can be read additional times by
+repeating the jpeg_start_output()/jpeg_read_scanlines()/jpeg_finish_output()
+sequence.  For example, a useful technique is to use fast one-pass color
+quantization for display passes made while the image is arriving, followed by
+a final display pass using two-pass quantization for highest quality.  This
+is done by changing the library parameters before the final output pass.
+Changing parameters between passes is discussed in detail below.
+
+In general the last scan of a progressive file cannot be recognized as such
+until after it is read, so a post-input display pass is the best approach if
+you want special processing in the final pass.
+
+When done with the image, be sure to call jpeg_finish_decompress() to release
+the buffered image (or just use jpeg_destroy_decompress()).
+
+If input data arrives faster than it can be displayed, the application can
+cause the library to decode input data in advance of what's needed to produce
+output.  This is done by calling the routine jpeg_consume_input().
+The return value is one of the following:
+        JPEG_REACHED_SOS:    reached an SOS marker (the start of a new scan)
+        JPEG_REACHED_EOI:    reached the EOI marker (end of image)
+        JPEG_ROW_COMPLETED:  completed reading one MCU row of compressed data
+        JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan
+        JPEG_SUSPENDED:      suspended before completing any of the above
+(JPEG_SUSPENDED can occur only if a suspending data source is used.)  This
+routine can be called at any time after initializing the JPEG object.  It
+reads some additional data and returns when one of the indicated significant
+events occurs.  (If called after the EOI marker is reached, it will
+immediately return JPEG_REACHED_EOI without attempting to read more data.)
+
+The library's output processing will automatically call jpeg_consume_input()
+whenever the output processing overtakes the input; thus, simple lockstep
+display requires no direct calls to jpeg_consume_input().  But by adding
+calls to jpeg_consume_input(), you can absorb data in advance of what is
+being displayed.  This has two benefits:
+  * You can limit buildup of unprocessed data in your input buffer.
+  * You can eliminate extra display passes by paying attention to the
+    state of the library's input processing.
+
+The first of these benefits only requires interspersing calls to
+jpeg_consume_input() with your display operations and any other processing
+you may be doing.  To avoid wasting cycles due to backtracking, it's best to
+call jpeg_consume_input() only after a hundred or so new bytes have arrived.
+This is discussed further under "I/O suspension", above.  (Note: the JPEG
+library currently is not thread-safe.  You must not call jpeg_consume_input()
+from one thread of control if a different library routine is working on the
+same JPEG object in another thread.)
+
+When input arrives fast enough that more than one new scan is available
+before you start a new output pass, you may as well skip the output pass
+corresponding to the completed scan.  This occurs for free if you pass
+cinfo.input_scan_number as the target scan number to jpeg_start_output().
+The input_scan_number field is simply the index of the scan currently being
+consumed by the input processor.  You can ensure that this is up-to-date by
+emptying the input buffer just before calling jpeg_start_output(): call
+jpeg_consume_input() repeatedly until it returns JPEG_SUSPENDED or
+JPEG_REACHED_EOI.
+
+The target scan number passed to jpeg_start_output() is saved in the
+cinfo.output_scan_number field.  The library's output processing calls
+jpeg_consume_input() whenever the current input scan number and row within
+that scan is less than or equal to the current output scan number and row.
+Thus, input processing can "get ahead" of the output processing but is not
+allowed to "fall behind".  You can achieve several different effects by
+manipulating this interlock rule.  For example, if you pass a target scan
+number greater than the current input scan number, the output processor will
+wait until that scan starts to arrive before producing any output.  (To avoid
+an infinite loop, the target scan number is automatically reset to the last
+scan number when the end of image is reached.  Thus, if you specify a large
+target scan number, the library will just absorb the entire input file and
+then perform an output pass.  This is effectively the same as what
+jpeg_start_decompress() does when you don't select buffered-image mode.)
+When you pass a target scan number equal to the current input scan number,
+the image is displayed no faster than the current input scan arrives.  The
+final possibility is to pass a target scan number less than the current input
+scan number; this disables the input/output interlock and causes the output
+processor to simply display whatever it finds in the image buffer, without
+waiting for input.  (However, the library will not accept a target scan
+number less than one, so you can't avoid waiting for the first scan.)
+
+When data is arriving faster than the output display processing can advance
+through the image, jpeg_consume_input() will store data into the buffered
+image beyond the point at which the output processing is reading data out
+again.  If the input arrives fast enough, it may "wrap around" the buffer to
+the point where the input is more than one whole scan ahead of the output.
+If the output processing simply proceeds through its display pass without
+paying attention to the input, the effect seen on-screen is that the lower
+part of the image is one or more scans better in quality than the upper part.
+Then, when the next output scan is started, you have a choice of what target
+scan number to use.  The recommended choice is to use the current input scan
+number at that time, which implies that you've skipped the output scans
+corresponding to the input scans that were completed while you processed the
+previous output scan.  In this way, the decoder automatically adapts its
+speed to the arriving data, by skipping output scans as necessary to keep up
+with the arriving data.
+
+When using this strategy, you'll want to be sure that you perform a final
+output pass after receiving all the data; otherwise your last display may not
+be full quality across the whole screen.  So the right outer loop logic is
+something like this:
+        do {
+            absorb any waiting input by calling jpeg_consume_input()
+            final_pass = jpeg_input_complete(&cinfo);
+            adjust output decompression parameters if required
+            jpeg_start_output(&cinfo, cinfo.input_scan_number);
+            ...
+            jpeg_finish_output()
+        } while (! final_pass);
+rather than quitting as soon as jpeg_input_complete() returns TRUE.  This
+arrangement makes it simple to use higher-quality decoding parameters
+for the final pass.  But if you don't want to use special parameters for
+the final pass, the right loop logic is like this:
+        for (;;) {
+            absorb any waiting input by calling jpeg_consume_input()
+            jpeg_start_output(&cinfo, cinfo.input_scan_number);
+            ...
+            jpeg_finish_output()
+            if (jpeg_input_complete(&cinfo) &&
+                cinfo.input_scan_number == cinfo.output_scan_number)
+              break;
+        }
+In this case you don't need to know in advance whether an output pass is to
+be the last one, so it's not necessary to have reached EOF before starting
+the final output pass; rather, what you want to test is whether the output
+pass was performed in sync with the final input scan.  This form of the loop
+will avoid an extra output pass whenever the decoder is able (or nearly able)
+to keep up with the incoming data.
+
+When the data transmission speed is high, you might begin a display pass,
+then find that much or all of the file has arrived before you can complete
+the pass.  (You can detect this by noting the JPEG_REACHED_EOI return code
+from jpeg_consume_input(), or equivalently by testing jpeg_input_complete().)
+In this situation you may wish to abort the current display pass and start a
+new one using the newly arrived information.  To do so, just call
+jpeg_finish_output() and then start a new pass with jpeg_start_output().
+
+A variant strategy is to abort and restart display if more than one complete
+scan arrives during an output pass; this can be detected by noting
+JPEG_REACHED_SOS returns and/or examining cinfo.input_scan_number.  This
+idea should be employed with caution, however, since the display process
+might never get to the bottom of the image before being aborted, resulting
+in the lower part of the screen being several passes worse than the upper.
+In most cases it's probably best to abort an output pass only if the whole
+file has arrived and you want to begin the final output pass immediately.
+
+When receiving data across a communication link, we recommend always using
+the current input scan number for the output target scan number; if a
+higher-quality final pass is to be done, it should be started (aborting any
+incomplete output pass) as soon as the end of file is received.  However,
+many other strategies are possible.  For example, the application can examine
+the parameters of the current input scan and decide whether to display it or
+not.  If the scan contains only chroma data, one might choose not to use it
+as the target scan, expecting that the scan will be small and will arrive
+quickly.  To skip to the next scan, call jpeg_consume_input() until it
+returns JPEG_REACHED_SOS or JPEG_REACHED_EOI.  Or just use the next higher
+number as the target scan for jpeg_start_output(); but that method doesn't
+let you inspect the next scan's parameters before deciding to display it.
+
+
+In buffered-image mode, jpeg_start_decompress() never performs input and
+thus never suspends.  An application that uses input suspension with
+buffered-image mode must be prepared for suspension returns from these
+routines:
+* jpeg_start_output() performs input only if you request 2-pass quantization
+  and the target scan isn't fully read yet.  (This is discussed below.)
+* jpeg_read_scanlines(), as always, returns the number of scanlines that it
+  was able to produce before suspending.
+* jpeg_finish_output() will read any markers following the target scan,
+  up to the end of the file or the SOS marker that begins another scan.
+  (But it reads no input if jpeg_consume_input() has already reached the
+  end of the file or a SOS marker beyond the target output scan.)
+* jpeg_finish_decompress() will read until the end of file, and thus can
+  suspend if the end hasn't already been reached (as can be tested by
+  calling jpeg_input_complete()).
+jpeg_start_output(), jpeg_finish_output(), and jpeg_finish_decompress()
+all return TRUE if they completed their tasks, FALSE if they had to suspend.
+In the event of a FALSE return, the application must load more input data
+and repeat the call.  Applications that use non-suspending data sources need
+not check the return values of these three routines.
+
+
+It is possible to change decoding parameters between output passes in the
+buffered-image mode.  The decoder library currently supports only very
+limited changes of parameters.  ONLY THE FOLLOWING parameter changes are
+allowed after jpeg_start_decompress() is called:
+* dct_method can be changed before each call to jpeg_start_output().
+  For example, one could use a fast DCT method for early scans, changing
+  to a higher quality method for the final scan.
+* dither_mode can be changed before each call to jpeg_start_output();
+  of course this has no impact if not using color quantization.  Typically
+  one would use ordered dither for initial passes, then switch to
+  Floyd-Steinberg dither for the final pass.  Caution: changing dither mode
+  can cause more memory to be allocated by the library.  Although the amount
+  of memory involved is not large (a scanline or so), it may cause the
+  initial max_memory_to_use specification to be exceeded, which in the worst
+  case would result in an out-of-memory failure.
+* do_block_smoothing can be changed before each call to jpeg_start_output().
+  This setting is relevant only when decoding a progressive JPEG image.
+  During the first DC-only scan, block smoothing provides a very "fuzzy" look
+  instead of the very "blocky" look seen without it; which is better seems a
+  matter of personal taste.  But block smoothing is nearly always a win
+  during later stages, especially when decoding a successive-approximation
+  image: smoothing helps to hide the slight blockiness that otherwise shows
+  up on smooth gradients until the lowest coefficient bits are sent.
+* Color quantization mode can be changed under the rules described below.
+  You *cannot* change between full-color and quantized output (because that
+  would alter the required I/O buffer sizes), but you can change which
+  quantization method is used.
+
+When generating color-quantized output, changing quantization method is a
+very useful way of switching between high-speed and high-quality display.
+The library allows you to change among its three quantization methods:
+1. Single-pass quantization to a fixed color cube.
+   Selected by cinfo.two_pass_quantize = FALSE and cinfo.colormap = NULL.
+2. Single-pass quantization to an application-supplied colormap.
+   Selected by setting cinfo.colormap to point to the colormap (the value of
+   two_pass_quantize is ignored); also set cinfo.actual_number_of_colors.
+3. Two-pass quantization to a colormap chosen specifically for the image.
+   Selected by cinfo.two_pass_quantize = TRUE and cinfo.colormap = NULL.
+   (This is the default setting selected by jpeg_read_header, but it is
+   probably NOT what you want for the first pass of progressive display!)
+These methods offer successively better quality and lesser speed.  However,
+only the first method is available for quantizing in non-RGB color spaces.
+
+IMPORTANT: because the different quantizer methods have very different
+working-storage requirements, the library requires you to indicate which
+one(s) you intend to use before you call jpeg_start_decompress().  (If we did
+not require this, the max_memory_to_use setting would be a complete fiction.)
+You do this by setting one or more of these three cinfo fields to TRUE:
+        enable_1pass_quant              Fixed color cube colormap
+        enable_external_quant           Externally-supplied colormap
+        enable_2pass_quant              Two-pass custom colormap
+All three are initialized FALSE by jpeg_read_header().  But
+jpeg_start_decompress() automatically sets TRUE the one selected by the
+current two_pass_quantize and colormap settings, so you only need to set the
+enable flags for any other quantization methods you plan to change to later.
+
+After setting the enable flags correctly at jpeg_start_decompress() time, you
+can change to any enabled quantization method by setting two_pass_quantize
+and colormap properly just before calling jpeg_start_output().  The following
+special rules apply:
+1. You must explicitly set cinfo.colormap to NULL when switching to 1-pass
+   or 2-pass mode from a different mode, or when you want the 2-pass
+   quantizer to be re-run to generate a new colormap.
+2. To switch to an external colormap, or to change to a different external
+   colormap than was used on the prior pass, you must call
+   jpeg_new_colormap() after setting cinfo.colormap.
+NOTE: if you want to use the same colormap as was used in the prior pass,
+you should not do either of these things.  This will save some nontrivial
+switchover costs.
+(These requirements exist because cinfo.colormap will always be non-NULL
+after completing a prior output pass, since both the 1-pass and 2-pass
+quantizers set it to point to their output colormaps.  Thus you have to
+do one of these two things to notify the library that something has changed.
+Yup, it's a bit klugy, but it's necessary to do it this way for backwards
+compatibility.)
+
+Note that in buffered-image mode, the library generates any requested colormap
+during jpeg_start_output(), not during jpeg_start_decompress().
+
+When using two-pass quantization, jpeg_start_output() makes a pass over the
+buffered image to determine the optimum color map; it therefore may take a
+significant amount of time, whereas ordinarily it does little work.  The
+progress monitor hook is called during this pass, if defined.  It is also
+important to realize that if the specified target scan number is greater than
+or equal to the current input scan number, jpeg_start_output() will attempt
+to consume input as it makes this pass.  If you use a suspending data source,
+you need to check for a FALSE return from jpeg_start_output() under these
+conditions.  The combination of 2-pass quantization and a not-yet-fully-read
+target scan is the only case in which jpeg_start_output() will consume input.
+
+
+Application authors who support buffered-image mode may be tempted to use it
+for all JPEG images, even single-scan ones.  This will work, but it is
+inefficient: there is no need to create an image-sized coefficient buffer for
+single-scan images.  Requesting buffered-image mode for such an image wastes
+memory.  Worse, it can cost time on large images, since the buffered data has
+to be swapped out or written to a temporary file.  If you are concerned about
+maximum performance on baseline JPEG files, you should use buffered-image
+mode only when the incoming file actually has multiple scans.  This can be
+tested by calling jpeg_has_multiple_scans(), which will return a correct
+result at any time after jpeg_read_header() completes.
+
+It is also worth noting that when you use jpeg_consume_input() to let input
+processing get ahead of output processing, the resulting pattern of access to
+the coefficient buffer is quite nonsequential.  It's best to use the memory
+manager jmemnobs.c if you can (i.e., if you have enough real or virtual main
+memory).  If not, at least make sure that max_memory_to_use is set as high as
+possible.  If the JPEG memory manager has to use a temporary file, you will
+probably see a lot of disk traffic and poor performance.  (This could be
+improved with additional work on the memory manager, but we haven't gotten
+around to it yet.)
+
+In some applications it may be convenient to use jpeg_consume_input() for all
+input processing, including reading the initial markers; that is, you may
+wish to call jpeg_consume_input() instead of jpeg_read_header() during
+startup.  This works, but note that you must check for JPEG_REACHED_SOS and
+JPEG_REACHED_EOI return codes as the equivalent of jpeg_read_header's codes.
+Once the first SOS marker has been reached, you must call
+jpeg_start_decompress() before jpeg_consume_input() will consume more input;
+it'll just keep returning JPEG_REACHED_SOS until you do.  If you read a
+tables-only file this way, jpeg_consume_input() will return JPEG_REACHED_EOI
+without ever returning JPEG_REACHED_SOS; be sure to check for this case.
+If this happens, the decompressor will not read any more input until you call
+jpeg_abort() to reset it.  It is OK to call jpeg_consume_input() even when not
+using buffered-image mode, but in that case it's basically a no-op after the
+initial markers have been read: it will just return JPEG_SUSPENDED.
+
+
+Abbreviated datastreams and multiple images
+-------------------------------------------
+
+A JPEG compression or decompression object can be reused to process multiple
+images.  This saves a small amount of time per image by eliminating the
+"create" and "destroy" operations, but that isn't the real purpose of the
+feature.  Rather, reuse of an object provides support for abbreviated JPEG
+datastreams.  Object reuse can also simplify processing a series of images in
+a single input or output file.  This section explains these features.
+
+A JPEG file normally contains several hundred bytes worth of quantization
+and Huffman tables.  In a situation where many images will be stored or
+transmitted with identical tables, this may represent an annoying overhead.
+The JPEG standard therefore permits tables to be omitted.  The standard
+defines three classes of JPEG datastreams:
+  * "Interchange" datastreams contain an image and all tables needed to decode
+    the image.  These are the usual kind of JPEG file.
+  * "Abbreviated image" datastreams contain an image, but are missing some or
+    all of the tables needed to decode that image.
+  * "Abbreviated table specification" (henceforth "tables-only") datastreams
+    contain only table specifications.
+To decode an abbreviated image, it is necessary to load the missing table(s)
+into the decoder beforehand.  This can be accomplished by reading a separate
+tables-only file.  A variant scheme uses a series of images in which the first
+image is an interchange (complete) datastream, while subsequent ones are
+abbreviated and rely on the tables loaded by the first image.  It is assumed
+that once the decoder has read a table, it will remember that table until a
+new definition for the same table number is encountered.
+
+It is the application designer's responsibility to figure out how to associate
+the correct tables with an abbreviated image.  While abbreviated datastreams
+can be useful in a closed environment, their use is strongly discouraged in
+any situation where data exchange with other applications might be needed.
+Caveat designer.
+
+The JPEG library provides support for reading and writing any combination of
+tables-only datastreams and abbreviated images.  In both compression and
+decompression objects, a quantization or Huffman table will be retained for
+the lifetime of the object, unless it is overwritten by a new table definition.
+
+
+To create abbreviated image datastreams, it is only necessary to tell the
+compressor not to emit some or all of the tables it is using.  Each
+quantization and Huffman table struct contains a boolean field "sent_table",
+which normally is initialized to FALSE.  For each table used by the image, the
+header-writing process emits the table and sets sent_table = TRUE unless it is
+already TRUE.  (In normal usage, this prevents outputting the same table
+definition multiple times, as would otherwise occur because the chroma
+components typically share tables.)  Thus, setting this field to TRUE before
+calling jpeg_start_compress() will prevent the table from being written at
+all.
+
+If you want to create a "pure" abbreviated image file containing no tables,
+just call "jpeg_suppress_tables(&cinfo, TRUE)" after constructing all the
+tables.  If you want to emit some but not all tables, you'll need to set the
+individual sent_table fields directly.
+
+To create an abbreviated image, you must also call jpeg_start_compress()
+with a second parameter of FALSE, not TRUE.  Otherwise jpeg_start_compress()
+will force all the sent_table fields to FALSE.  (This is a safety feature to
+prevent abbreviated images from being created accidentally.)
+
+To create a tables-only file, perform the same parameter setup that you
+normally would, but instead of calling jpeg_start_compress() and so on, call
+jpeg_write_tables(&cinfo).  This will write an abbreviated datastream
+containing only SOI, DQT and/or DHT markers, and EOI.  All the quantization
+and Huffman tables that are currently defined in the compression object will
+be emitted unless their sent_table flag is already TRUE, and then all the
+sent_table flags will be set TRUE.
+
+A sure-fire way to create matching tables-only and abbreviated image files
+is to proceed as follows:
+
+        create JPEG compression object
+        set JPEG parameters
+        set destination to tables-only file
+        jpeg_write_tables(&cinfo);
+        set destination to image file
+        jpeg_start_compress(&cinfo, FALSE);
+        write data...
+        jpeg_finish_compress(&cinfo);
+
+Since the JPEG parameters are not altered between writing the table file and
+the abbreviated image file, the same tables are sure to be used.  Of course,
+you can repeat the jpeg_start_compress() ... jpeg_finish_compress() sequence
+many times to produce many abbreviated image files matching the table file.
+
+You cannot suppress output of the computed Huffman tables when Huffman
+optimization is selected.  (If you could, there'd be no way to decode the
+image...)  Generally, you don't want to set optimize_coding = TRUE when
+you are trying to produce abbreviated files.
+
+In some cases you might want to compress an image using tables which are
+not stored in the application, but are defined in an interchange or
+tables-only file readable by the application.  This can be done by setting up
+a JPEG decompression object to read the specification file, then copying the
+tables into your compression object.  See jpeg_copy_critical_parameters()
+for an example of copying quantization tables.
+
+
+To read abbreviated image files, you simply need to load the proper tables
+into the decompression object before trying to read the abbreviated image.
+If the proper tables are stored in the application program, you can just
+allocate the table structs and fill in their contents directly.  For example,
+to load a fixed quantization table into table slot "n":
+
+    if (cinfo.quant_tbl_ptrs[n] == NULL)
+      cinfo.quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) &cinfo);
+    quant_ptr = cinfo.quant_tbl_ptrs[n];        /* quant_ptr is JQUANT_TBL* */
+    for (i = 0; i < 64; i++) {
+      /* Qtable[] is desired quantization table, in natural array order */
+      quant_ptr->quantval[i] = Qtable[i];
+    }
+
+Code to load a fixed Huffman table is typically (for AC table "n"):
+
+    if (cinfo.ac_huff_tbl_ptrs[n] == NULL)
+      cinfo.ac_huff_tbl_ptrs[n] = jpeg_alloc_huff_table((j_common_ptr) &cinfo);
+    huff_ptr = cinfo.ac_huff_tbl_ptrs[n];       /* huff_ptr is JHUFF_TBL* */
+    for (i = 1; i <= 16; i++) {
+      /* counts[i] is number of Huffman codes of length i bits, i=1..16 */
+      huff_ptr->bits[i] = counts[i];
+    }
+    for (i = 0; i < 256; i++) {
+      /* symbols[] is the list of Huffman symbols, in code-length order */
+      huff_ptr->huffval[i] = symbols[i];
+    }
+
+(Note that trying to set cinfo.quant_tbl_ptrs[n] to point directly at a
+constant JQUANT_TBL object is not safe.  If the incoming file happened to
+contain a quantization table definition, your master table would get
+overwritten!  Instead allocate a working table copy and copy the master table
+into it, as illustrated above.  Ditto for Huffman tables, of course.)
+
+You might want to read the tables from a tables-only file, rather than
+hard-wiring them into your application.  The jpeg_read_header() call is
+sufficient to read a tables-only file.  You must pass a second parameter of
+FALSE to indicate that you do not require an image to be present.  Thus, the
+typical scenario is
+
+        create JPEG decompression object
+        set source to tables-only file
+        jpeg_read_header(&cinfo, FALSE);
+        set source to abbreviated image file
+        jpeg_read_header(&cinfo, TRUE);
+        set decompression parameters
+        jpeg_start_decompress(&cinfo);
+        read data...
+        jpeg_finish_decompress(&cinfo);
+
+In some cases, you may want to read a file without knowing whether it contains
+an image or just tables.  In that case, pass FALSE and check the return value
+from jpeg_read_header(): it will be JPEG_HEADER_OK if an image was found,
+JPEG_HEADER_TABLES_ONLY if only tables were found.  (A third return value,
+JPEG_SUSPENDED, is possible when using a suspending data source manager.)
+Note that jpeg_read_header() will not complain if you read an abbreviated
+image for which you haven't loaded the missing tables; the missing-table check
+occurs later, in jpeg_start_decompress().
+
+
+It is possible to read a series of images from a single source file by
+repeating the jpeg_read_header() ... jpeg_finish_decompress() sequence,
+without releasing/recreating the JPEG object or the data source module.
+(If you did reinitialize, any partial bufferload left in the data source
+buffer at the end of one image would be discarded, causing you to lose the
+start of the next image.)  When you use this method, stored tables are
+automatically carried forward, so some of the images can be abbreviated images
+that depend on tables from earlier images.
+
+If you intend to write a series of images into a single destination file,
+you might want to make a specialized data destination module that doesn't
+flush the output buffer at term_destination() time.  This would speed things
+up by some trifling amount.  Of course, you'd need to remember to flush the
+buffer after the last image.  You can make the later images be abbreviated
+ones by passing FALSE to jpeg_start_compress().
+
+
+Special markers
+---------------
+
+Some applications may need to insert or extract special data in the JPEG
+datastream.  The JPEG standard provides marker types "COM" (comment) and
+"APP0" through "APP15" (application) to hold application-specific data.
+Unfortunately, the use of these markers is not specified by the standard.
+COM markers are fairly widely used to hold user-supplied text.  The JFIF file
+format spec uses APP0 markers with specified initial strings to hold certain
+data.  Adobe applications use APP14 markers beginning with the string "Adobe"
+for miscellaneous data.  Other APPn markers are rarely seen, but might
+contain almost anything.
+
+If you wish to store user-supplied text, we recommend you use COM markers
+and place readable 7-bit ASCII text in them.  Newline conventions are not
+standardized --- expect to find LF (Unix style), CR/LF (DOS style), or CR
+(Mac style).  A robust COM reader should be able to cope with random binary
+garbage, including nulls, since some applications generate COM markers
+containing non-ASCII junk.  (But yours should not be one of them.)
+
+For program-supplied data, use an APPn marker, and be sure to begin it with an
+identifying string so that you can tell whether the marker is actually yours.
+It's probably best to avoid using APP0 or APP14 for any private markers.
+(NOTE: the upcoming SPIFF standard will use APP8 markers; we recommend you
+not use APP8 markers for any private purposes, either.)
+
+Keep in mind that at most 65533 bytes can be put into one marker, but you
+can have as many markers as you like.
+
+By default, the IJG compression library will write a JFIF APP0 marker if the
+selected JPEG colorspace is grayscale or YCbCr, or an Adobe APP14 marker if
+the selected colorspace is RGB, CMYK, or YCCK.  You can disable this, but
+we don't recommend it.  The decompression library will recognize JFIF and
+Adobe markers and will set the JPEG colorspace properly when one is found.
+
+
+You can write special markers immediately following the datastream header by
+calling jpeg_write_marker() after jpeg_start_compress() and before the first
+call to jpeg_write_scanlines().  When you do this, the markers appear after
+the SOI and the JFIF APP0 and Adobe APP14 markers (if written), but before
+all else.  Specify the marker type parameter as "JPEG_COM" for COM or
+"JPEG_APP0 + n" for APPn.  (Actually, jpeg_write_marker will let you write
+any marker type, but we don't recommend writing any other kinds of marker.)
+For example, to write a user comment string pointed to by comment_text:
+        jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text));
+
+If it's not convenient to store all the marker data in memory at once,
+you can instead call jpeg_write_m_header() followed by multiple calls to
+jpeg_write_m_byte().  If you do it this way, it's your responsibility to
+call jpeg_write_m_byte() exactly the number of times given in the length
+parameter to jpeg_write_m_header().  (This method lets you empty the
+output buffer partway through a marker, which might be important when
+using a suspending data destination module.  In any case, if you are using
+a suspending destination, you should flush its buffer after inserting
+any special markers.  See "I/O suspension".)
+
+Or, if you prefer to synthesize the marker byte sequence yourself,
+you can just cram it straight into the data destination module.
+
+If you are writing JFIF 1.02 extension markers (thumbnail images), don't
+forget to set cinfo.JFIF_minor_version = 2 so that the encoder will write the
+correct JFIF version number in the JFIF header marker.  The library's default
+is to write version 1.01, but that's wrong if you insert any 1.02 extension
+markers.  (We could probably get away with just defaulting to 1.02, but there
+used to be broken decoders that would complain about unknown minor version
+numbers.  To reduce compatibility risks it's safest not to write 1.02 unless
+you are actually using 1.02 extensions.)
+
+
+When reading, two methods of handling special markers are available:
+1. You can ask the library to save the contents of COM and/or APPn markers
+into memory, and then examine them at your leisure afterwards.
+2. You can supply your own routine to process COM and/or APPn markers
+on-the-fly as they are read.
+The first method is simpler to use, especially if you are using a suspending
+data source; writing a marker processor that copes with input suspension is
+not easy (consider what happens if the marker is longer than your available
+input buffer).  However, the second method conserves memory since the marker
+data need not be kept around after it's been processed.
+
+For either method, you'd normally set up marker handling after creating a
+decompression object and before calling jpeg_read_header(), because the
+markers of interest will typically be near the head of the file and so will
+be scanned by jpeg_read_header.  Once you've established a marker handling
+method, it will be used for the life of that decompression object
+(potentially many datastreams), unless you change it.  Marker handling is
+determined separately for COM markers and for each APPn marker code.
+
+
+To save the contents of special markers in memory, call
+        jpeg_save_markers(cinfo, marker_code, length_limit)
+where marker_code is the marker type to save, JPEG_COM or JPEG_APP0+n.
+(To arrange to save all the special marker types, you need to call this
+routine 17 times, for COM and APP0-APP15.)  If the incoming marker is longer
+than length_limit data bytes, only length_limit bytes will be saved; this
+parameter allows you to avoid chewing up memory when you only need to see the
+first few bytes of a potentially large marker.  If you want to save all the
+data, set length_limit to 0xFFFF; that is enough since marker lengths are only
+16 bits.  As a special case, setting length_limit to 0 prevents that marker
+type from being saved at all.  (That is the default behavior, in fact.)
+
+After jpeg_read_header() completes, you can examine the special markers by
+following the cinfo->marker_list pointer chain.  All the special markers in
+the file appear in this list, in order of their occurrence in the file (but
+omitting any markers of types you didn't ask for).  Both the original data
+length and the saved data length are recorded for each list entry; the latter
+will not exceed length_limit for the particular marker type.  Note that these
+lengths exclude the marker length word, whereas the stored representation
+within the JPEG file includes it.  (Hence the maximum data length is really
+only 65533.)
+
+It is possible that additional special markers appear in the file beyond the
+SOS marker at which jpeg_read_header stops; if so, the marker list will be
+extended during reading of the rest of the file.  This is not expected to be
+common, however.  If you are short on memory you may want to reset the length
+limit to zero for all marker types after finishing jpeg_read_header, to
+ensure that the max_memory_to_use setting cannot be exceeded due to addition
+of later markers.
+
+The marker list remains stored until you call jpeg_finish_decompress or
+jpeg_abort, at which point the memory is freed and the list is set to empty.
+(jpeg_destroy also releases the storage, of course.)
+
+Note that the library is internally interested in APP0 and APP14 markers;
+if you try to set a small nonzero length limit on these types, the library
+will silently force the length up to the minimum it wants.  (But you can set
+a zero length limit to prevent them from being saved at all.)  Also, in a
+16-bit environment, the maximum length limit may be constrained to less than
+65533 by malloc() limitations.  It is therefore best not to assume that the
+effective length limit is exactly what you set it to be.
+
+
+If you want to supply your own marker-reading routine, you do it by calling
+jpeg_set_marker_processor().  A marker processor routine must have the
+signature
+        boolean jpeg_marker_parser_method (j_decompress_ptr cinfo)
+Although the marker code is not explicitly passed, the routine can find it
+in cinfo->unread_marker.  At the time of call, the marker proper has been
+read from the data source module.  The processor routine is responsible for
+reading the marker length word and the remaining parameter bytes, if any.
+Return TRUE to indicate success.  (FALSE should be returned only if you are
+using a suspending data source and it tells you to suspend.  See the standard
+marker processors in jdmarker.c for appropriate coding methods if you need to
+use a suspending data source.)
+
+If you override the default APP0 or APP14 processors, it is up to you to
+recognize JFIF and Adobe markers if you want colorspace recognition to occur
+properly.  We recommend copying and extending the default processors if you
+want to do that.  (A better idea is to save these marker types for later
+examination by calling jpeg_save_markers(); that method doesn't interfere
+with the library's own processing of these markers.)
+
+jpeg_set_marker_processor() and jpeg_save_markers() are mutually exclusive
+--- if you call one it overrides any previous call to the other, for the
+particular marker type specified.
+
+A simple example of an external COM processor can be found in djpeg.c.
+Also, see jpegtran.c for an example of using jpeg_save_markers.
+
+
+Raw (downsampled) image data
+----------------------------
+
+Some applications need to supply already-downsampled image data to the JPEG
+compressor, or to receive raw downsampled data from the decompressor.  The
+library supports this requirement by allowing the application to write or
+read raw data, bypassing the normal preprocessing or postprocessing steps.
+The interface is different from the standard one and is somewhat harder to
+use.  If your interest is merely in bypassing color conversion, we recommend
+that you use the standard interface and simply set jpeg_color_space =
+in_color_space (or jpeg_color_space = out_color_space for decompression).
+The mechanism described in this section is necessary only to supply or
+receive downsampled image data, in which not all components have the same
+dimensions.
+
+
+To compress raw data, you must supply the data in the colorspace to be used
+in the JPEG file (please read the earlier section on Special color spaces)
+and downsampled to the sampling factors specified in the JPEG parameters.
+You must supply the data in the format used internally by the JPEG library,
+namely a JSAMPIMAGE array.  This is an array of pointers to two-dimensional
+arrays, each of type JSAMPARRAY.  Each 2-D array holds the values for one
+color component.  This structure is necessary since the components are of
+different sizes.  If the image dimensions are not a multiple of the MCU size,
+you must also pad the data correctly (usually, this is done by replicating
+the last column and/or row).  The data must be padded to a multiple of a DCT
+block in each component: that is, each downsampled row must contain a
+multiple of 8 valid samples, and there must be a multiple of 8 sample rows
+for each component.  (For applications such as conversion of digital TV
+images, the standard image size is usually a multiple of the DCT block size,
+so that no padding need actually be done.)
+
+The procedure for compression of raw data is basically the same as normal
+compression, except that you call jpeg_write_raw_data() in place of
+jpeg_write_scanlines().  Before calling jpeg_start_compress(), you must do
+the following:
+  * Set cinfo->raw_data_in to TRUE.  (It is set FALSE by jpeg_set_defaults().)
+    This notifies the library that you will be supplying raw data.
+  * Ensure jpeg_color_space is correct --- an explicit jpeg_set_colorspace()
+    call is a good idea.  Note that since color conversion is bypassed,
+    in_color_space is ignored, except that jpeg_set_defaults() uses it to
+    choose the default jpeg_color_space setting.
+  * Ensure the sampling factors, cinfo->comp_info[i].h_samp_factor and
+    cinfo->comp_info[i].v_samp_factor, are correct.  Since these indicate the
+    dimensions of the data you are supplying, it's wise to set them
+    explicitly, rather than assuming the library's defaults are what you want.
+
+To pass raw data to the library, call jpeg_write_raw_data() in place of
+jpeg_write_scanlines().  The two routines work similarly except that
+jpeg_write_raw_data takes a JSAMPIMAGE data array rather than JSAMPARRAY.
+The scanlines count passed to and returned from jpeg_write_raw_data is
+measured in terms of the component with the largest v_samp_factor.
+
+jpeg_write_raw_data() processes one MCU row per call, which is to say
+v_samp_factor*DCTSIZE sample rows of each component.  The passed num_lines
+value must be at least max_v_samp_factor*DCTSIZE, and the return value will
+be exactly that amount (or possibly some multiple of that amount, in future
+library versions).  This is true even on the last call at the bottom of the
+image; don't forget to pad your data as necessary.
+
+The required dimensions of the supplied data can be computed for each
+component as
+        cinfo->comp_info[i].width_in_blocks*DCTSIZE  samples per row
+        cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image
+after jpeg_start_compress() has initialized those fields.  If the valid data
+is smaller than this, it must be padded appropriately.  For some sampling
+factors and image sizes, additional dummy DCT blocks are inserted to make
+the image a multiple of the MCU dimensions.  The library creates such dummy
+blocks itself; it does not read them from your supplied data.  Therefore you
+need never pad by more than DCTSIZE samples.  An example may help here.
+Assume 2h2v downsampling of YCbCr data, that is
+        cinfo->comp_info[0].h_samp_factor = 2           for Y
+        cinfo->comp_info[0].v_samp_factor = 2
+        cinfo->comp_info[1].h_samp_factor = 1           for Cb
+        cinfo->comp_info[1].v_samp_factor = 1
+        cinfo->comp_info[2].h_samp_factor = 1           for Cr
+        cinfo->comp_info[2].v_samp_factor = 1
+and suppose that the nominal image dimensions (cinfo->image_width and
+cinfo->image_height) are 101x101 pixels.  Then jpeg_start_compress() will
+compute downsampled_width = 101 and width_in_blocks = 13 for Y,
+downsampled_width = 51 and width_in_blocks = 7 for Cb and Cr (and the same
+for the height fields).  You must pad the Y data to at least 13*8 = 104
+columns and rows, the Cb/Cr data to at least 7*8 = 56 columns and rows.  The
+MCU height is max_v_samp_factor = 2 DCT rows so you must pass at least 16
+scanlines on each call to jpeg_write_raw_data(), which is to say 16 actual
+sample rows of Y and 8 each of Cb and Cr.  A total of 7 MCU rows are needed,
+so you must pass a total of 7*16 = 112 "scanlines".  The last DCT block row
+of Y data is dummy, so it doesn't matter what you pass for it in the data
+arrays, but the scanlines count must total up to 112 so that all of the Cb
+and Cr data gets passed.
+
+Output suspension is supported with raw-data compression: if the data
+destination module suspends, jpeg_write_raw_data() will return 0.
+In this case the same data rows must be passed again on the next call.
+
+
+Decompression with raw data output implies bypassing all postprocessing:
+you cannot ask for rescaling or color quantization, for instance.  More
+seriously, you must deal with the color space and sampling factors present in
+the incoming file.  If your application only handles, say, 2h1v YCbCr data,
+you must check for and fail on other color spaces or other sampling factors.
+The library will not convert to a different color space for you.
+
+To obtain raw data output, set cinfo->raw_data_out = TRUE before
+jpeg_start_decompress() (it is set FALSE by jpeg_read_header()).  Be sure to
+verify that the color space and sampling factors are ones you can handle.
+Then call jpeg_read_raw_data() in place of jpeg_read_scanlines().  The
+decompression process is otherwise the same as usual.
+
+jpeg_read_raw_data() returns one MCU row per call, and thus you must pass a
+buffer of at least max_v_samp_factor*DCTSIZE scanlines (scanline counting is
+the same as for raw-data compression).  The buffer you pass must be large
+enough to hold the actual data plus padding to DCT-block boundaries.  As with
+compression, any entirely dummy DCT blocks are not processed so you need not
+allocate space for them, but the total scanline count includes them.  The
+above example of computing buffer dimensions for raw-data compression is
+equally valid for decompression.
+
+Input suspension is supported with raw-data decompression: if the data source
+module suspends, jpeg_read_raw_data() will return 0.  You can also use
+buffered-image mode to read raw data in multiple passes.
+
+
+Really raw data: DCT coefficients
+---------------------------------
+
+It is possible to read or write the contents of a JPEG file as raw DCT
+coefficients.  This facility is mainly intended for use in lossless
+transcoding between different JPEG file formats.  Other possible applications
+include lossless cropping of a JPEG image, lossless reassembly of a
+multi-strip or multi-tile TIFF/JPEG file into a single JPEG datastream, etc.
+
+To read the contents of a JPEG file as DCT coefficients, open the file and do
+jpeg_read_header() as usual.  But instead of calling jpeg_start_decompress()
+and jpeg_read_scanlines(), call jpeg_read_coefficients().  This will read the
+entire image into a set of virtual coefficient-block arrays, one array per
+component.  The return value is a pointer to an array of virtual-array
+descriptors.  Each virtual array can be accessed directly using the JPEG
+memory manager's access_virt_barray method (see Memory management, below,
+and also read structure.txt's discussion of virtual array handling).  Or,
+for simple transcoding to a different JPEG file format, the array list can
+just be handed directly to jpeg_write_coefficients().
+
+Each block in the block arrays contains quantized coefficient values in
+normal array order (not JPEG zigzag order).  The block arrays contain only
+DCT blocks containing real data; any entirely-dummy blocks added to fill out
+interleaved MCUs at the right or bottom edges of the image are discarded
+during reading and are not stored in the block arrays.  (The size of each
+block array can be determined from the width_in_blocks and height_in_blocks
+fields of the component's comp_info entry.)  This is also the data format
+expected by jpeg_write_coefficients().
+
+When you are done using the virtual arrays, call jpeg_finish_decompress()
+to release the array storage and return the decompression object to an idle
+state; or just call jpeg_destroy() if you don't need to reuse the object.
+
+If you use a suspending data source, jpeg_read_coefficients() will return
+NULL if it is forced to suspend; a non-NULL return value indicates successful
+completion.  You need not test for a NULL return value when using a
+non-suspending data source.
+
+It is also possible to call jpeg_read_coefficients() to obtain access to the
+decoder's coefficient arrays during a normal decode cycle in buffered-image
+mode.  This feature might be useful for progressively displaying an incoming
+image and then re-encoding it without loss.  To do this, decode in buffered-
+image mode as discussed previously, then call jpeg_read_coefficients() after
+the last jpeg_finish_output() call.  The arrays will be available for your use
+until you call jpeg_finish_decompress().
+
+
+To write the contents of a JPEG file as DCT coefficients, you must provide
+the DCT coefficients stored in virtual block arrays.  You can either pass
+block arrays read from an input JPEG file by jpeg_read_coefficients(), or
+allocate virtual arrays from the JPEG compression object and fill them
+yourself.  In either case, jpeg_write_coefficients() is substituted for
+jpeg_start_compress() and jpeg_write_scanlines().  Thus the sequence is
+  * Create compression object
+  * Set all compression parameters as necessary
+  * Request virtual arrays if needed
+  * jpeg_write_coefficients()
+  * jpeg_finish_compress()
+  * Destroy or re-use compression object
+jpeg_write_coefficients() is passed a pointer to an array of virtual block
+array descriptors; the number of arrays is equal to cinfo.num_components.
+
+The virtual arrays need only have been requested, not realized, before
+jpeg_write_coefficients() is called.  A side-effect of
+jpeg_write_coefficients() is to realize any virtual arrays that have been
+requested from the compression object's memory manager.  Thus, when obtaining
+the virtual arrays from the compression object, you should fill the arrays
+after calling jpeg_write_coefficients().  The data is actually written out
+when you call jpeg_finish_compress(); jpeg_write_coefficients() only writes
+the file header.
+
+When writing raw DCT coefficients, it is crucial that the JPEG quantization
+tables and sampling factors match the way the data was encoded, or the
+resulting file will be invalid.  For transcoding from an existing JPEG file,
+we recommend using jpeg_copy_critical_parameters().  This routine initializes
+all the compression parameters to default values (like jpeg_set_defaults()),
+then copies the critical information from a source decompression object.
+The decompression object should have just been used to read the entire
+JPEG input file --- that is, it should be awaiting jpeg_finish_decompress().
+
+jpeg_write_coefficients() marks all tables stored in the compression object
+as needing to be written to the output file (thus, it acts like
+jpeg_start_compress(cinfo, TRUE)).  This is for safety's sake, to avoid
+emitting abbreviated JPEG files by accident.  If you really want to emit an
+abbreviated JPEG file, call jpeg_suppress_tables(), or set the tables'
+individual sent_table flags, between calling jpeg_write_coefficients() and
+jpeg_finish_compress().
+
+
+Progress monitoring
+-------------------
+
+Some applications may need to regain control from the JPEG library every so
+often.  The typical use of this feature is to produce a percent-done bar or
+other progress display.  (For a simple example, see cjpeg.c or djpeg.c.)
+Although you do get control back frequently during the data-transferring pass
+(the jpeg_read_scanlines or jpeg_write_scanlines loop), any additional passes
+will occur inside jpeg_finish_compress or jpeg_start_decompress; those
+routines may take a long time to execute, and you don't get control back
+until they are done.
+
+You can define a progress-monitor routine which will be called periodically
+by the library.  No guarantees are made about how often this call will occur,
+so we don't recommend you use it for mouse tracking or anything like that.
+At present, a call will occur once per MCU row, scanline, or sample row
+group, whichever unit is convenient for the current processing mode; so the
+wider the image, the longer the time between calls.  During the data
+transferring pass, only one call occurs per call of jpeg_read_scanlines or
+jpeg_write_scanlines, so don't pass a large number of scanlines at once if
+you want fine resolution in the progress count.  (If you really need to use
+the callback mechanism for time-critical tasks like mouse tracking, you could
+insert additional calls inside some of the library's inner loops.)
+
+To establish a progress-monitor callback, create a struct jpeg_progress_mgr,
+fill in its progress_monitor field with a pointer to your callback routine,
+and set cinfo->progress to point to the struct.  The callback will be called
+whenever cinfo->progress is non-NULL.  (This pointer is set to NULL by
+jpeg_create_compress or jpeg_create_decompress; the library will not change
+it thereafter.  So if you allocate dynamic storage for the progress struct,
+make sure it will live as long as the JPEG object does.  Allocating from the
+JPEG memory manager with lifetime JPOOL_PERMANENT will work nicely.)  You
+can use the same callback routine for both compression and decompression.
+
+The jpeg_progress_mgr struct contains four fields which are set by the library:
+        long pass_counter;      /* work units completed in this pass */
+        long pass_limit;        /* total number of work units in this pass */
+        int completed_passes;   /* passes completed so far */
+        int total_passes;       /* total number of passes expected */
+During any one pass, pass_counter increases from 0 up to (not including)
+pass_limit; the step size is usually but not necessarily 1.  The pass_limit
+value may change from one pass to another.  The expected total number of
+passes is in total_passes, and the number of passes already completed is in
+completed_passes.  Thus the fraction of work completed may be estimated as
+                completed_passes + (pass_counter/pass_limit)
+                --------------------------------------------
+                                total_passes
+ignoring the fact that the passes may not be equal amounts of work.
+
+When decompressing, pass_limit can even change within a pass, because it
+depends on the number of scans in the JPEG file, which isn't always known in
+advance.  The computed fraction-of-work-done may jump suddenly (if the library
+discovers it has overestimated the number of scans) or even decrease (in the
+opposite case).  It is not wise to put great faith in the work estimate.
+
+When using the decompressor's buffered-image mode, the progress monitor work
+estimate is likely to be completely unhelpful, because the library has no way
+to know how many output passes will be demanded of it.  Currently, the library
+sets total_passes based on the assumption that there will be one more output
+pass if the input file end hasn't yet been read (jpeg_input_complete() isn't
+TRUE), but no more output passes if the file end has been reached when the
+output pass is started.  This means that total_passes will rise as additional
+output passes are requested.  If you have a way of determining the input file
+size, estimating progress based on the fraction of the file that's been read
+will probably be more useful than using the library's value.
+
+
+Memory management
+-----------------
+
+This section covers some key facts about the JPEG library's built-in memory
+manager.  For more info, please read structure.txt's section about the memory
+manager, and consult the source code if necessary.
+
+All memory and temporary file allocation within the library is done via the
+memory manager.  If necessary, you can replace the "back end" of the memory
+manager to control allocation yourself (for example, if you don't want the
+library to use malloc() and free() for some reason).
+
+Some data is allocated "permanently" and will not be freed until the JPEG
+object is destroyed.  Most data is allocated "per image" and is freed by
+jpeg_finish_compress, jpeg_finish_decompress, or jpeg_abort.  You can call the
+memory manager yourself to allocate structures that will automatically be
+freed at these times.  Typical code for this is
+  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, size);
+Use JPOOL_PERMANENT to get storage that lasts as long as the JPEG object.
+Use alloc_large instead of alloc_small for anything bigger than a few Kbytes.
+There are also alloc_sarray and alloc_barray routines that automatically
+build 2-D sample or block arrays.
+
+The library's minimum space requirements to process an image depend on the
+image's width, but not on its height, because the library ordinarily works
+with "strip" buffers that are as wide as the image but just a few rows high.
+Some operating modes (e.g., two-pass color quantization) require full-image
+buffers.  Such buffers are treated as "virtual arrays": only the current strip
+need be in memory, and the rest can be swapped out to a temporary file.
+
+If you use the simplest memory manager back end (jmemnobs.c), then no
+temporary files are used; virtual arrays are simply malloc()'d.  Images bigger
+than memory can be processed only if your system supports virtual memory.
+The other memory manager back ends support temporary files of various flavors
+and thus work in machines without virtual memory.  They may also be useful on
+Unix machines if you need to process images that exceed available swap space.
+
+When using temporary files, the library will make the in-memory buffers for
+its virtual arrays just big enough to stay within a "maximum memory" setting.
+Your application can set this limit by setting cinfo->mem->max_memory_to_use
+after creating the JPEG object.  (Of course, there is still a minimum size for
+the buffers, so the max-memory setting is effective only if it is bigger than
+the minimum space needed.)  If you allocate any large structures yourself, you
+must allocate them before jpeg_start_compress() or jpeg_start_decompress() in
+order to have them counted against the max memory limit.  Also keep in mind
+that space allocated with alloc_small() is ignored, on the assumption that
+it's too small to be worth worrying about; so a reasonable safety margin
+should be left when setting max_memory_to_use.
+
+
+Memory usage
+------------
+
+Working memory requirements while performing compression or decompression
+depend on image dimensions, image characteristics (such as colorspace and
+JPEG process), and operating mode (application-selected options).
+
+As of v6b, the decompressor requires:
+ 1. About 24K in more-or-less-fixed-size data.  This varies a bit depending
+    on operating mode and image characteristics (particularly color vs.
+    grayscale), but it doesn't depend on image dimensions.
+ 2. Strip buffers (of size proportional to the image width) for IDCT and
+    upsampling results.  The worst case for commonly used sampling factors
+    is about 34 bytes * width in pixels for a color image.  A grayscale image
+    only needs about 8 bytes per pixel column.
+ 3. A full-image DCT coefficient buffer is needed to decode a multi-scan JPEG
+    file (including progressive JPEGs), or whenever you select buffered-image
+    mode.  This takes 2 bytes/coefficient.  At typical 2x2 sampling, that's
+    3 bytes per pixel for a color image.  Worst case (1x1 sampling) requires
+    6 bytes/pixel.  For grayscale, figure 2 bytes/pixel.
+ 4. To perform 2-pass color quantization, the decompressor also needs a
+    128K color lookup table and a full-image pixel buffer (3 bytes/pixel).
+This does not count any memory allocated by the application, such as a
+buffer to hold the final output image.
+
+The above figures are valid for 8-bit JPEG data precision and a machine with
+32-bit ints.  For 12-bit JPEG data, double the size of the strip buffers and
+quantization pixel buffer.  The "fixed-size" data will be somewhat smaller
+with 16-bit ints, larger with 64-bit ints.  Also, CMYK or other unusual
+color spaces will require different amounts of space.
+
+The full-image coefficient and pixel buffers, if needed at all, do not
+have to be fully RAM resident; you can have the library use temporary
+files instead when the total memory usage would exceed a limit you set.
+(But if your OS supports virtual memory, it's probably better to just use
+jmemnobs and let the OS do the swapping.)
+
+The compressor's memory requirements are similar, except that it has no need
+for color quantization.  Also, it needs a full-image DCT coefficient buffer
+if Huffman-table optimization is asked for, even if progressive mode is not
+requested.
+
+If you need more detailed information about memory usage in a particular
+situation, you can enable the MEM_STATS code in jmemmgr.c.
+
+
+Library compile-time options
+----------------------------
+
+A number of compile-time options are available by modifying jmorecfg.h.
+
+The JPEG standard provides for both the baseline 8-bit DCT process and
+a 12-bit DCT process.  The IJG code supports 12-bit lossy JPEG if you define
+BITS_IN_JSAMPLE as 12 rather than 8.  Note that this causes JSAMPLE to be
+larger than a char, so it affects the surrounding application's image data.
+The sample applications cjpeg and djpeg can support 12-bit mode only for PPM
+and GIF file formats; you must disable the other file formats to compile a
+12-bit cjpeg or djpeg.  (install.txt has more information about that.)
+At present, a 12-bit library can handle *only* 12-bit images, not both
+precisions.
+
+Note that a 12-bit library always compresses in Huffman optimization mode,
+in order to generate valid Huffman tables.  This is necessary because our
+default Huffman tables only cover 8-bit data.  If you need to output 12-bit
+files in one pass, you'll have to supply suitable default Huffman tables.
+You may also want to supply your own DCT quantization tables; the existing
+quality-scaling code has been developed for 8-bit use, and probably doesn't
+generate especially good tables for 12-bit.
+
+The maximum number of components (color channels) in the image is determined
+by MAX_COMPONENTS.  The JPEG standard allows up to 255 components, but we
+expect that few applications will need more than four or so.
+
+On machines with unusual data type sizes, you may be able to improve
+performance or reduce memory space by tweaking the various typedefs in
+jmorecfg.h.  In particular, on some RISC CPUs, access to arrays of "short"s
+is quite slow; consider trading memory for speed by making JCOEF, INT16, and
+UINT16 be "int" or "unsigned int".  UINT8 is also a candidate to become int.
+You probably don't want to make JSAMPLE be int unless you have lots of memory
+to burn.
+
+You can reduce the size of the library by compiling out various optional
+functions.  To do this, undefine xxx_SUPPORTED symbols as necessary.
+
+You can also save a few K by not having text error messages in the library;
+the standard error message table occupies about 5 KB.  This is particularly
+reasonable for embedded applications where there's no good way to display
+a message anyway.  To do this, remove the creation of the message table
+(jpeg_std_message_table[]) from jerror.c, and alter format_message to do
+something reasonable without it.  You could output the numeric value of the
+message code number, for example.  If you do this, you can also save a couple
+more K by modifying the TRACEMSn() macros in jerror.h to expand to nothing;
+you don't need trace capability anyway, right?
+
+
+Portability considerations
+--------------------------
+
+The JPEG library has been written to be extremely portable; the sample
+applications cjpeg and djpeg are slightly less so.  This section summarizes
+the design goals in this area.  (If you encounter any bugs that cause the
+library to be less portable than is claimed here, we'd appreciate hearing
+about them.)
+
+The code works fine on ANSI C and C++ compilers, using any of the popular
+system include file setups, and some not-so-popular ones too.
+
+The code is not dependent on the exact sizes of the C data types.  As
+distributed, we make the assumptions that
+        char    is at least 8 bits wide
+        short   is at least 16 bits wide
+        int     is at least 16 bits wide
+        long    is at least 32 bits wide
+(These are the minimum requirements of the ANSI C standard.)  Wider types will
+work fine, although memory may be used inefficiently if char is much larger
+than 8 bits or short is much bigger than 16 bits.  The code should work
+equally well with 16- or 32-bit ints.
+
+In a system where these assumptions are not met, you may be able to make the
+code work by modifying the typedefs in jmorecfg.h.  However, you will probably
+have difficulty if int is less than 16 bits wide, since references to plain
+int abound in the code.
+
+char can be either signed or unsigned, although the code runs faster if an
+unsigned char type is available.  If char is wider than 8 bits, you will need
+to redefine JOCTET and/or provide custom data source/destination managers so
+that JOCTET represents exactly 8 bits of data on external storage.
+
+The JPEG library proper does not assume ASCII representation of characters.
+But some of the image file I/O modules in cjpeg/djpeg do have ASCII
+dependencies in file-header manipulation; so does cjpeg's select_file_type()
+routine.
+
+The JPEG library does not rely heavily on the C library.  In particular, C
+stdio is used only by the data source/destination modules and the error
+handler, all of which are application-replaceable.  (cjpeg/djpeg are more
+heavily dependent on stdio.)  malloc and free are called only from the memory
+manager "back end" module, so you can use a different memory allocator by
+replacing that one file.
+
+More info about porting the code may be gleaned by reading jconfig.txt,
+jmorecfg.h, and jinclude.h.
diff --git a/mac/jsimdcfg.inc b/mac/jsimdcfg.inc
deleted file mode 100644
index 9d4aede..0000000
--- a/mac/jsimdcfg.inc
+++ /dev/null
@@ -1,94 +0,0 @@
-;
-; Automatically generated include file from jsimdcfg.inc.h
-;
-;
-; -- jpeglib.h
-;
-%define DCTSIZE 8
-%define DCTSIZE2 64
-;
-; -- jmorecfg.h
-;
-%define RGB_RED 0
-%define RGB_GREEN 1
-%define RGB_BLUE 2
-%define RGB_PIXELSIZE 3
-%define EXT_RGB_RED 0
-%define EXT_RGB_GREEN 1
-%define EXT_RGB_BLUE 2
-%define EXT_RGB_PIXELSIZE 3
-%define EXT_RGBX_RED 0
-%define EXT_RGBX_GREEN 1
-%define EXT_RGBX_BLUE 2
-%define EXT_RGBX_PIXELSIZE 4
-%define EXT_BGR_RED 2
-%define EXT_BGR_GREEN 1
-%define EXT_BGR_BLUE 0
-%define EXT_BGR_PIXELSIZE 3
-%define EXT_BGRX_RED 2
-%define EXT_BGRX_GREEN 1
-%define EXT_BGRX_BLUE 0
-%define EXT_BGRX_PIXELSIZE 4
-%define EXT_XBGR_RED 3
-%define EXT_XBGR_GREEN 2
-%define EXT_XBGR_BLUE 1
-%define EXT_XBGR_PIXELSIZE 4
-%define EXT_XRGB_RED 1
-%define EXT_XRGB_GREEN 2
-%define EXT_XRGB_BLUE 3
-%define EXT_XRGB_PIXELSIZE 4
-%define RGBX_FILLER_0XFF 1
-; Representation of a single sample (pixel element value).
-; On this SIMD implementation, this must be 'unsigned char'.
-;
-%define JSAMPLE byte ; unsigned char
-%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
-%define CENTERJSAMPLE 128
-; Representation of a DCT frequency coefficient.
-; On this SIMD implementation, this must be 'short'.
-;
-%define JCOEF word ; short
-%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
-; Datatype used for image dimensions.
-; On this SIMD implementation, this must be 'unsigned int'.
-;
-%define JDIMENSION dword ; unsigned int
-%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
-%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
-%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
-%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
-%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
-%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
-%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
-%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
-%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
-;
-; -- jdct.h
-;
-; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
-; the DCT is to be performed in-place in that buffer.
-; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
-;
-%define DCTELEM word ; short
-%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
-%define float FP32 ; float
-%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
-; To maximize parallelism, Type short is changed to short.
-;
-%define ISLOW_MULT_TYPE word ; must be short
-%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
-%define IFAST_MULT_TYPE word ; must be short
-%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
-%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
-%define FLOAT_MULT_TYPE FP32 ; must be float
-%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
-;
-; -- jsimd.h
-;
-%define JSIMD_NONE 0x00
-%define JSIMD_MMX 0x01
-%define JSIMD_3DNOW 0x02
-%define JSIMD_SSE 0x04
-%define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
diff --git a/rdbmp.c b/rdbmp.c
index ba9f728..eaa7086 100644
--- a/rdbmp.c
+++ b/rdbmp.c
@@ -6,7 +6,9 @@
  * Modified 2009-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Modified 2011 by Siarhei Siamashka.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in Microsoft "BMP"
  * format (MS Windows 3.x, OS/2 1.x, and OS/2 2.x flavors).
@@ -24,7 +26,7 @@
  * This code contributed by James Arthur Boucher.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef BMP_SUPPORTED
 
@@ -33,37 +35,37 @@
 
 #ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
-#define UCH(x)	((int) (x))
+#define UCH(x)  ((int) (x))
 #else /* !HAVE_UNSIGNED_CHAR */
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 typedef char U_CHAR;
-#define UCH(x)	((int) (x))
+#define UCH(x)  ((int) (x))
 #else
 typedef char U_CHAR;
-#define UCH(x)	((int) (x) & 0xFF)
+#define UCH(x)  ((int) (x) & 0xFF)
 #endif
 #endif /* HAVE_UNSIGNED_CHAR */
 
 
-#define	ReadOK(file,buffer,len)	(JFREAD(file,buffer,len) == ((size_t) (len)))
+#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len)))
 
 
 /* Private version of data source object */
 
-typedef struct _bmp_source_struct * bmp_source_ptr;
+typedef struct _bmp_source_struct *bmp_source_ptr;
 
 typedef struct _bmp_source_struct {
   struct cjpeg_source_struct pub; /* public fields */
 
-  j_compress_ptr cinfo;		/* back link saves passing separate parm */
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
 
-  JSAMPARRAY colormap;		/* BMP colormap (converted to my format) */
+  JSAMPARRAY colormap;          /* BMP colormap (converted to my format) */
 
-  jvirt_sarray_ptr whole_image;	/* Needed to reverse row order */
-  JDIMENSION source_row;	/* Current source row number */
-  JDIMENSION row_width;		/* Physical width of scanlines in file */
+  jvirt_sarray_ptr whole_image; /* Needed to reverse row order */
+  JDIMENSION source_row;        /* Current source row number */
+  JDIMENSION row_width;         /* Physical width of scanlines in file */
 
-  int bits_per_pixel;		/* remembers 8- or 24-bit format */
+  int bits_per_pixel;           /* remembers 8- or 24-bit format */
 } bmp_source_struct;
 
 
@@ -140,7 +142,7 @@
   outptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
     t = GETJSAMPLE(*inptr++);
-    *outptr++ = colormap[0][t];	/* can omit GETJSAMPLE() safely */
+    *outptr++ = colormap[0][t]; /* can omit GETJSAMPLE() safely */
     *outptr++ = colormap[1][t];
     *outptr++ = colormap[2][t];
   }
@@ -170,7 +172,7 @@
   inptr = image_ptr[0];
   outptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
-    outptr[2] = *inptr++;	/* can omit GETJSAMPLE() safely */
+    outptr[2] = *inptr++;       /* can omit GETJSAMPLE() safely */
     outptr[1] = *inptr++;
     outptr[0] = *inptr++;
     outptr += 3;
@@ -200,10 +202,10 @@
   inptr = image_ptr[0];
   outptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
-    outptr[2] = *inptr++;	/* can omit GETJSAMPLE() safely */
+    outptr[2] = *inptr++;       /* can omit GETJSAMPLE() safely */
     outptr[1] = *inptr++;
     outptr[0] = *inptr++;
-    inptr++;			/* skip the 4th byte (Alpha channel) */
+    inptr++;                    /* skip the 4th byte (Alpha channel) */
     outptr += 3;
   }
 
@@ -279,22 +281,22 @@
   bmp_source_ptr source = (bmp_source_ptr) sinfo;
   U_CHAR bmpfileheader[14];
   U_CHAR bmpinfoheader[64];
-#define GET_2B(array,offset)  ((unsigned int) UCH(array[offset]) + \
-			       (((unsigned int) UCH(array[offset+1])) << 8))
-#define GET_4B(array,offset)  ((INT32) UCH(array[offset]) + \
-			       (((INT32) UCH(array[offset+1])) << 8) + \
-			       (((INT32) UCH(array[offset+2])) << 16) + \
-			       (((INT32) UCH(array[offset+3])) << 24))
-  INT32 bfOffBits;
-  INT32 headerSize;
-  INT32 biWidth;
-  INT32 biHeight;
-  unsigned int biPlanes;
-  INT32 biCompression;
-  INT32 biXPelsPerMeter,biYPelsPerMeter;
-  INT32 biClrUsed = 0;
-  int mapentrysize = 0;		/* 0 indicates no colormap */
-  INT32 bPad;
+#define GET_2B(array,offset)  ((unsigned short) UCH(array[offset]) + \
+                               (((unsigned short) UCH(array[offset+1])) << 8))
+#define GET_4B(array,offset)  ((unsigned int) UCH(array[offset]) + \
+                               (((unsigned int) UCH(array[offset+1])) << 8) + \
+                               (((unsigned int) UCH(array[offset+2])) << 16) + \
+                               (((unsigned int) UCH(array[offset+3])) << 24))
+  unsigned int bfOffBits;
+  unsigned int headerSize;
+  int biWidth;
+  int biHeight;
+  unsigned short biPlanes;
+  unsigned int biCompression;
+  int biXPelsPerMeter,biYPelsPerMeter;
+  unsigned int biClrUsed = 0;
+  int mapentrysize = 0;         /* 0 indicates no colormap */
+  int bPad;
   JDIMENSION row_width;
 
   /* Read and verify the bitmap file header */
@@ -302,7 +304,7 @@
     ERREXIT(cinfo, JERR_INPUT_EOF);
   if (GET_2B(bmpfileheader,0) != 0x4D42) /* 'BM' */
     ERREXIT(cinfo, JERR_BMP_NOT);
-  bfOffBits = (INT32) GET_4B(bmpfileheader,10);
+  bfOffBits = GET_4B(bmpfileheader,10);
   /* We ignore the remaining fileheader fields */
 
   /* The infoheader might be 12 bytes (OS/2 1.x), 40 bytes (Windows),
@@ -310,27 +312,27 @@
    */
   if (! ReadOK(source->pub.input_file, bmpinfoheader, 4))
     ERREXIT(cinfo, JERR_INPUT_EOF);
-  headerSize = (INT32) GET_4B(bmpinfoheader,0);
+  headerSize = GET_4B(bmpinfoheader,0);
   if (headerSize < 12 || headerSize > 64)
     ERREXIT(cinfo, JERR_BMP_BADHEADER);
   if (! ReadOK(source->pub.input_file, bmpinfoheader+4, headerSize-4))
     ERREXIT(cinfo, JERR_INPUT_EOF);
 
-  switch ((int) headerSize) {
+  switch (headerSize) {
   case 12:
     /* Decode OS/2 1.x header (Microsoft calls this a BITMAPCOREHEADER) */
-    biWidth = (INT32) GET_2B(bmpinfoheader,4);
-    biHeight = (INT32) GET_2B(bmpinfoheader,6);
+    biWidth = (int) GET_2B(bmpinfoheader,4);
+    biHeight = (int) GET_2B(bmpinfoheader,6);
     biPlanes = GET_2B(bmpinfoheader,8);
     source->bits_per_pixel = (int) GET_2B(bmpinfoheader,10);
 
     switch (source->bits_per_pixel) {
-    case 8:			/* colormapped image */
-      mapentrysize = 3;		/* OS/2 uses RGBTRIPLE colormap */
-      TRACEMS2(cinfo, 1, JTRC_BMP_OS2_MAPPED, (int) biWidth, (int) biHeight);
+    case 8:                     /* colormapped image */
+      mapentrysize = 3;         /* OS/2 uses RGBTRIPLE colormap */
+      TRACEMS2(cinfo, 1, JTRC_BMP_OS2_MAPPED, biWidth, biHeight);
       break;
-    case 24:			/* RGB image */
-      TRACEMS2(cinfo, 1, JTRC_BMP_OS2, (int) biWidth, (int) biHeight);
+    case 24:                    /* RGB image */
+      TRACEMS2(cinfo, 1, JTRC_BMP_OS2, biWidth, biHeight);
       break;
     default:
       ERREXIT(cinfo, JERR_BMP_BADDEPTH);
@@ -341,26 +343,26 @@
   case 64:
     /* Decode Windows 3.x header (Microsoft calls this a BITMAPINFOHEADER) */
     /* or OS/2 2.x header, which has additional fields that we ignore */
-    biWidth = GET_4B(bmpinfoheader,4);
-    biHeight = GET_4B(bmpinfoheader,8);
+    biWidth = (int) GET_4B(bmpinfoheader,4);
+    biHeight = (int) GET_4B(bmpinfoheader,8);
     biPlanes = GET_2B(bmpinfoheader,12);
     source->bits_per_pixel = (int) GET_2B(bmpinfoheader,14);
     biCompression = GET_4B(bmpinfoheader,16);
-    biXPelsPerMeter = GET_4B(bmpinfoheader,24);
-    biYPelsPerMeter = GET_4B(bmpinfoheader,28);
+    biXPelsPerMeter = (int) GET_4B(bmpinfoheader,24);
+    biYPelsPerMeter = (int) GET_4B(bmpinfoheader,28);
     biClrUsed = GET_4B(bmpinfoheader,32);
     /* biSizeImage, biClrImportant fields are ignored */
 
     switch (source->bits_per_pixel) {
-    case 8:			/* colormapped image */
-      mapentrysize = 4;		/* Windows uses RGBQUAD colormap */
-      TRACEMS2(cinfo, 1, JTRC_BMP_MAPPED, (int) biWidth, (int) biHeight);
+    case 8:                     /* colormapped image */
+      mapentrysize = 4;         /* Windows uses RGBQUAD colormap */
+      TRACEMS2(cinfo, 1, JTRC_BMP_MAPPED, biWidth, biHeight);
       break;
-    case 24:			/* RGB image */
-      TRACEMS2(cinfo, 1, JTRC_BMP, (int) biWidth, (int) biHeight);
+    case 24:                    /* RGB image */
+      TRACEMS2(cinfo, 1, JTRC_BMP, biWidth, biHeight);
       break;
-    case 32:			/* RGB image + Alpha channel */
-      TRACEMS2(cinfo, 1, JTRC_BMP, (int) biWidth, (int) biHeight);
+    case 32:                    /* RGB image + Alpha channel */
+      TRACEMS2(cinfo, 1, JTRC_BMP, biWidth, biHeight);
       break;
     default:
       ERREXIT(cinfo, JERR_BMP_BADDEPTH);
@@ -373,7 +375,7 @@
       /* Set JFIF density parameters from the BMP data */
       cinfo->X_density = (UINT16) (biXPelsPerMeter/100); /* 100 cm per meter */
       cinfo->Y_density = (UINT16) (biYPelsPerMeter/100);
-      cinfo->density_unit = 2;	/* dots/cm */
+      cinfo->density_unit = 2;  /* dots/cm */
     }
     break;
   default:
@@ -392,7 +394,7 @@
   /* Read the colormap, if any */
   if (mapentrysize > 0) {
     if (biClrUsed <= 0)
-      biClrUsed = 256;		/* assume it's 256 */
+      biClrUsed = 256;          /* assume it's 256 */
     else if (biClrUsed > 256)
       ERREXIT(cinfo, JERR_BMP_BADCMAP);
     /* Allocate space to store the colormap */
@@ -406,7 +408,7 @@
   }
 
   /* Skip any remaining pad bytes */
-  if (bPad < 0)			/* incorrect bfOffBits value? */
+  if (bPad < 0)                 /* incorrect bfOffBits value? */
     ERREXIT(cinfo, JERR_BMP_BADHEADER);
   while (--bPad >= 0) {
     (void) read_byte(source);
@@ -469,8 +471,8 @@
   /* Create module interface object */
   source = (bmp_source_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(bmp_source_struct));
-  source->cinfo = cinfo;	/* make back link for subroutines */
+                                  sizeof(bmp_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_bmp;
   source->pub.finish_input = finish_input_bmp;
diff --git a/rdcolmap.c b/rdcolmap.c
index 42b3437..ed8ca3b 100644
--- a/rdcolmap.c
+++ b/rdcolmap.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file implements djpeg's "-map file" switch.  It reads a source image
  * and constructs a colormap to be supplied to the JPEG decompressor.
@@ -21,9 +22,9 @@
  * currently implemented.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
-#ifdef QUANT_2PASS_SUPPORTED	/* otherwise can't quantize to supplied map */
+#ifdef QUANT_2PASS_SUPPORTED    /* otherwise can't quantize to supplied map */
 
 /* Portions of this code are based on the PBMPLUS library, which is:
 **
@@ -54,9 +55,9 @@
   /* Check for duplicate color. */
   for (index = 0; index < ncolors; index++) {
     if (GETJSAMPLE(colormap0[index]) == R &&
-	GETJSAMPLE(colormap1[index]) == G &&
-	GETJSAMPLE(colormap2[index]) == B)
-      return;			/* color is already in map */
+        GETJSAMPLE(colormap1[index]) == G &&
+        GETJSAMPLE(colormap2[index]) == B)
+      return;                   /* color is already in map */
   }
 
   /* Check for map overflow. */
@@ -76,7 +77,7 @@
  */
 
 LOCAL(void)
-read_gif_map (j_decompress_ptr cinfo, FILE * infile)
+read_gif_map (j_decompress_ptr cinfo, FILE *infile)
 {
   int header[13];
   int i, colormaplen;
@@ -107,9 +108,9 @@
     if (R == EOF || G == EOF || B == EOF)
       ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
     add_map_entry(cinfo,
-		  R << (BITS_IN_JSAMPLE-8),
-		  G << (BITS_IN_JSAMPLE-8),
-		  B << (BITS_IN_JSAMPLE-8));
+                  R << (BITS_IN_JSAMPLE-8),
+                  G << (BITS_IN_JSAMPLE-8),
+                  B << (BITS_IN_JSAMPLE-8));
   }
 }
 
@@ -118,12 +119,12 @@
 
 
 LOCAL(int)
-pbm_getc (FILE * infile)
+pbm_getc (FILE *infile)
 /* Read next char, skipping over any comments */
 /* A comment/newline sequence is returned as a newline */
 {
   register int ch;
-  
+
   ch = getc(infile);
   if (ch == '#') {
     do {
@@ -135,7 +136,7 @@
 
 
 LOCAL(unsigned int)
-read_pbm_integer (j_decompress_ptr cinfo, FILE * infile)
+read_pbm_integer (j_decompress_ptr cinfo, FILE *infile)
 /* Read an unsigned decimal integer from the PPM file */
 /* Swallows one trailing character after the integer */
 /* Note that on a 16-bit-int machine, only values up to 64k can be read. */
@@ -143,17 +144,17 @@
 {
   register int ch;
   register unsigned int val;
-  
+
   /* Skip any leading whitespace */
   do {
     ch = pbm_getc(infile);
     if (ch == EOF)
       ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
   } while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
-  
+
   if (ch < '0' || ch > '9')
     ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
-  
+
   val = ch - '0';
   while ((ch = pbm_getc(infile)) >= '0' && ch <= '9') {
     val *= 10;
@@ -168,14 +169,14 @@
  */
 
 LOCAL(void)
-read_ppm_map (j_decompress_ptr cinfo, FILE * infile)
+read_ppm_map (j_decompress_ptr cinfo, FILE *infile)
 {
   int c;
   unsigned int w, h, maxval, row, col;
   int R, G, B;
 
   /* Initial 'P' has already been read by read_color_map */
-  c = getc(infile);		/* save format discriminator for a sec */
+  c = getc(infile);             /* save format discriminator for a sec */
 
   /* while we fetch the remaining header info */
   w = read_pbm_integer(cinfo, infile);
@@ -190,26 +191,26 @@
     ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
 
   switch (c) {
-  case '3':			/* it's a text-format PPM file */
+  case '3':                     /* it's a text-format PPM file */
     for (row = 0; row < h; row++) {
       for (col = 0; col < w; col++) {
-	R = read_pbm_integer(cinfo, infile);
-	G = read_pbm_integer(cinfo, infile);
-	B = read_pbm_integer(cinfo, infile);
-	add_map_entry(cinfo, R, G, B);
+        R = read_pbm_integer(cinfo, infile);
+        G = read_pbm_integer(cinfo, infile);
+        B = read_pbm_integer(cinfo, infile);
+        add_map_entry(cinfo, R, G, B);
       }
     }
     break;
 
-  case '6':			/* it's a raw-format PPM file */
+  case '6':                     /* it's a raw-format PPM file */
     for (row = 0; row < h; row++) {
       for (col = 0; col < w; col++) {
-	R = getc(infile);
-	G = getc(infile);
-	B = getc(infile);
-	if (R == EOF || G == EOF || B == EOF)
-	  ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
-	add_map_entry(cinfo, R, G, B);
+        R = getc(infile);
+        G = getc(infile);
+        B = getc(infile);
+        if (R == EOF || G == EOF || B == EOF)
+          ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+        add_map_entry(cinfo, R, G, B);
       }
     }
     break;
@@ -228,7 +229,7 @@
  */
 
 GLOBAL(void)
-read_color_map (j_decompress_ptr cinfo, FILE * infile)
+read_color_map (j_decompress_ptr cinfo, FILE *infile)
 {
   /* Allocate space for a color map of maximum supported size. */
   cinfo->colormap = (*cinfo->mem->alloc_sarray)
diff --git a/rdgif.c b/rdgif.c
index b27c167..ce689f7 100644
--- a/rdgif.c
+++ b/rdgif.c
@@ -3,7 +3,8 @@
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in GIF format.
  *
@@ -19,7 +20,7 @@
  *    CompuServe Incorporated."
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef GIF_SUPPORTED
 
@@ -32,7 +33,7 @@
 {
   fprintf(stderr, "GIF input is unsupported for legal reasons.  Sorry.\n");
   exit(EXIT_FAILURE);
-  return NULL;			/* keep compiler happy */
+  return NULL;                  /* keep compiler happy */
 }
 
 #endif /* GIF_SUPPORTED */
diff --git a/rdjpgcom.1 b/rdjpgcom.1
new file mode 100644
index 0000000..97611df
--- /dev/null
+++ b/rdjpgcom.1
@@ -0,0 +1,63 @@
+.TH RDJPGCOM 1 "02 April 2009"
+.SH NAME
+rdjpgcom \- display text comments from a JPEG file
+.SH SYNOPSIS
+.B rdjpgcom
+[
+.B \-raw
+]
+[
+.B \-verbose
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B rdjpgcom
+reads the named JPEG/JFIF file, or the standard input if no file is named,
+and prints any text comments found in the file on the standard output.
+.PP
+The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
+Although the standard doesn't actually define what COM blocks are for, they
+are widely used to hold user-supplied text strings.  This lets you add
+annotations, titles, index terms, etc. to your JPEG files, and later retrieve
+them as text.  COM blocks do not interfere with the image stored in the JPEG
+file.  The maximum size of a COM block is 64K, but you can have as many of
+them as you like in one JPEG file.
+.SH OPTIONS
+.TP
+.B \-raw
+Normally
+.B rdjpgcom
+escapes non-printable characters in comments, for security reasons.
+This option avoids that.
+.TP
+.B \-verbose
+Causes
+.B rdjpgcom
+to also display the JPEG image dimensions.
+.PP
+Switch names may be abbreviated, and are not case sensitive.
+.SH HINTS
+.B rdjpgcom
+does not depend on the IJG JPEG library.  Its source code is intended as an
+illustration of the minimum amount of code required to parse a JPEG file
+header correctly.
+.PP
+In
+.B \-verbose
+mode,
+.B rdjpgcom
+will also attempt to print the contents of any "APP12" markers as text.
+Some digital cameras produce APP12 markers containing useful textual
+information.  If you like, you can modify the source code to print
+other APPn marker types as well.
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR djpeg (1),
+.BR jpegtran (1),
+.BR wrjpgcom (1)
+.SH AUTHOR
+Independent JPEG Group
diff --git a/rdjpgcom.c b/rdjpgcom.c
index 3719154..b3076dd 100644
--- a/rdjpgcom.c
+++ b/rdjpgcom.c
@@ -1,10 +1,13 @@
 /*
  * rdjpgcom.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 2009 by Bill Allombert, Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a very simple stand-alone application that displays
  * the text in COM (comment) markers in a JFIF file.
@@ -12,49 +15,41 @@
  * JPEG markers.
  */
 
-#define JPEG_CJPEG_DJPEG	/* to get the command-line config symbols */
-#include "jinclude.h"		/* get auto-config symbols, <stdio.h> */
+#define JPEG_CJPEG_DJPEG        /* to get the command-line config symbols */
+#include "jinclude.h"           /* get auto-config symbols, <stdio.h> */
 
 #ifdef HAVE_LOCALE_H
-#include <locale.h>		/* Bill Allombert: use locale for isprint */
+#include <locale.h>             /* Bill Allombert: use locale for isprint */
 #endif
-#include <ctype.h>		/* to declare isupper(), tolower() */
+#include <ctype.h>              /* to declare isupper(), tolower() */
 #ifdef USE_SETMODE
-#include <fcntl.h>		/* to declare setmode()'s parameter macros */
+#include <fcntl.h>              /* to declare setmode()'s parameter macros */
 /* If you have setmode() but not <io.h>, just delete this line: */
-#include <io.h>			/* to declare setmode() */
+#include <io.h>                 /* to declare setmode() */
 #endif
 
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef USE_CCOMMAND             /* command-line reader for Macintosh */
 #ifdef __MWERKS__
 #include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
+#include <console.h>            /* ... and this */
 #endif
 #ifdef THINK_C
-#include <console.h>		/* Think declares it here */
+#include <console.h>            /* Think declares it here */
 #endif
 #endif
 
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
+#ifdef DONT_USE_B_MODE          /* define mode parameters for fopen() */
+#define READ_BINARY     "r"
 #else
-#ifdef VMS			/* VMS is very nonstandard */
-#define READ_BINARY	"rb", "ctx=stm"
-#else				/* standard ANSI-compliant case */
-#define READ_BINARY	"rb"
-#endif
+#define READ_BINARY     "rb"
 #endif
 
-#ifndef EXIT_FAILURE		/* define exit() codes if not provided */
+#ifndef EXIT_FAILURE            /* define exit() codes if not provided */
 #define EXIT_FAILURE  1
 #endif
 #ifndef EXIT_SUCCESS
-#ifdef VMS
-#define EXIT_SUCCESS  1		/* VMS is very nonstandard */
-#else
 #define EXIT_SUCCESS  0
 #endif
-#endif
 
 
 /*
@@ -62,7 +57,7 @@
  * To reuse this code in another application, you might need to change these.
  */
 
-static FILE * infile;		/* input JPEG file */
+static FILE *infile;            /* input JPEG file */
 
 /* Return next input byte, or EOF if no more */
 #define NEXTBYTE()  getc(infile)
@@ -107,11 +102,11 @@
  * in this program.  (See jdmarker.c for a more complete list.)
  */
 
-#define M_SOF0  0xC0		/* Start Of Frame N */
-#define M_SOF1  0xC1		/* N indicates which compression process */
-#define M_SOF2  0xC2		/* Only SOF0-SOF2 are now in common use */
+#define M_SOF0  0xC0            /* Start Of Frame N */
+#define M_SOF1  0xC1            /* N indicates which compression process */
+#define M_SOF2  0xC2            /* Only SOF0-SOF2 are now in common use */
 #define M_SOF3  0xC3
-#define M_SOF5  0xC5		/* NB: codes C4 and CC are NOT SOF markers */
+#define M_SOF5  0xC5            /* NB: codes C4 and CC are NOT SOF markers */
 #define M_SOF6  0xC6
 #define M_SOF7  0xC7
 #define M_SOF9  0xC9
@@ -120,12 +115,12 @@
 #define M_SOF13 0xCD
 #define M_SOF14 0xCE
 #define M_SOF15 0xCF
-#define M_SOI   0xD8		/* Start Of Image (beginning of datastream) */
-#define M_EOI   0xD9		/* End Of Image (end of datastream) */
-#define M_SOS   0xDA		/* Start Of Scan (begins compressed data) */
-#define M_APP0	0xE0		/* Application-specific marker, type N */
-#define M_APP12	0xEC		/* (we don't bother to list all 16 APPn's) */
-#define M_COM   0xFE		/* COMment */
+#define M_SOI   0xD8            /* Start Of Image (beginning of datastream) */
+#define M_EOI   0xD9            /* End Of Image (end of datastream) */
+#define M_SOS   0xDA            /* Start Of Scan (begins compressed data) */
+#define M_APP0  0xE0            /* Application-specific marker, type N */
+#define M_APP12 0xEC            /* (we don't bother to list all 16 APPn's) */
+#define M_COM   0xFE            /* COMment */
 
 
 /*
@@ -253,7 +248,7 @@
       printf("\n");
     } else if (ch == '\n') {
       if (lastch != '\r')
-	printf("\n");
+        printf("\n");
     } else if (ch == '\\') {
       printf("\\\\");
     } else if (isprint(ch)) {
@@ -284,10 +279,10 @@
   unsigned int length;
   unsigned int image_height, image_width;
   int data_precision, num_components;
-  const char * process;
+  const char *process;
   int ci;
 
-  length = read_2_bytes();	/* usual parameter length count */
+  length = read_2_bytes();      /* usual parameter length count */
 
   data_precision = read_1_byte();
   image_height = read_2_bytes();
@@ -295,33 +290,33 @@
   num_components = read_1_byte();
 
   switch (marker) {
-  case M_SOF0:	process = "Baseline";  break;
-  case M_SOF1:	process = "Extended sequential";  break;
-  case M_SOF2:	process = "Progressive";  break;
-  case M_SOF3:	process = "Lossless";  break;
-  case M_SOF5:	process = "Differential sequential";  break;
-  case M_SOF6:	process = "Differential progressive";  break;
-  case M_SOF7:	process = "Differential lossless";  break;
-  case M_SOF9:	process = "Extended sequential, arithmetic coding";  break;
-  case M_SOF10:	process = "Progressive, arithmetic coding";  break;
-  case M_SOF11:	process = "Lossless, arithmetic coding";  break;
-  case M_SOF13:	process = "Differential sequential, arithmetic coding";  break;
-  case M_SOF14:	process = "Differential progressive, arithmetic coding"; break;
-  case M_SOF15:	process = "Differential lossless, arithmetic coding";  break;
-  default:	process = "Unknown";  break;
+  case M_SOF0:  process = "Baseline";  break;
+  case M_SOF1:  process = "Extended sequential";  break;
+  case M_SOF2:  process = "Progressive";  break;
+  case M_SOF3:  process = "Lossless";  break;
+  case M_SOF5:  process = "Differential sequential";  break;
+  case M_SOF6:  process = "Differential progressive";  break;
+  case M_SOF7:  process = "Differential lossless";  break;
+  case M_SOF9:  process = "Extended sequential, arithmetic coding";  break;
+  case M_SOF10: process = "Progressive, arithmetic coding";  break;
+  case M_SOF11: process = "Lossless, arithmetic coding";  break;
+  case M_SOF13: process = "Differential sequential, arithmetic coding";  break;
+  case M_SOF14: process = "Differential progressive, arithmetic coding"; break;
+  case M_SOF15: process = "Differential lossless, arithmetic coding";  break;
+  default:      process = "Unknown";  break;
   }
 
   printf("JPEG image is %uw * %uh, %d color components, %d bits per sample\n",
-	 image_width, image_height, num_components, data_precision);
+         image_width, image_height, num_components, data_precision);
   printf("JPEG process: %s\n", process);
 
   if (length != (unsigned int) (8 + num_components * 3))
     ERREXIT("Bogus SOF marker length");
 
   for (ci = 0; ci < num_components; ci++) {
-    (void) read_1_byte();	/* Component ID code */
-    (void) read_1_byte();	/* H, V sampling factors */
-    (void) read_1_byte();	/* Quantization table number */
+    (void) read_1_byte();       /* Component ID code */
+    (void) read_1_byte();       /* H, V sampling factors */
+    (void) read_1_byte();       /* Quantization table number */
   }
 }
 
@@ -352,29 +347,29 @@
       /* Note that marker codes 0xC4, 0xC8, 0xCC are not, and must not be,
        * treated as SOFn.  C4 in particular is actually DHT.
        */
-    case M_SOF0:		/* Baseline */
-    case M_SOF1:		/* Extended sequential, Huffman */
-    case M_SOF2:		/* Progressive, Huffman */
-    case M_SOF3:		/* Lossless, Huffman */
-    case M_SOF5:		/* Differential sequential, Huffman */
-    case M_SOF6:		/* Differential progressive, Huffman */
-    case M_SOF7:		/* Differential lossless, Huffman */
-    case M_SOF9:		/* Extended sequential, arithmetic */
-    case M_SOF10:		/* Progressive, arithmetic */
-    case M_SOF11:		/* Lossless, arithmetic */
-    case M_SOF13:		/* Differential sequential, arithmetic */
-    case M_SOF14:		/* Differential progressive, arithmetic */
-    case M_SOF15:		/* Differential lossless, arithmetic */
+    case M_SOF0:                /* Baseline */
+    case M_SOF1:                /* Extended sequential, Huffman */
+    case M_SOF2:                /* Progressive, Huffman */
+    case M_SOF3:                /* Lossless, Huffman */
+    case M_SOF5:                /* Differential sequential, Huffman */
+    case M_SOF6:                /* Differential progressive, Huffman */
+    case M_SOF7:                /* Differential lossless, Huffman */
+    case M_SOF9:                /* Extended sequential, arithmetic */
+    case M_SOF10:               /* Progressive, arithmetic */
+    case M_SOF11:               /* Lossless, arithmetic */
+    case M_SOF13:               /* Differential sequential, arithmetic */
+    case M_SOF14:               /* Differential progressive, arithmetic */
+    case M_SOF15:               /* Differential lossless, arithmetic */
       if (verbose)
-	process_SOFn(marker);
+        process_SOFn(marker);
       else
-	skip_variable();
+        skip_variable();
       break;
 
-    case M_SOS:			/* stop before hitting compressed data */
+    case M_SOS:                 /* stop before hitting compressed data */
       return marker;
 
-    case M_EOI:			/* in case it's a tables-only JPEG stream */
+    case M_EOI:                 /* in case it's a tables-only JPEG stream */
       return marker;
 
     case M_COM:
@@ -386,14 +381,14 @@
        * APP12 markers, so we print those out too when in -verbose mode.
        */
       if (verbose) {
-	printf("APP12 contains:\n");
-	process_COM(raw);
+        printf("APP12 contains:\n");
+        process_COM(raw);
       } else
-	skip_variable();
+        skip_variable();
       break;
 
-    default:			/* Anything else just gets skipped */
-      skip_variable();		/* we assume it has a parameter count... */
+    default:                    /* Anything else just gets skipped */
+      skip_variable();          /* we assume it has a parameter count... */
       break;
     }
   } /* end loop */
@@ -402,7 +397,7 @@
 
 /* Command line parsing code */
 
-static const char * progname;	/* program name for error messages */
+static const char *progname;    /* program name for error messages */
 
 
 static void
@@ -422,7 +417,7 @@
 
 
 static int
-keymatch (char * arg, const char * keyword, int minchars)
+keymatch (char *arg, const char *keyword, int minchars)
 /* Case-insensitive matching of (possibly abbreviated) keyword switches. */
 /* keyword is the constant keyword (must be lower case already), */
 /* minchars is length of minimum legal abbreviation. */
@@ -432,17 +427,17 @@
 
   while ((ca = *arg++) != '\0') {
     if ((ck = *keyword++) == '\0')
-      return 0;			/* arg longer than keyword, no good */
-    if (isupper(ca))		/* force arg to lcase (assume ck is already) */
+      return 0;                 /* arg longer than keyword, no good */
+    if (isupper(ca))            /* force arg to lcase (assume ck is already) */
       ca = tolower(ca);
     if (ca != ck)
-      return 0;			/* no good */
-    nmatched++;			/* count matched characters */
+      return 0;                 /* no good */
+    nmatched++;                 /* count matched characters */
   }
   /* reached end of argument; fail if it's too short for unique abbrev */
   if (nmatched < minchars)
     return 0;
-  return 1;			/* A-OK */
+  return 1;                     /* A-OK */
 }
 
 
@@ -454,7 +449,7 @@
 main (int argc, char **argv)
 {
   int argn;
-  char * arg;
+  char *arg;
   int verbose = 0, raw = 0;
 
   /* On Mac, fetch a command line. */
@@ -464,14 +459,14 @@
 
   progname = argv[0];
   if (progname == NULL || progname[0] == 0)
-    progname = "rdjpgcom";	/* in case C library doesn't provide it */
+    progname = "rdjpgcom";      /* in case C library doesn't provide it */
 
   /* Parse switches, if any */
   for (argn = 1; argn < argc; argn++) {
     arg = argv[argn];
     if (arg[0] != '-')
-      break;			/* not switch, must be file name */
-    arg++;			/* advance over '-' */
+      break;                    /* not switch, must be file name */
+    arg++;                      /* advance over '-' */
     if (keymatch(arg, "verbose", 1)) {
       verbose++;
     } else if (keymatch(arg, "raw", 1)) {
@@ -493,10 +488,10 @@
     }
   } else {
     /* default input file is stdin */
-#ifdef USE_SETMODE		/* need to hack file mode? */
+#ifdef USE_SETMODE              /* need to hack file mode? */
     setmode(fileno(stdin), O_BINARY);
 #endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
+#ifdef USE_FDOPEN               /* need to re-open in binary mode? */
     if ((infile = fdopen(fileno(stdin), READ_BINARY)) == NULL) {
       fprintf(stderr, "%s: can't open stdin\n", progname);
       exit(EXIT_FAILURE);
@@ -511,5 +506,5 @@
 
   /* All done. */
   exit(EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
+  return 0;                     /* suppress no-return-value warnings */
 }
diff --git a/rdppm.c b/rdppm.c
index a757022..aef4923 100644
--- a/rdppm.c
+++ b/rdppm.c
@@ -1,10 +1,13 @@
 /*
  * rdppm.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2009 by Bill Allombert, Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in PPM/PGM format.
  * The extended 2-byte-per-sample raw PPM/PGM formats are supported.
@@ -19,7 +22,7 @@
  * the file is indeed PPM format).
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef PPM_SUPPORTED
 
@@ -41,30 +44,19 @@
 
 #ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
-#define UCH(x)	((int) (x))
+#define UCH(x)  ((int) (x))
 #else /* !HAVE_UNSIGNED_CHAR */
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 typedef char U_CHAR;
-#define UCH(x)	((int) (x))
+#define UCH(x)  ((int) (x))
 #else
 typedef char U_CHAR;
-#define UCH(x)	((int) (x) & 0xFF)
+#define UCH(x)  ((int) (x) & 0xFF)
 #endif
 #endif /* HAVE_UNSIGNED_CHAR */
 
 
-#define	ReadOK(file,buffer,len)	(JFREAD(file,buffer,len) == ((size_t) (len)))
-
-
-/*
- * On most systems, reading individual bytes with getc() is drastically less
- * efficient than buffering a row at a time with fread().  On PCs, we must
- * allocate the buffer in near data space, because we are assuming small-data
- * memory model, wherein fread() can't reach far memory.  If you need to
- * process very wide images on a PC, you might have to compile in large-memory
- * model, or else replace fread() with a getc() loop --- which will be much
- * slower.
- */
+#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len)))
 
 
 /* Private version of data source object */
@@ -72,17 +64,19 @@
 typedef struct {
   struct cjpeg_source_struct pub; /* public fields */
 
-  U_CHAR *iobuffer;		/* non-FAR pointer to I/O buffer */
-  JSAMPROW pixrow;		/* FAR pointer to same */
-  size_t buffer_width;		/* width of I/O buffer */
-  JSAMPLE *rescale;		/* => maxval-remapping array, or NULL */
+  /* Usually these two pointers point to the same place: */
+  U_CHAR *iobuffer;             /* fread's I/O buffer */
+  JSAMPROW pixrow;              /* compressor input buffer */
+  size_t buffer_width;          /* width of I/O buffer */
+  JSAMPLE *rescale;             /* => maxval-remapping array, or NULL */
+  int maxval;
 } ppm_source_struct;
 
-typedef ppm_source_struct * ppm_source_ptr;
+typedef ppm_source_struct *ppm_source_ptr;
 
 
 LOCAL(int)
-pbm_getc (FILE * infile)
+pbm_getc (FILE *infile)
 /* Read next char, skipping over any comments */
 /* A comment/newline sequence is returned as a newline */
 {
@@ -99,7 +93,7 @@
 
 
 LOCAL(unsigned int)
-read_pbm_integer (j_compress_ptr cinfo, FILE * infile)
+read_pbm_integer (j_compress_ptr cinfo, FILE *infile, unsigned int maxval)
 /* Read an unsigned decimal integer from the PPM file */
 /* Swallows one trailing character after the integer */
 /* Note that on a 16-bit-int machine, only values up to 64k can be read. */
@@ -123,6 +117,10 @@
     val *= 10;
     val += ch - '0';
   }
+
+  if (val > maxval)
+    ERREXIT(cinfo, JERR_PPM_TOOLARGE);
+
   return val;
 }
 
@@ -143,14 +141,15 @@
 /* This version is for reading text-format PGM files with any maxval */
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
-  FILE * infile = source->pub.input_file;
+  FILE *infile = source->pub.input_file;
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
+  unsigned int maxval = source->maxval;
 
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
   }
   return 1;
 }
@@ -161,16 +160,17 @@
 /* This version is for reading text-format PPM files with any maxval */
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
-  FILE * infile = source->pub.input_file;
+  FILE *infile = source->pub.input_file;
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
+  unsigned int maxval = source->maxval;
 
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
   }
   return 1;
 }
@@ -182,7 +182,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
 
@@ -203,7 +203,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
 
@@ -241,7 +241,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
 
@@ -265,7 +265,7 @@
 {
   ppm_source_ptr source = (ppm_source_ptr) sinfo;
   register JSAMPROW ptr;
-  register U_CHAR * bufferptr;
+  register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
 
@@ -308,10 +308,10 @@
 
   /* detect unsupported variants (ie, PBM) before trying to read header */
   switch (c) {
-  case '2':			/* it's a text-format PGM file */
-  case '3':			/* it's a text-format PPM file */
-  case '5':			/* it's a raw-format PGM file */
-  case '6':			/* it's a raw-format PPM file */
+  case '2':                     /* it's a text-format PGM file */
+  case '3':                     /* it's a text-format PPM file */
+  case '5':                     /* it's a raw-format PGM file */
+  case '6':                     /* it's a raw-format PPM file */
     break;
   default:
     ERREXIT(cinfo, JERR_PPM_NOT);
@@ -319,9 +319,9 @@
   }
 
   /* fetch the remaining header info */
-  w = read_pbm_integer(cinfo, source->pub.input_file);
-  h = read_pbm_integer(cinfo, source->pub.input_file);
-  maxval = read_pbm_integer(cinfo, source->pub.input_file);
+  w = read_pbm_integer(cinfo, source->pub.input_file, 65535);
+  h = read_pbm_integer(cinfo, source->pub.input_file, 65535);
+  maxval = read_pbm_integer(cinfo, source->pub.input_file, 65535);
 
   if (w <= 0 || h <= 0 || maxval <= 0) /* error check */
     ERREXIT(cinfo, JERR_PPM_NOT);
@@ -329,14 +329,15 @@
   cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
   cinfo->image_width = (JDIMENSION) w;
   cinfo->image_height = (JDIMENSION) h;
+  source->maxval = maxval;
 
   /* initialize flags to most common settings */
-  need_iobuffer = TRUE;		/* do we need an I/O buffer? */
-  use_raw_buffer = FALSE;	/* do we map input buffer onto I/O buffer? */
-  need_rescale = TRUE;		/* do we need a rescale array? */
+  need_iobuffer = TRUE;         /* do we need an I/O buffer? */
+  use_raw_buffer = FALSE;       /* do we map input buffer onto I/O buffer? */
+  need_rescale = TRUE;          /* do we need a rescale array? */
 
   switch (c) {
-  case '2':			/* it's a text-format PGM file */
+  case '2':                     /* it's a text-format PGM file */
     cinfo->input_components = 1;
     cinfo->in_color_space = JCS_GRAYSCALE;
     TRACEMS2(cinfo, 1, JTRC_PGM_TEXT, w, h);
@@ -344,7 +345,7 @@
     need_iobuffer = FALSE;
     break;
 
-  case '3':			/* it's a text-format PPM file */
+  case '3':                     /* it's a text-format PPM file */
     cinfo->input_components = 3;
     cinfo->in_color_space = JCS_RGB;
     TRACEMS2(cinfo, 1, JTRC_PPM_TEXT, w, h);
@@ -352,13 +353,13 @@
     need_iobuffer = FALSE;
     break;
 
-  case '5':			/* it's a raw-format PGM file */
+  case '5':                     /* it's a raw-format PGM file */
     cinfo->input_components = 1;
     cinfo->in_color_space = JCS_GRAYSCALE;
     TRACEMS2(cinfo, 1, JTRC_PGM, w, h);
     if (maxval > 255) {
       source->pub.get_pixel_rows = get_word_gray_row;
-    } else if (maxval == MAXJSAMPLE && SIZEOF(JSAMPLE) == SIZEOF(U_CHAR)) {
+    } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR)) {
       source->pub.get_pixel_rows = get_raw_row;
       use_raw_buffer = TRUE;
       need_rescale = FALSE;
@@ -367,13 +368,13 @@
     }
     break;
 
-  case '6':			/* it's a raw-format PPM file */
+  case '6':                     /* it's a raw-format PPM file */
     cinfo->input_components = 3;
     cinfo->in_color_space = JCS_RGB;
     TRACEMS2(cinfo, 1, JTRC_PPM, w, h);
     if (maxval > 255) {
       source->pub.get_pixel_rows = get_word_rgb_row;
-    } else if (maxval == MAXJSAMPLE && SIZEOF(JSAMPLE) == SIZEOF(U_CHAR)) {
+    } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR)) {
       source->pub.get_pixel_rows = get_raw_row;
       use_raw_buffer = TRUE;
       need_rescale = FALSE;
@@ -386,17 +387,16 @@
   /* Allocate space for I/O buffer: 1 or 3 bytes or words/pixel. */
   if (need_iobuffer) {
     source->buffer_width = (size_t) w * cinfo->input_components *
-      ((maxval<=255) ? SIZEOF(U_CHAR) : (2*SIZEOF(U_CHAR)));
+      ((maxval <= 255) ? sizeof(U_CHAR) : (2 * sizeof(U_CHAR)));
     source->iobuffer = (U_CHAR *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  source->buffer_width);
+                                  source->buffer_width);
   }
 
   /* Create compressor input buffer. */
   if (use_raw_buffer) {
     /* For unscaled raw-input case, we can just map it onto the I/O buffer. */
     /* Synthesize a JSAMPARRAY pointer structure */
-    /* Cast here implies near->far pointer conversion on PCs */
     source->pixrow = (JSAMPROW) source->iobuffer;
     source->pub.buffer = & source->pixrow;
     source->pub.buffer_height = 1;
@@ -410,16 +410,18 @@
 
   /* Compute the rescaling array if required. */
   if (need_rescale) {
-    INT32 val, half_maxval;
+    long val, half_maxval;
 
     /* On 16-bit-int machines we have to be careful of maxval = 65535 */
     source->rescale = (JSAMPLE *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  (size_t) (((long) maxval + 1L) * SIZEOF(JSAMPLE)));
+                                  (size_t) (((long) maxval + 1L) *
+                                            sizeof(JSAMPLE)));
     half_maxval = maxval / 2;
-    for (val = 0; val <= (INT32) maxval; val++) {
+    for (val = 0; val <= (long) maxval; val++) {
       /* The multiplication here must be done in 32 bits to avoid overflow */
-      source->rescale[val] = (JSAMPLE) ((val*MAXJSAMPLE + half_maxval)/maxval);
+      source->rescale[val] = (JSAMPLE) ((val * MAXJSAMPLE + half_maxval) /
+                                        maxval);
     }
   }
 }
@@ -448,7 +450,7 @@
   /* Create module interface object */
   source = (ppm_source_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(ppm_source_struct));
+                                  sizeof(ppm_source_struct));
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_ppm;
   source->pub.finish_input = finish_input_ppm;
diff --git a/rdrle.c b/rdrle.c
index 542bc37..226c528 100644
--- a/rdrle.c
+++ b/rdrle.c
@@ -1,9 +1,12 @@
 /*
  * rdrle.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in Utah RLE format.
  * The Utah Raster Toolkit library is required (version 3.1 or later).
@@ -19,7 +22,7 @@
  * with updates from Robert Hutchinson.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef RLE_SUPPORTED
 
@@ -38,7 +41,7 @@
 
 /*
  * We support the following types of RLE files:
- *   
+ *
  *   GRAYSCALE   - 8 bits, no colormap
  *   MAPPEDGRAY  - 8 bits, 1 channel colomap
  *   PSEUDOCOLOR - 8 bits, 3 channel colormap
@@ -59,16 +62,16 @@
  * then fetch the required row from the virtual array on subsequent calls.
  */
 
-typedef struct _rle_source_struct * rle_source_ptr;
+typedef struct _rle_source_struct *rle_source_ptr;
 
 typedef struct _rle_source_struct {
   struct cjpeg_source_struct pub; /* public fields */
 
   rle_kind visual;              /* actual type of input file */
   jvirt_sarray_ptr image;       /* virtual array to hold the image */
-  JDIMENSION row;		/* current row # in the virtual array */
+  JDIMENSION row;               /* current row # in the virtual array */
   rle_hdr header;               /* Input file information */
-  rle_pixel** rle_row;          /* holds a row returned by rle_getrow() */
+  rle_pixel **rle_row;          /* holds a row returned by rle_getrow() */
 
 } rle_source_struct;
 
@@ -111,10 +114,10 @@
   }
 
   /* Figure out what we have, set private vars and return values accordingly */
-  
+
   width  = source->header.xmax - source->header.xmin + 1;
   height = source->header.ymax - source->header.ymin + 1;
-  source->header.xmin = 0;		/* realign horizontally */
+  source->header.xmin = 0;              /* realign horizontally */
   source->header.xmax = width-1;
 
   cinfo->image_width      = width;
@@ -131,17 +134,17 @@
   } else if (source->header.ncolors == 1 && source->header.ncmap == 3) {
     source->visual     = PSEUDOCOLOR;
     TRACEMS3(cinfo, 1, JTRC_RLE_MAPPED, width, height,
-	     1 << source->header.cmaplen);
+             1 << source->header.cmaplen);
   } else if (source->header.ncolors == 3 && source->header.ncmap == 3) {
     source->visual     = TRUECOLOR;
     TRACEMS3(cinfo, 1, JTRC_RLE_FULLMAP, width, height,
-	     1 << source->header.cmaplen);
+             1 << source->header.cmaplen);
   } else if (source->header.ncolors == 3 && source->header.ncmap == 0) {
     source->visual     = DIRECTCOLOR;
     TRACEMS2(cinfo, 1, JTRC_RLE, width, height);
   } else
     ERREXIT(cinfo, JERR_RLE_UNSUPPORTED);
-  
+
   if (source->visual == GRAYSCALE || source->visual == MAPPEDGRAY) {
     cinfo->in_color_space   = JCS_GRAYSCALE;
     cinfo->input_components = 1;
@@ -213,7 +216,7 @@
   colormap = source->header.cmap;
   dest_row = source->pub.buffer[0];
   source->row--;
-  src_row = * (*cinfo->mem->access_virt_sarray)
+  src_row = *(*cinfo->mem->access_virt_sarray)
     ((j_common_ptr) cinfo, source->image, source->row, (JDIMENSION) 1, FALSE);
 
   for (col = cinfo->image_width; col > 0; col--) {
@@ -254,8 +257,7 @@
   rle_row = source->rle_row;
 
   /* Read the RLE data into our virtual array.
-   * We assume here that (a) rle_pixel is represented the same as JSAMPLE,
-   * and (b) we are not on a machine where FAR pointers differ from regular.
+   * We assume here that rle_pixel is represented the same as JSAMPLE.
    */
   RLE_CLR_BIT(source->header, RLE_ALPHA); /* don't read the alpha channel */
 
@@ -287,7 +289,7 @@
   case MAPPEDGRAY:
   case TRUECOLOR:
     for (row = 0; row < cinfo->image_height; row++) {
-      scanline = * (*cinfo->mem->access_virt_sarray)
+      scanline = *(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr) cinfo, source->image, row, (JDIMENSION) 1, TRUE);
       rle_row = source->rle_row;
       rle_getrow(&source->header, rle_row);
@@ -310,7 +312,7 @@
 
   case DIRECTCOLOR:
     for (row = 0; row < cinfo->image_height; row++) {
-      scanline = * (*cinfo->mem->access_virt_sarray)
+      scanline = *(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr) cinfo, source->image, row, (JDIMENSION) 1, TRUE);
       rle_getrow(&source->header, rle_row);
 
@@ -348,7 +350,7 @@
   source->row = cinfo->image_height;
 
   /* And fetch the topmost (bottommost) row */
-  return (*source->pub.get_pixel_rows) (cinfo, sinfo);   
+  return (*source->pub.get_pixel_rows) (cinfo, sinfo);
 }
 
 
@@ -375,7 +377,7 @@
   /* Create module interface object */
   source = (rle_source_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  SIZEOF(rle_source_struct));
+                                  sizeof(rle_source_struct));
   /* Fill in method ptrs */
   source->pub.start_input = start_input_rle;
   source->pub.finish_input = finish_input_rle;
diff --git a/rdswitch.c b/rdswitch.c
index fc0727a..7d870c3 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -5,28 +5,29 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to process some of cjpeg's more complicated
  * command-line switches.  Switches processed here are:
- *	-qtables file		Read quantization tables from text file
- *	-scans file		Read scan script from text file
- *	-quality N[,N,...]	Set quality ratings
- *	-qslots N[,N,...]	Set component quantization table selectors
- *	-sample HxV[,HxV,...]	Set component sampling factors
+ *      -qtables file           Read quantization tables from text file
+ *      -scans file             Read scan script from text file
+ *      -quality N[,N,...]      Set quality ratings
+ *      -qslots N[,N,...]       Set component quantization table selectors
+ *      -sample HxV[,HxV,...]   Set component sampling factors
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include <ctype.h>		/* to declare isdigit(), isspace() */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include <ctype.h>              /* to declare isdigit(), isspace() */
 
 
 LOCAL(int)
-text_getc (FILE * file)
+text_getc (FILE *file)
 /* Read next char, skipping over any comments (# to end of line) */
 /* A comment/newline sequence is returned as a newline */
 {
   register int ch;
-  
+
   ch = getc(file);
   if (ch == '#') {
     do {
@@ -38,13 +39,13 @@
 
 
 LOCAL(boolean)
-read_text_integer (FILE * file, long * result, int * termchar)
+read_text_integer (FILE *file, long *result, int *termchar)
 /* Read an unsigned decimal integer from a file, store it in result */
 /* Reads one trailing character after the integer; returns it in termchar */
 {
   register int ch;
   register long val;
-  
+
   /* Skip any leading whitespace, detect EOF */
   do {
     ch = text_getc(file);
@@ -53,7 +54,7 @@
       return FALSE;
     }
   } while (isspace(ch));
-  
+
   if (! isdigit(ch)) {
     *termchar = ch;
     return FALSE;
@@ -77,7 +78,8 @@
 #endif
 
 GLOBAL(boolean)
-read_quant_tables (j_compress_ptr cinfo, char * filename, boolean force_baseline)
+read_quant_tables (j_compress_ptr cinfo, char *filename,
+                   boolean force_baseline)
 /* Read a set of quantization tables from the specified file.
  * The file is plain ASCII text: decimal numbers with whitespace between.
  * Comments preceded by '#' may be included in the file.
@@ -88,7 +90,7 @@
  * You must use -qslots if you want a different component->table mapping.
  */
 {
-  FILE * fp;
+  FILE *fp;
   int tblno, i, termchar;
   long val;
   unsigned int table[DCTSIZE2];
@@ -108,15 +110,15 @@
     table[0] = (unsigned int) val;
     for (i = 1; i < DCTSIZE2; i++) {
       if (! read_text_integer(fp, &val, &termchar)) {
-	fprintf(stderr, "Invalid table data in file %s\n", filename);
-	fclose(fp);
-	return FALSE;
+        fprintf(stderr, "Invalid table data in file %s\n", filename);
+        fclose(fp);
+        return FALSE;
       }
       table[i] = (unsigned int) val;
     }
 #if JPEG_LIB_VERSION >= 70
     jpeg_add_quant_table(cinfo, tblno, table, cinfo->q_scale_factor[tblno],
-			 force_baseline);
+                         force_baseline);
 #else
     jpeg_add_quant_table(cinfo, tblno, table, q_scale_factor[tblno],
                          force_baseline);
@@ -138,7 +140,7 @@
 #ifdef C_MULTISCAN_FILES_SUPPORTED
 
 LOCAL(boolean)
-read_scan_integer (FILE * file, long * result, int * termchar)
+read_scan_integer (FILE *file, long *result, int *termchar)
 /* Variant of read_text_integer that always looks for a non-space termchar;
  * this simplifies parsing of punctuation in scan scripts.
  */
@@ -150,7 +152,7 @@
   ch = *termchar;
   while (ch != EOF && isspace(ch))
     ch = text_getc(file);
-  if (isdigit(ch)) {		/* oops, put it back */
+  if (isdigit(ch)) {            /* oops, put it back */
     if (ungetc(ch, file) == EOF)
       return FALSE;
     ch = ' ';
@@ -167,7 +169,7 @@
 
 
 GLOBAL(boolean)
-read_scan_script (j_compress_ptr cinfo, char * filename)
+read_scan_script (j_compress_ptr cinfo, char *filename)
 /* Read a scan script from the specified text file.
  * Each entry in the file defines one scan to be emitted.
  * Entries are separated by semicolons ';'.
@@ -184,11 +186,11 @@
  * jcmaster.c will validate the script parameters.
  */
 {
-  FILE * fp;
+  FILE *fp;
   int scanno, ncomps, termchar;
   long val;
-  jpeg_scan_info * scanptr;
-#define MAX_SCANS  100		/* quite arbitrary limit */
+  jpeg_scan_info *scanptr;
+#define MAX_SCANS  100          /* quite arbitrary limit */
   jpeg_scan_info scans[MAX_SCANS];
 
   if ((fp = fopen(filename, "r")) == NULL) {
@@ -208,29 +210,29 @@
     ncomps = 1;
     while (termchar == ' ') {
       if (ncomps >= MAX_COMPS_IN_SCAN) {
-	fprintf(stderr, "Too many components in one scan in file %s\n",
-		filename);
-	fclose(fp);
-	return FALSE;
+        fprintf(stderr, "Too many components in one scan in file %s\n",
+                filename);
+        fclose(fp);
+        return FALSE;
       }
       if (! read_scan_integer(fp, &val, &termchar))
-	goto bogus;
+        goto bogus;
       scanptr->component_index[ncomps] = (int) val;
       ncomps++;
     }
     scanptr->comps_in_scan = ncomps;
     if (termchar == ':') {
       if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
-	goto bogus;
+        goto bogus;
       scanptr->Ss = (int) val;
       if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
-	goto bogus;
+        goto bogus;
       scanptr->Se = (int) val;
       if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
-	goto bogus;
+        goto bogus;
       scanptr->Ah = (int) val;
       if (! read_scan_integer(fp, &val, &termchar))
-	goto bogus;
+        goto bogus;
       scanptr->Al = (int) val;
     } else {
       /* set non-progressive parameters */
@@ -261,8 +263,8 @@
      */
     scanptr = (jpeg_scan_info *)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  scanno * SIZEOF(jpeg_scan_info));
-    MEMCOPY(scanptr, scans, scanno * SIZEOF(jpeg_scan_info));
+                                  scanno * sizeof(jpeg_scan_info));
+    MEMCOPY(scanptr, scans, scanno * sizeof(jpeg_scan_info));
     cinfo->scan_info = scanptr;
     cinfo->num_scans = scanno;
   }
@@ -305,9 +307,9 @@
 jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
 {
   jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-		       q_scale_factor[0], force_baseline);
+                       q_scale_factor[0], force_baseline);
   jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-		       q_scale_factor[1], force_baseline);
+                       q_scale_factor[1], force_baseline);
 }
 #endif
 
@@ -319,17 +321,17 @@
  * If there are more q-table slots than parameters, the last value is replicated.
  */
 {
-  int val = 75;			/* default value */
+  int val = 75;                 /* default value */
   int tblno;
   char ch;
 
   for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
     if (*arg) {
-      ch = ',';			/* if not set by sscanf, will be ',' */
+      ch = ',';                 /* if not set by sscanf, will be ',' */
       if (sscanf(arg, "%d%c", &val, &ch) < 1)
-	return FALSE;
-      if (ch != ',')		/* syntax check */
-	return FALSE;
+        return FALSE;
+      if (ch != ',')            /* syntax check */
+        return FALSE;
       /* Convert user 0-100 rating to percentage scaling */
 #if JPEG_LIB_VERSION >= 70
       cinfo->q_scale_factor[tblno] = jpeg_quality_scaling(val);
@@ -337,7 +339,7 @@
       q_scale_factor[tblno] = jpeg_quality_scaling(val);
 #endif
       while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-	;
+        ;
     } else {
       /* reached end of parameter, set remaining factors to last value */
 #if JPEG_LIB_VERSION >= 70
@@ -359,25 +361,25 @@
  * If there are more components than parameters, the last value is replicated.
  */
 {
-  int val = 0;			/* default table # */
+  int val = 0;                  /* default table # */
   int ci;
   char ch;
 
   for (ci = 0; ci < MAX_COMPONENTS; ci++) {
     if (*arg) {
-      ch = ',';			/* if not set by sscanf, will be ',' */
+      ch = ',';                 /* if not set by sscanf, will be ',' */
       if (sscanf(arg, "%d%c", &val, &ch) < 1)
-	return FALSE;
-      if (ch != ',')		/* syntax check */
-	return FALSE;
+        return FALSE;
+      if (ch != ',')            /* syntax check */
+        return FALSE;
       if (val < 0 || val >= NUM_QUANT_TBLS) {
-	fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n",
-		NUM_QUANT_TBLS-1);
-	return FALSE;
+        fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n",
+                NUM_QUANT_TBLS-1);
+        return FALSE;
       }
       cinfo->comp_info[ci].quant_tbl_no = val;
       while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-	;
+        ;
     } else {
       /* reached end of parameter, set remaining components to last table */
       cinfo->comp_info[ci].quant_tbl_no = val;
@@ -399,19 +401,19 @@
 
   for (ci = 0; ci < MAX_COMPONENTS; ci++) {
     if (*arg) {
-      ch2 = ',';		/* if not set by sscanf, will be ',' */
+      ch2 = ',';                /* if not set by sscanf, will be ',' */
       if (sscanf(arg, "%d%c%d%c", &val1, &ch1, &val2, &ch2) < 3)
-	return FALSE;
+        return FALSE;
       if ((ch1 != 'x' && ch1 != 'X') || ch2 != ',') /* syntax check */
-	return FALSE;
+        return FALSE;
       if (val1 <= 0 || val1 > 4 || val2 <= 0 || val2 > 4) {
-	fprintf(stderr, "JPEG sampling factors must be 1..4\n");
-	return FALSE;
+        fprintf(stderr, "JPEG sampling factors must be 1..4\n");
+        return FALSE;
       }
       cinfo->comp_info[ci].h_samp_factor = val1;
       cinfo->comp_info[ci].v_samp_factor = val2;
       while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-	;
+        ;
     } else {
       /* reached end of parameter, set remaining components to 1x1 sampling */
       cinfo->comp_info[ci].h_samp_factor = 1;
diff --git a/rdtarga.c b/rdtarga.c
index 4c2cd26..b9bbd07 100644
--- a/rdtarga.c
+++ b/rdtarga.c
@@ -1,9 +1,12 @@
 /*
  * rdtarga.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to read input images in Targa format.
  *
@@ -17,7 +20,7 @@
  * Based on code contributed by Lee Daniel Crocker.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef TARGA_SUPPORTED
 
@@ -26,50 +29,49 @@
 
 #ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
-#define UCH(x)	((int) (x))
+#define UCH(x)  ((int) (x))
 #else /* !HAVE_UNSIGNED_CHAR */
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 typedef char U_CHAR;
-#define UCH(x)	((int) (x))
+#define UCH(x)  ((int) (x))
 #else
 typedef char U_CHAR;
-#define UCH(x)	((int) (x) & 0xFF)
+#define UCH(x)  ((int) (x) & 0xFF)
 #endif
 #endif /* HAVE_UNSIGNED_CHAR */
 
 
-#define	ReadOK(file,buffer,len)	(JFREAD(file,buffer,len) == ((size_t) (len)))
+#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len)))
 
 
 /* Private version of data source object */
 
-typedef struct _tga_source_struct * tga_source_ptr;
+typedef struct _tga_source_struct *tga_source_ptr;
 
 typedef struct _tga_source_struct {
   struct cjpeg_source_struct pub; /* public fields */
 
-  j_compress_ptr cinfo;		/* back link saves passing separate parm */
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
 
-  JSAMPARRAY colormap;		/* Targa colormap (converted to my format) */
+  JSAMPARRAY colormap;          /* Targa colormap (converted to my format) */
 
-  jvirt_sarray_ptr whole_image;	/* Needed if funny input row order */
-  JDIMENSION current_row;	/* Current logical row number to read */
+  jvirt_sarray_ptr whole_image; /* Needed if funny input row order */
+  JDIMENSION current_row;       /* Current logical row number to read */
 
   /* Pointer to routine to extract next Targa pixel from input file */
-  JMETHOD(void, read_pixel, (tga_source_ptr sinfo));
+  void (*read_pixel) (tga_source_ptr sinfo);
 
   /* Result of read_pixel is delivered here: */
   U_CHAR tga_pixel[4];
 
-  int pixel_size;		/* Bytes per Targa pixel (1 to 4) */
+  int pixel_size;               /* Bytes per Targa pixel (1 to 4) */
 
   /* State info for reading RLE-coded pixels; both counts must be init to 0 */
-  int block_count;		/* # of pixels remaining in RLE block */
-  int dup_pixel_count;		/* # of times to duplicate previous pixel */
+  int block_count;              /* # of pixels remaining in RLE block */
+  int dup_pixel_count;          /* # of times to duplicate previous pixel */
 
   /* This saves the correct pixel-row-expansion method for preload_image */
-  JMETHOD(JDIMENSION, get_pixel_rows, (j_compress_ptr cinfo,
-				       cjpeg_source_ptr sinfo));
+  JDIMENSION (*get_pixel_rows) (j_compress_ptr cinfo, cjpeg_source_ptr sinfo);
 } tga_source_struct;
 
 
@@ -148,9 +150,9 @@
   /* Time to read RLE block header? */
   if (--sinfo->block_count < 0) { /* decrement pixels remaining in block */
     i = read_byte(sinfo);
-    if (i & 0x80) {		/* Start of duplicate-pixel block? */
+    if (i & 0x80) {             /* Start of duplicate-pixel block? */
       sinfo->dup_pixel_count = i & 0x7F; /* number of dups after this one */
-      sinfo->block_count = 0;	/* then read new block header */
+      sinfo->block_count = 0;   /* then read new block header */
     } else {
       sinfo->block_count = i & 0x7F; /* number of pixels after this one */
     }
@@ -177,7 +179,7 @@
   tga_source_ptr source = (tga_source_ptr) sinfo;
   register JSAMPROW ptr;
   register JDIMENSION col;
-  
+
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
     (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
@@ -215,7 +217,7 @@
   register int t;
   register JSAMPROW ptr;
   register JDIMENSION col;
-  
+
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
     (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
@@ -242,7 +244,7 @@
   tga_source_ptr source = (tga_source_ptr) sinfo;
   register JSAMPROW ptr;
   register JDIMENSION col;
-  
+
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
     (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
@@ -338,8 +340,8 @@
   unsigned int width, height, maplen;
   boolean is_bottom_up;
 
-#define GET_2B(offset)	((unsigned int) UCH(targaheader[offset]) + \
-			 (((unsigned int) UCH(targaheader[offset+1])) << 8))
+#define GET_2B(offset)  ((unsigned int) UCH(targaheader[offset]) + \
+                         (((unsigned int) UCH(targaheader[offset+1])) << 8))
 
   if (! ReadOK(source->pub.input_file, targaheader, 18))
     ERREXIT(cinfo, JERR_INPUT_EOF);
@@ -355,17 +357,18 @@
   width = GET_2B(12);
   height = GET_2B(14);
   source->pixel_size = UCH(targaheader[16]) >> 3;
-  flags = UCH(targaheader[17]);	/* Image Descriptor byte */
+  flags = UCH(targaheader[17]); /* Image Descriptor byte */
 
-  is_bottom_up = ((flags & 0x20) == 0);	/* bit 5 set => top-down */
-  interlace_type = flags >> 6;	/* bits 6/7 are interlace code */
+  is_bottom_up = ((flags & 0x20) == 0); /* bit 5 set => top-down */
+  interlace_type = flags >> 6;  /* bits 6/7 are interlace code */
 
-  if (cmaptype > 1 ||		/* cmaptype must be 0 or 1 */
+  if (cmaptype > 1 ||           /* cmaptype must be 0 or 1 */
       source->pixel_size < 1 || source->pixel_size > 4 ||
       (UCH(targaheader[16]) & 7) != 0 || /* bits/pixel must be multiple of 8 */
-      interlace_type != 0)	/* currently don't allow interlaced image */
+      interlace_type != 0 ||      /* currently don't allow interlaced image */
+      width == 0 || height == 0)  /* image width/height must be non-zero */
     ERREXIT(cinfo, JERR_TGA_BADPARMS);
-  
+
   if (subtype > 8) {
     /* It's an RLE-coded file */
     source->read_pixel = read_rle_pixel;
@@ -377,18 +380,18 @@
   }
 
   /* Now should have subtype 1, 2, or 3 */
-  components = 3;		/* until proven different */
+  components = 3;               /* until proven different */
   cinfo->in_color_space = JCS_RGB;
 
   switch (subtype) {
-  case 1:			/* Colormapped image */
+  case 1:                       /* Colormapped image */
     if (source->pixel_size == 1 && cmaptype == 1)
       source->get_pixel_rows = get_8bit_row;
     else
       ERREXIT(cinfo, JERR_TGA_BADPARMS);
     TRACEMS2(cinfo, 1, JTRC_TGA_MAPPED, width, height);
     break;
-  case 2:			/* RGB image */
+  case 2:                       /* RGB image */
     switch (source->pixel_size) {
     case 2:
       source->get_pixel_rows = get_16bit_row;
@@ -405,7 +408,7 @@
     }
     TRACEMS2(cinfo, 1, JTRC_TGA, width, height);
     break;
-  case 3:			/* Grayscale image */
+  case 3:                       /* Grayscale image */
     components = 1;
     cinfo->in_color_space = JCS_GRAYSCALE;
     if (source->pixel_size == 1)
@@ -440,8 +443,8 @@
     source->pub.buffer_height = 1;
     source->pub.get_pixel_rows = source->get_pixel_rows;
   }
-  
-  while (idlen--)		/* Throw away ID field */
+
+  while (idlen--)               /* Throw away ID field */
     (void) read_byte(source);
 
   if (maplen > 0) {
@@ -453,7 +456,7 @@
     /* and read it from the file */
     read_colormap(source, (int) maplen, UCH(targaheader[7]));
   } else {
-    if (cmaptype)		/* but you promised a cmap! */
+    if (cmaptype)               /* but you promised a cmap! */
       ERREXIT(cinfo, JERR_TGA_BADPARMS);
     source->colormap = NULL;
   }
@@ -488,8 +491,8 @@
   /* Create module interface object */
   source = (tga_source_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(tga_source_struct));
-  source->cinfo = cinfo;	/* make back link for subroutines */
+                                  sizeof(tga_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_tga;
   source->pub.finish_input = finish_input_tga;
diff --git a/rrtimer.h b/rrtimer.h
deleted file mode 100644
index 4db5e37..0000000
--- a/rrtimer.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#ifndef __RRTIMER_H__
-#define __RRTIMER_H__
-
-#ifdef __cplusplus
-
-#ifdef _WIN32
-#include <windows.h>
-#else
-#include <sys/time.h>
-#endif
-
-class rrtimer
-{
-	public:
-
-		rrtimer(void) : t1(0.0)
-		{
-			#ifdef _WIN32
-			highres=false;  tick=0.001;
-			LARGE_INTEGER Frequency;
-			if(QueryPerformanceFrequency(&Frequency)!=0)
-			{
-				tick=(double)1.0/(double)(Frequency.QuadPart);
-				highres=true;
-			}
-			#endif
-		}
-
-		void start(void)
-		{
-			t1=time();
-		}
-
-		double time(void)
-		{
-			#ifdef _WIN32
-			if(highres)
-			{
-				LARGE_INTEGER Time;
-				QueryPerformanceCounter(&Time);
-				return((double)(Time.QuadPart)*tick);
-			}
-			else
-				return((double)GetTickCount()*tick);
-			#else
-			struct timeval __tv;
-			gettimeofday(&__tv, (struct timezone *)NULL);
-			return((double)(__tv.tv_sec)+(double)(__tv.tv_usec)*0.000001);
-			#endif
-		}
-
-		double elapsed(void)
-		{
-			return time()-t1;
-		}
-
-	private:
-
-		#ifdef _WIN32
-		bool highres;  double tick;
-		#endif
-		double t1;
-};
-
-#endif  // __cplusplus
-
-#ifdef _WIN32
-
-#include <windows.h>
-
-__inline double rrtime(void)
-{
-	LARGE_INTEGER Frequency, Time;
-	if(QueryPerformanceFrequency(&Frequency)!=0)
-	{
-		QueryPerformanceCounter(&Time);
-		return (double)Time.QuadPart/(double)Frequency.QuadPart;
-	}
-	else return (double)GetTickCount()*0.001;
-}
-
-#else
-
-#include <sys/time.h>
-
-#ifdef sun
-#define __inline inline
-#endif
-
-static __inline double rrtime(void)
-{
-	struct timeval __tv;
-	gettimeofday(&__tv, (struct timezone *)NULL);
-	return((double)__tv.tv_sec+(double)__tv.tv_usec*0.000001);
-}
-
-#endif
-
-#endif
-
diff --git a/rrutil.h b/rrutil.h
deleted file mode 100644
index 4b61dbf..0000000
--- a/rrutil.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- * Copyright (C)2010 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-#ifndef __RRUTIL_H__
-#define __RRUTIL_H__
-
-#ifdef _WIN32
-	#include <windows.h>
-	#define sleep(t) Sleep((t)*1000)
-	#define usleep(t) Sleep((t)/1000)
-#else
-	#include <unistd.h>
-	#define stricmp strcasecmp
-	#define strnicmp strncasecmp
-#endif
-
-#ifndef min
- #define min(a,b) ((a)<(b)?(a):(b))
-#endif
-
-#ifndef max
- #define max(a,b) ((a)>(b)?(a):(b))
-#endif
-
-#define pow2(i) (1<<(i))
-#define isPow2(x) (((x)&(x-1))==0)
-
-#ifdef sgi
-#define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
-#endif
-
-#ifdef sun
-#define __inline inline
-#endif
-
-static __inline int numprocs(void)
-{
-	#ifdef _WIN32
-	DWORD_PTR ProcAff, SysAff, i;  int count=0;
-	if(!GetProcessAffinityMask(GetCurrentProcess(), &ProcAff, &SysAff)) return(1);
-	for(i=0; i<sizeof(long*)*8; i++) if(ProcAff&(1LL<<i)) count++;
-	return(count);
-	#elif defined (__APPLE__)
-	return(1);
-	#else
-	long count=1;
-	if((count=sysconf(_SC_NPROCESSORS_CONF))!=-1) return((int)count);
-	else return(1);
-	#endif
-}
-
-#define byteswap(i) ( \
-	(((i) & 0xff000000) >> 24) | \
-	(((i) & 0x00ff0000) >>  8) | \
-	(((i) & 0x0000ff00) <<  8) | \
-	(((i) & 0x000000ff) << 24) )
-
-#define byteswap16(i) ( \
-	(((i) & 0xff00) >> 8) | \
-	(((i) & 0x00ff) << 8) )
-
-static __inline int littleendian(void)
-{
-	unsigned int value=1;
-	unsigned char *ptr=(unsigned char *)(&value);
-	if(ptr[0]==1 && ptr[3]==0) return 1;
-	else return 0;
-}
-
-#endif
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
new file mode 100644
index 0000000..37938ec
--- /dev/null
+++ b/simd/CMakeLists.txt
@@ -0,0 +1,80 @@
+if(NOT DEFINED NASM)
+  set(NASM nasm CACHE FILEPATH "Path to NASM/YASM executable")
+endif()
+
+if(SIMD_X86_64)
+  set(NAFLAGS -fwin64 -DWIN64 -D__x86_64__)
+else()
+  if(BORLAND)
+    set(NAFLAGS -fobj -DOBJ32)
+  else()
+    set(NAFLAGS -fwin32 -DWIN32)
+  endif()
+endif()
+set(NAFLAGS ${NAFLAGS} -I${CMAKE_SOURCE_DIR}/win/ -I${CMAKE_CURRENT_SOURCE_DIR}/)
+
+# This only works if building from the command line.  There is currently no way
+# to set a variable's value based on the build type when using the MSVC IDE.
+if(CMAKE_BUILD_TYPE STREQUAL "Debug"
+  OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+  set(NAFLAGS ${NAFLAGS} -g)
+endif()
+
+if(SIMD_X86_64)
+  set(SIMD_BASENAMES jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64
+    jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
+    jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
+    jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
+    jquanti-sse2-64)
+  message(STATUS "Building x86_64 SIMD extensions")
+else()
+  set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
+    jcgray-mmx jcsample-mmx jdcolor-mmx jdmerge-mmx jdsample-mmx jfdctfst-mmx
+    jfdctint-mmx jidctfst-mmx jidctint-mmx jidctred-mmx jquant-mmx jfdctflt-sse
+    jidctflt-sse jquant-sse jccolor-sse2 jcgray-sse2 jchuff-sse2 jcsample-sse2
+    jdcolor-sse2 jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2
+    jidctflt-sse2 jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2
+    jquanti-sse2)
+  message(STATUS "Building i386 SIMD extensions")
+endif()
+
+if(MSVC_IDE)
+  set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
+else()
+  set(OBJDIR ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+file(GLOB INC_FILES *.inc)
+
+foreach(file ${SIMD_BASENAMES})  # emit one assembler rule per SIMD source basename
+  set(DEPFILE "")  # reset so the previous iteration's extra dependency does not carry over
+  set(SIMD_SRC ${CMAKE_CURRENT_SOURCE_DIR}/${file}.asm)
+  if(${file} MATCHES jccolor)  # jccolor-* gets the matching jccolext-*.asm as an extra dependency
+    set(DEPFILE ${file})
+    string(REGEX REPLACE "jccolor" "jccolext" DEPFILE ${DEPFILE})
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  if(${file} MATCHES jcgray)  # jcgray-* gets the matching jcgryext-*.asm as an extra dependency
+    set(DEPFILE ${file})
+    string(REGEX REPLACE "jcgray" "jcgryext" DEPFILE ${DEPFILE})
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  if(${file} MATCHES jdcolor)  # jdcolor-* gets the matching jdcolext-*.asm as an extra dependency
+    set(DEPFILE ${file})
+    string(REGEX REPLACE "jdcolor" "jdcolext" DEPFILE ${DEPFILE})
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  if(${file} MATCHES jdmerge)  # jdmerge-* gets the matching jdmrgext-*.asm as an extra dependency
+    set(DEPFILE ${file})
+    string(REGEX REPLACE "jdmerge" "jdmrgext" DEPFILE ${DEPFILE})
+    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+  endif()
+  set(SIMD_OBJ ${OBJDIR}/${file}.obj)  # object file path under OBJDIR
+  add_custom_command(OUTPUT ${SIMD_OBJ}
+    DEPENDS ${SIMD_SRC} ${DEPFILE} ${INC_FILES}  # every globbed *.inc file is a dependency of every object
+    COMMAND ${NASM} ${NAFLAGS} ${SIMD_SRC} -o${SIMD_OBJ})
+  set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})  # accumulate the list consumed after the loop
+endforeach()
+
+set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
+add_custom_target(simd DEPENDS ${SIMD_OBJS})
diff --git a/simd/Makefile.am b/simd/Makefile.am
new file mode 100644
index 0000000..fad6c8c
--- /dev/null
+++ b/simd/Makefile.am
@@ -0,0 +1,97 @@
+noinst_LTLIBRARIES = libsimd.la
+
+BUILT_SOURCES = jsimdcfg.inc
+
+EXTRA_DIST = nasm_lt.sh CMakeLists.txt \
+	jccolext-mmx.asm   jcgryext-mmx.asm   jdcolext-mmx.asm   jdmrgext-mmx.asm \
+	jccolext-sse2.asm  jcgryext-sse2.asm  jdcolext-sse2.asm  jdmrgext-sse2.asm \
+	jccolext-sse2-64.asm  jcgryext-sse2-64.asm  jdcolext-sse2-64.asm \
+	jdmrgext-sse2-64.asm  jccolext-altivec.c    jcgryext-altivec.c \
+	jdcolext-altivec.c    jdmrgext-altivec.c
+
+if SIMD_X86_64
+
+libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
+	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jfdctflt-sse-64.asm \
+	jccolor-sse2-64.asm   jcgray-sse2-64.asm    jchuff-sse2-64.asm \
+	jcsample-sse2-64.asm  jdcolor-sse2-64.asm   jdmerge-sse2-64.asm \
+	jdsample-sse2-64.asm  jfdctfst-sse2-64.asm  jfdctint-sse2-64.asm \
+	jidctflt-sse2-64.asm  jidctfst-sse2-64.asm  jidctint-sse2-64.asm \
+	jidctred-sse2-64.asm  jquantf-sse2-64.asm   jquanti-sse2-64.asm
+
+jccolor-sse2-64.lo:  jccolext-sse2-64.asm
+jcgray-sse2-64.lo:   jcgryext-sse2-64.asm
+jdcolor-sse2-64.lo:  jdcolext-sse2-64.asm
+jdmerge-sse2-64.lo:  jdmrgext-sse2-64.asm
+
+endif
+
+if SIMD_I386
+
+libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
+	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jsimdcpu.asm \
+	jfdctflt-3dn.asm   jidctflt-3dn.asm   jquant-3dn.asm \
+	jccolor-mmx.asm    jcgray-mmx.asm     jcsample-mmx.asm \
+	jdcolor-mmx.asm    jdmerge-mmx.asm    jdsample-mmx.asm \
+	jfdctfst-mmx.asm   jfdctint-mmx.asm   jidctfst-mmx.asm \
+	jidctint-mmx.asm   jidctred-mmx.asm   jquant-mmx.asm \
+	jfdctflt-sse.asm   jidctflt-sse.asm   jquant-sse.asm \
+	jccolor-sse2.asm   jcgray-sse2.asm    jchuff-sse2.asm \
+	jcsample-sse2.asm  jdcolor-sse2.asm   jdmerge-sse2.asm \
+	jdsample-sse2.asm  jfdctfst-sse2.asm  jfdctint-sse2.asm \
+	jidctflt-sse2.asm  jidctfst-sse2.asm  jidctint-sse2.asm \
+	jidctred-sse2.asm  jquantf-sse2.asm   jquanti-sse2.asm
+
+jccolor-mmx.lo:   jccolext-mmx.asm
+jcgray-mmx.lo:    jcgryext-mmx.asm
+jdcolor-mmx.lo:   jdcolext-mmx.asm
+jdmerge-mmx.lo:   jdmrgext-mmx.asm
+jccolor-sse2.lo:  jccolext-sse2.asm
+jcgray-sse2.lo:   jcgryext-sse2.asm
+jdcolor-sse2.lo:  jdcolext-sse2.asm
+jdmerge-sse2.lo:  jdmrgext-sse2.asm
+
+endif
+
+if SIMD_ARM
+
+libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
+
+endif
+
+if SIMD_ARM_64
+
+libsimd_la_SOURCES = jsimd_arm64.c jsimd_arm64_neon.S
+
+endif
+
+if SIMD_MIPS
+
+libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S
+
+endif
+
+if SIMD_POWERPC
+
+libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h \
+	jccolor-altivec.c     jcgray-altivec.c      jcsample-altivec.c \
+	jdcolor-altivec.c     jdmerge-altivec.c     jdsample-altivec.c \
+	jfdctfst-altivec.c    jfdctint-altivec.c \
+	jidctfst-altivec.c    jidctint-altivec.c \
+	jquanti-altivec.c
+libsimd_la_CFLAGS = -maltivec
+
+jccolor-altivec.lo:  jccolext-altivec.c
+jcgray-altivec.lo:   jcgryext-altivec.c
+jdcolor-altivec.lo:  jdcolext-altivec.c
+jdmerge-altivec.lo:  jdmrgext-altivec.c
+
+endif
+
+AM_CPPFLAGS = -I$(top_srcdir)
+
+.asm.lo:
+	$(AM_V_GEN) $(LIBTOOL) $(AM_V_lt) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(AM_V_lt) $(NASM) $(NAFLAGS) -I$(srcdir) -I. $< -o $@
+
+jsimdcfg.inc: $(srcdir)/jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h
+	$(AM_V_GEN) $(CPP) -I$(top_builddir) -I$(top_builddir)/simd $(srcdir)/jsimdcfg.inc.h | $(EGREP) "^[\;%]|^\ %" | sed 's%_cpp_protection_%%' | sed 's@% define@%define@g' > $@
diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm
deleted file mode 100644
index 7c93401..0000000
--- a/simd/jcclrmmx.asm
+++ /dev/null
@@ -1,477 +0,0 @@
-;
-; jcclrmmx.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
-;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                           JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		8
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE
-
-EXTN(jsimd_rgb_ycc_convert_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	pushpic	eax
-	push	edx
-	push	ebx
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	mov	ebx, JSAMPROW [ebx]	; outptr1
-	mov	edx, JSAMPROW [edx]	; outptr2
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	short .columnloop
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	eax
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	xor	eax,eax
-	mov	al, BYTE [esi+ecx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	xor	edx,edx
-	mov	dx, WORD [esi+ecx]
-	shl	eax, WORD_BIT
-	or	eax,edx
-.column_ld4:
-	movd	mmA,eax
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	mmG, DWORD [esi+ecx]
-	psllq	mmA, DWORD_BIT
-	por	mmA,mmG
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	movq	mmG,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	mov	ecx, SIZEOF_MMWORD
-	jmp	short .rgb_ycc_cnv
-.column_ld16:
-	test	cl, 2*SIZEOF_MMWORD
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_ycc_cnv
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-	; mmA=(00 10 20 01 11 21 02 12)
-	; mmG=(22 03 13 23 04 14 24 05)
-	; mmF=(15 25 06 16 26 07 17 27)
-
-	movq      mmD,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
-	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
-
-	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
-	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
-
-	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
-	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
-
-	movq      mmE,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
-	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
-
-	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
-
-	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
-	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
-
-	pxor      mmH,mmH
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
-
-	movq      mmB,mmE
-	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
-	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
-
-	movq      mmF,mmD
-	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
-	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_MMWORD/8
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_MMWORD/8
-	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_MMWORD/4
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_MMWORD/4
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-	test	cl, SIZEOF_MMWORD/2
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_ycc_cnv
-	movq	mmD,mmA
-	movq	mmC,mmF
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-	; mmA=(00 10 20 30 01 11 21 31)
-	; mmF=(02 12 22 32 03 13 23 33)
-	; mmD=(04 14 24 34 05 15 25 35)
-	; mmC=(06 16 26 36 07 17 27 37)
-
-	movq      mmB,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
-	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
-
-	movq      mmG,mmD
-	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
-	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
-
-	movq      mmE,mmA
-	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
-
-	movq      mmH,mmB
-	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
-	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
-
-	pxor      mmF,mmF
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
-
-	movq      mmD,mmB
-	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
-	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
-
-	movq      mmG,mmE
-	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
-	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
-
-	punpcklbw mmF,mmH
-	punpckhbw mmH,mmH
-	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
-	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
-	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
-	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
-	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
-
-	movq      mm6,mm1
-	punpcklwd mm1,mm3
-	punpckhwd mm6,mm3
-	movq      mm7,mm1
-	movq      mm4,mm6
-	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	pxor      mm1,mm1
-	pxor      mm6,mm6
-	punpcklwd mm1,mm5		; mm1=BOL
-	punpckhwd mm6,mm5		; mm6=BOH
-	psrld     mm1,1			; mm1=BOL*FIX(0.500)
-	psrld     mm6,1			; mm6=BOH*FIX(0.500)
-
-	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
-
-	paddd     mm7,mm1
-	paddd     mm4,mm6
-	paddd     mm7,mm5
-	paddd     mm4,mm5
-	psrld     mm7,SCALEBITS		; mm7=CbOL
-	psrld     mm4,SCALEBITS		; mm4=CbOH
-	packssdw  mm7,mm4		; mm7=CbO
-
-	movq      mm1, MMWORD [wk(2)]	; mm1=BE
-
-	movq      mm6,mm0
-	punpcklwd mm0,mm2
-	punpckhwd mm6,mm2
-	movq      mm5,mm0
-	movq      mm4,mm6
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	pxor      mm0,mm0
-	pxor      mm6,mm6
-	punpcklwd mm0,mm1		; mm0=BEL
-	punpckhwd mm6,mm1		; mm6=BEH
-	psrld     mm0,1			; mm0=BEL*FIX(0.500)
-	psrld     mm6,1			; mm6=BEH*FIX(0.500)
-
-	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-	paddd     mm5,mm0
-	paddd     mm4,mm6
-	paddd     mm5,mm1
-	paddd     mm4,mm1
-	psrld     mm5,SCALEBITS		; mm5=CbEL
-	psrld     mm4,SCALEBITS		; mm4=CbEH
-	packssdw  mm5,mm4		; mm5=CbE
-
-	psllw     mm7,BYTE_BIT
-	por       mm5,mm7		; mm5=Cb
-	movq      MMWORD [ebx], mm5	; Save Cb
-
-	movq      mm0, MMWORD [wk(3)]	; mm0=BO
-	movq      mm6, MMWORD [wk(2)]	; mm6=BE
-	movq      mm1, MMWORD [wk(1)]	; mm1=RO
-
-	movq      mm4,mm0
-	punpcklwd mm0,mm3
-	punpckhwd mm4,mm3
-	movq      mm7,mm0
-	movq      mm5,mm4
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
-
-	paddd     mm0, MMWORD [wk(4)]
-	paddd     mm4, MMWORD [wk(5)]
-	paddd     mm0,mm3
-	paddd     mm4,mm3
-	psrld     mm0,SCALEBITS		; mm0=YOL
-	psrld     mm4,SCALEBITS		; mm4=YOH
-	packssdw  mm0,mm4		; mm0=YO
-
-	pxor      mm3,mm3
-	pxor      mm4,mm4
-	punpcklwd mm3,mm1		; mm3=ROL
-	punpckhwd mm4,mm1		; mm4=ROH
-	psrld     mm3,1			; mm3=ROL*FIX(0.500)
-	psrld     mm4,1			; mm4=ROH*FIX(0.500)
-
-	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-	paddd     mm7,mm3
-	paddd     mm5,mm4
-	paddd     mm7,mm1
-	paddd     mm5,mm1
-	psrld     mm7,SCALEBITS		; mm7=CrOL
-	psrld     mm5,SCALEBITS		; mm5=CrOH
-	packssdw  mm7,mm5		; mm7=CrO
-
-	movq      mm3, MMWORD [wk(0)]	; mm3=RE
-
-	movq      mm4,mm6
-	punpcklwd mm6,mm2
-	punpckhwd mm4,mm2
-	movq      mm1,mm6
-	movq      mm5,mm4
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
-
-	paddd     mm6, MMWORD [wk(6)]
-	paddd     mm4, MMWORD [wk(7)]
-	paddd     mm6,mm2
-	paddd     mm4,mm2
-	psrld     mm6,SCALEBITS		; mm6=YEL
-	psrld     mm4,SCALEBITS		; mm4=YEH
-	packssdw  mm6,mm4		; mm6=YE
-
-	psllw     mm0,BYTE_BIT
-	por       mm6,mm0		; mm6=Y
-	movq      MMWORD [edi], mm6	; Save Y
-
-	pxor      mm2,mm2
-	pxor      mm4,mm4
-	punpcklwd mm2,mm3		; mm2=REL
-	punpckhwd mm4,mm3		; mm4=REH
-	psrld     mm2,1			; mm2=REL*FIX(0.500)
-	psrld     mm4,1			; mm4=REH*FIX(0.500)
-
-	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
-
-	paddd     mm1,mm2
-	paddd     mm5,mm4
-	paddd     mm1,mm0
-	paddd     mm5,mm0
-	psrld     mm1,SCALEBITS		; mm1=CrEL
-	psrld     mm5,SCALEBITS		; mm5=CrEH
-	packssdw  mm1,mm5		; mm1=CrE
-
-	psllw     mm7,BYTE_BIT
-	por       mm1,mm7		; mm1=Cr
-	movq      MMWORD [edx], mm1	; Save Cr
-
-	sub	ecx, byte SIZEOF_MMWORD
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
-	add	edi, byte SIZEOF_MMWORD			; outptr0
-	add	ebx, byte SIZEOF_MMWORD			; outptr1
-	add	edx, byte SIZEOF_MMWORD			; outptr2
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .column_ld1
-
-	pop	ecx			; col
-	pop	esi
-	pop	edi
-	pop	ebx
-	pop	edx
-	poppic	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW
-	add	ebx, byte SIZEOF_JSAMPROW
-	add	edx, byte SIZEOF_JSAMPROW
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcclrss2-64.asm b/simd/jcclrss2-64.asm
deleted file mode 100644
index a076365..0000000
--- a/simd/jcclrss2-64.asm
+++ /dev/null
@@ -1,485 +0,0 @@
-;
-; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2009, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		8
-
-	align	16
-
-	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_ycc_convert_sse2):	; x86-64 SSE2 entry point: RGB rows -> Y/Cb/Cr planes
-	push	rbp
-	mov	rax,rsp				; rax = address of the saved rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax			; keep that address at the aligned frame base
-	mov	rbp,rsp				; rbp = aligned frame base
-	lea	rsp, [wk(0)]			; reserve the wk[WK_NUM] xmmword scratch area
-	collect_args				; macro from an include; presumably fills r10-r14 as listed above
-	push	rbx
-
-	mov	ecx, r10d			; ecx = img_width
-	test	rcx,rcx
-	jz	near .return			; zero-width image: nothing to convert
-
-	push	rcx				; park img_width while rcx holds output_row
-
-	mov rsi, r12				; rsi = output_buf
-	mov ecx, r13d				; ecx = output_row
-	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]	; plane 0 row array (receives Y)
-	mov	rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]	; plane 1 row array (receives Cb)
-	mov	rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]	; plane 2 row array (receives Cr)
-	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]	; step each row array to entry output_row
-	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
-	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
-	pop	rcx				; rcx = img_width again
-
-	mov rsi, r11				; rsi = input_buf
-	mov	eax, r14d			; eax = num_rows (writing eax zeroes the top half of rax)
-	test	rax,rax
-	jle	near .return			; NOTE(review): rax is zero-extended, so a negative num_rows is not rejected here
-.rowloop:				; one pass per row; rax = rows still to convert
-	push	rdx			; save the row-array cursors; they are popped after the column loop
-	push	rbx
-	push	rdi
-	push	rsi
-	push	rcx			; col
-
-	mov	rsi, JSAMPROW [rsi]	; inptr
-	mov	rdi, JSAMPROW [rdi]	; outptr0
-	mov	rbx, JSAMPROW [rbx]	; outptr1
-	mov	rdx, JSAMPROW [rdx]	; outptr2
-
-	cmp	rcx, byte SIZEOF_XMMWORD	; at least one full group of columns?
-	jae	near .columnloop		; yes; otherwise fall through to the partial load
-
-%if RGB_PIXELSIZE == 3 ; --------------- 3 bytes per input pixel
-
-.column_ld1:			; partial group: rcx = columns left, fewer than SIZEOF_XMMWORD
-	push	rax			; rax and rdx are used as scratch until .column_ld4
-	push	rdx
-	lea	rcx,[rcx+rcx*2]		; rcx = bytes left (imul ecx,RGB_PIXELSIZE)
-	test	cl, SIZEOF_BYTE		; each set bit of the byte count selects one tail load
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_BYTE
-	movzx	rax, BYTE [rsi+rcx]	; last byte of the row
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_WORD
-	movzx	rdx, WORD [rsi+rcx]
-	shl	rax, WORD_BIT		; bytes gathered so far move above the new word
-	or	rax,rdx
-.column_ld4:
-	movd	xmmA,eax		; continue accumulating in xmmA
-	pop	rdx
-	pop	rax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_DWORD	; make room below for the earlier bytes
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	rcx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA		; gathered tail becomes the second input vector
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; first vector comes straight from the row
-	mov	rcx, SIZEOF_XMMWORD	; count as a full group so the loop tail ends the row
-	jmp	short .rgb_ycc_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	rcx, SIZEOF_XMMWORD	; mov leaves the flags of the test above intact
-	jz	short .rgb_ycc_cnv	; whole remainder is already in xmmA
-	movdqa	xmmB,xmmA		; gathered tail becomes the third input vector
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-
-.columnloop:			; full group: three unaligned xmmword loads of packed pixels
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:			; byte tags below: first hex digit = component in the pixel, second = pixel index
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA	; pass 1 of 3: pair bytes that lie 8 pixels apart
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA	; pass 2 of 3: same pattern, now 4 pixels apart
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA	; pass 3 of 3: 2 pixels apart, leaving whole even/odd runs
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH	; zero register: unpacking against it widens bytes to words
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-%else ; RGB_PIXELSIZE == 4 ; ----------- 4 bytes per input pixel
-
-.column_ld1:			; partial group: rcx = pixels left, fewer than SIZEOF_XMMWORD
-	test	cl, SIZEOF_XMMWORD/16	; each set bit of the pixel count selects one tail load
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]	; last single pixel
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD	; pixel gathered so far moves above the new pair
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA		; gathered tail becomes the second input vector
-	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	rcx, SIZEOF_XMMWORD	; count as a full group; mov keeps the flags of the test
-	jz	short .rgb_ycc_cnv
-	movdqa	xmmF,xmmA		; what was gathered becomes the third and fourth vectors
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-
-.columnloop:			; full group: four unaligned xmmword loads of packed pixels
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:			; byte tags below: first hex digit = component in the pixel, second = pixel index
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA	; byte interleave: pair pixels 4 apart
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA	; word interleave: collect each component of every 4th pixel
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA	; byte interleave again: whole even-pixel and odd-pixel runs
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF	; zero register: unpacking against it widens bytes to words
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH	; xmmF is still zero: the 2x bytes land in the high byte of each word
-	punpckhbw xmmH,xmmH	; the 3x bytes are duplicated into both bytes of each word
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-	; (E/O = even/odd pixels, one word per sample; xmmA-xmmH above presumably alias xmm0-xmm7 via an include)
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
-	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
-	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
-	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3		; xmm1=(RO,GO) word pairs, low half
-	punpckhwd xmm6,xmm3		; xmm6=(RO,GO) word pairs, high half
-	movdqa    xmm7,xmm1		; copies of the pairs for the Cb coefficients
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-	pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	pxor      xmm1,xmm1		; zeroed so the unpack leaves B in the upper word of each dword
-	pxor      xmm6,xmm6
-	punpcklwd xmm1,xmm5		; xmm1=BOL
-	punpckhwd xmm6,xmm5		; xmm6=BOH
-	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
-
-	movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm1		; add the 0.5*B term
-	paddd     xmm4,xmm6
-	paddd     xmm7,xmm5		; add the constant (defined elsewhere; by its name rounding + CENTERJSAMPLE, confirm)
-	paddd     xmm4,xmm5
-	psrld     xmm7,SCALEBITS	; xmm7=CbOL
-	psrld     xmm4,SCALEBITS	; xmm4=CbOH
-	packssdw  xmm7,xmm4		; xmm7=CbO
-
-	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2		; xmm0=(RE,GE) word pairs, low half
-	punpckhwd xmm6,xmm2		; xmm6=(RE,GE) word pairs, high half
-	movdqa    xmm5,xmm0		; copies of the pairs for the Cb coefficients
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-	pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	pxor      xmm0,xmm0		; zeroed so the unpack leaves B in the upper word of each dword
-	pxor      xmm6,xmm6
-	punpcklwd xmm0,xmm1		; xmm0=BEL
-	punpckhwd xmm6,xmm1		; xmm6=BEH
-	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
-
-	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm5,xmm0		; add the 0.5*B term
-	paddd     xmm4,xmm6
-	paddd     xmm5,xmm1		; add the same constant as for the odd pixels
-	paddd     xmm4,xmm1
-	psrld     xmm5,SCALEBITS	; xmm5=CbEL
-	psrld     xmm4,SCALEBITS	; xmm4=CbEH
-	packssdw  xmm5,xmm4		; xmm5=CbE
-
-	psllw     xmm7,BYTE_BIT		; odd-pixel Cb moves to the high byte of each word
-	por       xmm5,xmm7		; xmm5=Cb
-	movdqa    XMMWORD [rbx], xmm5	; Save Cb
-
-	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
-	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3		; xmm0=(BO,GO) word pairs, low half
-	punpckhwd xmm4,xmm3		; xmm4=(BO,GO) word pairs, high half
-	movdqa    xmm7,xmm0		; copies of the pairs for the Cr coefficients
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-	pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, XMMWORD [wk(4)]	; add the R/G partial sums saved in wk(4)/wk(5)
-	paddd     xmm4, XMMWORD [wk(5)]
-	paddd     xmm0,xmm3		; add PD_ONEHALF before the shift
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	pxor      xmm3,xmm3		; zeroed so the unpack leaves R in the upper word of each dword
-	pxor      xmm4,xmm4
-	punpcklwd xmm3,xmm1		; xmm3=ROL
-	punpckhwd xmm4,xmm1		; xmm4=ROH
-	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
-
-	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm3		; add the 0.5*R term
-	paddd     xmm5,xmm4
-	paddd     xmm7,xmm1		; add the constant (defined elsewhere)
-	paddd     xmm5,xmm1
-	psrld     xmm7,SCALEBITS	; xmm7=CrOL
-	psrld     xmm5,SCALEBITS	; xmm5=CrOH
-	packssdw  xmm7,xmm5		; xmm7=CrO
-
-	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2		; xmm6=(BE,GE) word pairs, low half
-	punpckhwd xmm4,xmm2		; xmm4=(BE,GE) word pairs, high half
-	movdqa    xmm1,xmm6		; copies of the pairs for the Cr coefficients
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-	pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(6)]	; add the R/G partial sums saved in wk(6)/wk(7)
-	paddd     xmm4, XMMWORD [wk(7)]
-	paddd     xmm6,xmm2		; add PD_ONEHALF before the shift
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT		; odd-pixel Y moves to the high byte of each word
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [rdi], xmm6	; Save Y
-
-	pxor      xmm2,xmm2		; zeroed so the unpack leaves R in the upper word of each dword
-	pxor      xmm4,xmm4
-	punpcklwd xmm2,xmm3		; xmm2=REL
-	punpckhwd xmm4,xmm3		; xmm4=REH
-	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
-
-	movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm1,xmm2		; add the 0.5*R term
-	paddd     xmm5,xmm4
-	paddd     xmm1,xmm0		; add the constant (defined elsewhere)
-	paddd     xmm5,xmm0
-	psrld     xmm1,SCALEBITS	; xmm1=CrEL
-	psrld     xmm5,SCALEBITS	; xmm5=CrEH
-	packssdw  xmm1,xmm5		; xmm1=CrE
-
-	psllw     xmm7,BYTE_BIT		; odd-pixel Cr moves to the high byte of each word
-	por       xmm1,xmm7		; xmm1=Cr
-	movdqa    XMMWORD [rdx], xmm1	; Save Cr
-
-	sub	rcx, byte SIZEOF_XMMWORD	; one group of columns is done
-	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte SIZEOF_XMMWORD		; outptr0
-	add	rbx, byte SIZEOF_XMMWORD		; outptr1
-	add	rdx, byte SIZEOF_XMMWORD		; outptr2
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop		; another full group remains
-	test	rcx,rcx
-	jnz	near .column_ld1		; a shorter remainder is left: partial load
-
-	pop	rcx			; col
-	pop	rsi			; restore the row-array cursors pushed at .rowloop
-	pop	rdi
-	pop	rbx
-	pop	rdx
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
-	add	rdi, byte SIZEOF_JSAMPROW	; next row entry of plane 0 (Y)
-	add	rbx, byte SIZEOF_JSAMPROW	; next row entry of plane 1 (Cb)
-	add	rdx, byte SIZEOF_JSAMPROW	; next row entry of plane 2 (Cr)
-	dec	rax				; num_rows
-	jg	near .rowloop			; loop while rows remain
-
-.return:
-	pop	rbx
-	uncollect_args			; macro from an include; counterpart of collect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- address of the saved rbp (stored by the prologue)
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm
deleted file mode 100644
index bcd51fc..0000000
--- a/simd/jcclrss2.asm
+++ /dev/null
@@ -1,503 +0,0 @@
-;
-; jcclrss2.asm - colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		8
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-
-	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_ycc_convert_sse2):	; 32-bit x86 SSE2 entry point: RGB rows -> Y/Cb/Cr planes
-	push	ebp
-	mov	eax,esp				; eax = address of the saved ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax			; keep that address at the aligned frame base
-	mov	ebp,esp				; ebp = aligned frame base
-	lea	esp, [wk(0)]			; reserve the wk[WK_NUM] xmmword scratch area
-	pushpic	eax		; make room for the GOT address (the gotptr slot)
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]	; arguments are addressed relative to eax
-	test	ecx,ecx
-	jz	near .return			; zero-width image: nothing to convert
-
-	push	ecx				; park img_width while ecx holds output_row
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]	; plane 0 row array (receives Y)
-	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]	; plane 1 row array (receives Cb)
-	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]	; plane 2 row array (receives Cr)
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]	; step each row array to entry output_row
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx				; ecx = img_width again
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]	; last argument read; eax now counts rows
-	test	eax,eax
-	jle	near .return			; zero or negative row count: nothing to do
-	alignx	16,7
-.rowloop:				; one pass per row; eax = rows still to convert
-	pushpic	eax			; presumably saves the row count in PIC builds, where eax is reloaded below
-	push	edx			; save the row-array cursors; they are popped after the column loop
-	push	ebx
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	mov	ebx, JSAMPROW [ebx]	; outptr1
-	mov	edx, JSAMPROW [edx]	; outptr2
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_XMMWORD	; at least one full group of columns?
-	jae	near .columnloop		; yes; otherwise fall through to the partial load
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; --------------- 3 bytes per input pixel
-
-.column_ld1:			; partial group: ecx = columns left, fewer than SIZEOF_XMMWORD
-	push	eax			; eax and edx are used as scratch until .column_ld4
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; ecx = bytes left (imul ecx,RGB_PIXELSIZE)
-	test	cl, SIZEOF_BYTE		; each set bit of the byte count selects one tail load
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	movzx	eax, BYTE [esi+ecx]	; last byte of the row
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	movzx	edx, WORD [esi+ecx]
-	shl	eax, WORD_BIT		; bytes gathered so far move above the new word
-	or	eax,edx
-.column_ld4:
-	movd	xmmA,eax		; continue accumulating in xmmA
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_DWORD	; make room below for the earlier bytes
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	ecx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA		; gathered tail becomes the second input vector
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]	; first vector comes straight from the row
-	mov	ecx, SIZEOF_XMMWORD	; count as a full group so the loop tail ends the row
-	jmp	short .rgb_ycc_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	ecx, SIZEOF_XMMWORD	; mov leaves the flags of the test above intact
-	jz	short .rgb_ycc_cnv	; whole remainder is already in xmmA
-	movdqa	xmmB,xmmA		; gathered tail becomes the third input vector
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:			; full group: three unaligned xmmword loads of packed pixels
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:			; byte tags below: first hex digit = component in the pixel, second = pixel index
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA	; pass 1 of 3: pair bytes that lie 8 pixels apart
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA	; pass 2 of 3: same pattern, now 4 pixels apart
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA	; pass 3 of 3: 2 pixels apart, leaving whole even/odd runs
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH	; zero register: unpacking against it widens bytes to words
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; ----------- 4 bytes per input pixel
-
-.column_ld1:			; partial group: ecx = pixels left, fewer than SIZEOF_XMMWORD
-	test	cl, SIZEOF_XMMWORD/16	; each set bit of the pixel count selects one tail load
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]	; last single pixel
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD	; pixel gathered so far moves above the new pair
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA		; gathered tail becomes the second input vector
-	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	ecx, SIZEOF_XMMWORD	; count as a full group; mov keeps the flags of the test
-	jz	short .rgb_ycc_cnv
-	movdqa	xmmF,xmmA		; what was gathered becomes the third and fourth vectors
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_ycc_cnv
-	alignx	16,7
-
-.columnloop:			; full group: four unaligned xmmword loads of packed pixels
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:			; byte tags below: first hex digit = component in the pixel, second = pixel index
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA	; byte interleave: pair pixels 4 apart
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA	; word interleave: collect each component of every 4th pixel
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA	; byte interleave again: whole even-pixel and odd-pixel runs
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF	; zero register: unpacking against it widens bytes to words
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH	; xmmF is still zero: the 2x bytes land in the high byte of each word
-	punpckhbw xmmH,xmmH	; the 3x bytes are duplicated into both bytes of each word
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-	; (E/O = even/odd pixels, one word per sample; xmmA-xmmH above presumably alias xmm0-xmm7 via an include)
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
-	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
-	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
-	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3		; xmm1=(RO,GO) word pairs, low half
-	punpckhwd xmm6,xmm3		; xmm6=(RO,GO) word pairs, high half
-	movdqa    xmm7,xmm1		; copies of the pairs for the Cb coefficients
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-	pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	pxor      xmm1,xmm1		; zeroed so the unpack leaves B in the upper word of each dword
-	pxor      xmm6,xmm6
-	punpcklwd xmm1,xmm5		; xmm1=BOL
-	punpckhwd xmm6,xmm5		; xmm6=BOH
-	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
-
-	movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm1		; add the 0.5*B term
-	paddd     xmm4,xmm6
-	paddd     xmm7,xmm5		; add the constant (defined elsewhere; by its name rounding + CENTERJSAMPLE, confirm)
-	paddd     xmm4,xmm5
-	psrld     xmm7,SCALEBITS	; xmm7=CbOL
-	psrld     xmm4,SCALEBITS	; xmm4=CbOH
-	packssdw  xmm7,xmm4		; xmm7=CbO
-
-	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2		; xmm0=(RE,GE) word pairs, low half
-	punpckhwd xmm6,xmm2		; xmm6=(RE,GE) word pairs, high half
-	movdqa    xmm5,xmm0		; copies of the pairs for the Cb coefficients
-	movdqa    xmm4,xmm6
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	pxor      xmm0,xmm0		; zeroed so the unpack leaves B in the upper word of each dword
-	pxor      xmm6,xmm6
-	punpcklwd xmm0,xmm1		; xmm0=BEL
-	punpckhwd xmm6,xmm1		; xmm6=BEH
-	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
-	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
-
-	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm5,xmm0		; add the 0.5*B term
-	paddd     xmm4,xmm6
-	paddd     xmm5,xmm1		; add the same constant as for the odd pixels
-	paddd     xmm4,xmm1
-	psrld     xmm5,SCALEBITS	; xmm5=CbEL
-	psrld     xmm4,SCALEBITS	; xmm4=CbEH
-	packssdw  xmm5,xmm4		; xmm5=CbE
-
-	psllw     xmm7,BYTE_BIT		; odd-pixel Cb moves to the high byte of each word
-	por       xmm5,xmm7		; xmm5=Cb
-	movdqa    XMMWORD [ebx], xmm5	; Save Cb
-
-	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
-	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3		; xmm0=(BO,GO) word pairs, low half
-	punpckhwd xmm4,xmm3		; xmm4=(BO,GO) word pairs, high half
-	movdqa    xmm7,xmm0		; copies of the pairs for the Cr coefficients
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-	pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, XMMWORD [wk(4)]	; add the R/G partial sums saved in wk(4)/wk(5)
-	paddd     xmm4, XMMWORD [wk(5)]
-	paddd     xmm0,xmm3		; add PD_ONEHALF before the shift
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	pxor      xmm3,xmm3		; zeroed so the unpack leaves R in the upper word of each dword
-	pxor      xmm4,xmm4
-	punpcklwd xmm3,xmm1		; xmm3=ROL
-	punpckhwd xmm4,xmm1		; xmm4=ROH
-	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
-
-	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm7,xmm3		; add the 0.5*R term
-	paddd     xmm5,xmm4
-	paddd     xmm7,xmm1		; add the constant (defined elsewhere)
-	paddd     xmm5,xmm1
-	psrld     xmm7,SCALEBITS	; xmm7=CrOL
-	psrld     xmm5,SCALEBITS	; xmm5=CrOH
-	packssdw  xmm7,xmm5		; xmm7=CrO
-
-	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2		; xmm6=(BE,GE) word pairs, low half
-	punpckhwd xmm4,xmm2		; xmm4=(BE,GE) word pairs, high half
-	movdqa    xmm1,xmm6		; copies of the pairs for the Cr coefficients
-	movdqa    xmm5,xmm4
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-	pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(6)]	; add the R/G partial sums saved in wk(6)/wk(7)
-	paddd     xmm4, XMMWORD [wk(7)]
-	paddd     xmm6,xmm2		; add PD_ONEHALF before the shift
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT		; odd-pixel Y moves to the high byte of each word
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [edi], xmm6	; Save Y
-
-	pxor      xmm2,xmm2		; zeroed so the unpack leaves R in the upper word of each dword
-	pxor      xmm4,xmm4
-	punpcklwd xmm2,xmm3		; xmm2=REL
-	punpckhwd xmm4,xmm3		; xmm4=REH
-	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
-	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
-
-	movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
-	paddd     xmm1,xmm2		; add the 0.5*R term
-	paddd     xmm5,xmm4
-	paddd     xmm1,xmm0		; add the constant (defined elsewhere)
-	paddd     xmm5,xmm0
-	psrld     xmm1,SCALEBITS	; xmm1=CrEL
-	psrld     xmm5,SCALEBITS	; xmm5=CrEH
-	packssdw  xmm1,xmm5		; xmm1=CrE
-
-	psllw     xmm7,BYTE_BIT		; odd-pixel Cr moves to the high byte of each word
-	por       xmm1,xmm7		; xmm1=Cr
-	movdqa    XMMWORD [edx], xmm1	; Save Cr
-
-	sub	ecx, byte SIZEOF_XMMWORD	; one group of columns is done
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	edi, byte SIZEOF_XMMWORD		; outptr0
-	add	ebx, byte SIZEOF_XMMWORD		; outptr1
-	add	edx, byte SIZEOF_XMMWORD		; outptr2
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop		; another full group remains
-	test	ecx,ecx
-	jnz	near .column_ld1		; a shorter remainder is left: partial load
-
-	pop	ecx			; col
-	pop	esi			; restore the row-array cursors pushed at .rowloop
-	pop	edi
-	pop	ebx
-	pop	edx
-	poppic	eax			; counterpart of the pushpic at .rowloop
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW	; next row entry of plane 0 (Y)
-	add	ebx, byte SIZEOF_JSAMPROW	; next row entry of plane 1 (Cb)
-	add	edx, byte SIZEOF_JSAMPROW	; next row entry of plane 2 (Cr)
-	dec	eax				; num_rows
-	jg	near .rowloop			; loop while rows remain
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- address of the saved ebp (stored by the prologue)
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jccolext-altivec.c b/simd/jccolext-altivec.c
new file mode 100644
index 0000000..403aa96
--- /dev/null
+++ b/simd/jccolext-altivec.c
@@ -0,0 +1,267 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-altivec.c */
+
+/* AltiVec RGB -> YCbCr conversion of num_rows rows, 16 pixels per pass. */
+void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;  /* pitch: bytes per row */
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+    rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = {0};
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh, crl, crh, cbl, cbh;
+  __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+  /* Constants: multiplier pairs, addends, and the pack permute index */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
+    pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
+    pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
+  __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) },
+    pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;  /* num_cols = input bytes left */
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: load from inptr, then realign with vec_perm() */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path: stage the partial chunk in tmpbuf and load from it */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = VEC_LD(0, tmpbuf);
+          rgb1 = VEC_LD(16, tmpbuf);
+          rgb2 = VEC_LD(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = VEC_LD(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: load only the vectors that still hold input bytes */
+          rgb0 = VEC_LD(0, inptr);
+          if (num_cols > 16)
+            rgb1 = VEC_LD(16, inptr);
+          if (num_cols > 32)
+            rgb2 = VEC_LD(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = VEC_LD(48, inptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      /* Calculate Y values */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr0);
+
+      /* Calculate Cb values (same descale-and-pack permute as for Y) */
+      cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
+      cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
+                                   (__vector unsigned int)cb0);
+      cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
+                                   (__vector unsigned int)cb1);
+      cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
+                                   (__vector unsigned int)cb2);
+      cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
+                                   (__vector unsigned int)cb3);
+      cbl = vec_perm((__vector unsigned short)cb0,
+                     (__vector unsigned short)cb1, shift_pack_index);
+      cbh = vec_perm((__vector unsigned short)cb2,
+                     (__vector unsigned short)cb3, shift_pack_index);
+      cb = vec_pack(cbl, cbh);
+      vec_st(cb, 0, outptr1);
+
+      /* Calculate Cr values (same descale-and-pack permute as for Y) */
+      cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
+      cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
+                                   (__vector unsigned int)cr0);
+      cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
+                                   (__vector unsigned int)cr1);
+      cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
+                                   (__vector unsigned int)cr2);
+      cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
+                                   (__vector unsigned int)cr3);
+      crl = vec_perm((__vector unsigned short)cr0,
+                     (__vector unsigned short)cr1, shift_pack_index);
+      crh = vec_perm((__vector unsigned short)cr2,
+                     (__vector unsigned short)cr3, shift_pack_index);
+      cr = vec_pack(crl, crh);
+      vec_st(cr, 0, outptr2);
+    }
+  }
+}
diff --git a/simd/jccolext-mmx.asm b/simd/jccolext-mmx.asm
new file mode 100644
index 0000000..d3d47a5
--- /dev/null
+++ b/simd/jccolext-mmx.asm
@@ -0,0 +1,477 @@
+;
+; jccolext-mmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
+;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                           JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION img_width
+%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
+%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
+%define output_row(b)   (b)+20          ; JDIMENSION output_row
+%define num_rows(b)     (b)+24          ; int num_rows
+
+%define original_ebp    ebp+0           ; [ebp+0] holds the pre-alignment esp
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          8               ; number of mmword slots in wk[]
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+        align   16
+        global  EXTN(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):        ; RGB -> YCbCr row conversion (MMX)
+        push    ebp
+        mov     eax,esp                         ; eax = esp (points at saved ebp)
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; keep unaligned esp for the epilogue
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]
+        pushpic eax             ; make room for the GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
+        test    ecx,ecx
+        jz      near .return            ; zero width: nothing to convert
+
+        push    ecx
+
+        mov     esi, JSAMPIMAGE [output_buf(eax)]
+        mov     ecx, JDIMENSION [output_row(eax)]
+        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
+        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+        pop     ecx
+
+        mov     esi, JSAMPARRAY [input_buf(eax)]
+        mov     eax, INT [num_rows(eax)]
+        test    eax,eax
+        jle     near .return            ; num_rows <= 0: nothing to convert
+        alignx  16,7
+.rowloop:
+        pushpic eax
+        push    edx
+        push    ebx
+        push    edi
+        push    esi
+        push    ecx                     ; col
+
+        mov     esi, JSAMPROW [esi]     ; inptr
+        mov     edi, JSAMPROW [edi]     ; outptr0
+        mov     ebx, JSAMPROW [ebx]     ; outptr1
+        mov     edx, JSAMPROW [edx]     ; outptr2
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+        cmp     ecx, byte SIZEOF_MMWORD
+        jae     short .columnloop       ; else fall into the partial load
+        alignx  16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+        push    eax
+        push    edx
+        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
+        test    cl, SIZEOF_BYTE
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_BYTE
+        xor     eax,eax
+        mov     al, BYTE [esi+ecx]
+.column_ld2:
+        test    cl, SIZEOF_WORD
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_WORD
+        xor     edx,edx
+        mov     dx, WORD [esi+ecx]
+        shl     eax, WORD_BIT
+        or      eax,edx
+.column_ld4:
+        movd    mmA,eax
+        pop     edx
+        pop     eax
+        test    cl, SIZEOF_DWORD
+        jz      short .column_ld8
+        sub     ecx, byte SIZEOF_DWORD
+        movd    mmG, DWORD [esi+ecx]
+        psllq   mmA, DWORD_BIT
+        por     mmA,mmG
+.column_ld8:
+        test    cl, SIZEOF_MMWORD
+        jz      short .column_ld16
+        movq    mmG,mmA
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        mov     ecx, SIZEOF_MMWORD
+        jmp     short .rgb_ycc_cnv
+.column_ld16:
+        test    cl, 2*SIZEOF_MMWORD
+        mov     ecx, SIZEOF_MMWORD
+        jz      short .rgb_ycc_cnv
+        movq    mmF,mmA
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+        jmp     short .rgb_ycc_cnv
+        alignx  16,7
+
+.columnloop:
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+        ; mmA=(00 10 20 01 11 21 02 12)
+        ; mmG=(22 03 13 23 04 14 24 05)
+        ; mmF=(15 25 06 16 26 07 17 27)
+
+        movq      mmD,mmA
+        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
+        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
+
+        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
+        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
+
+        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
+        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
+
+        movq      mmE,mmA
+        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
+        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
+
+        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
+        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
+
+        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
+        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
+
+        pxor      mmH,mmH
+
+        movq      mmC,mmA
+        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
+        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
+
+        movq      mmB,mmE
+        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
+        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
+
+        movq      mmF,mmD
+        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
+        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+        test    cl, SIZEOF_MMWORD/8
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_MMWORD/8
+        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+        test    cl, SIZEOF_MMWORD/4
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_MMWORD/4
+        movq    mmF,mmA
+        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+        test    cl, SIZEOF_MMWORD/2
+        mov     ecx, SIZEOF_MMWORD
+        jz      short .rgb_ycc_cnv
+        movq    mmD,mmA
+        movq    mmC,mmF
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+        jmp     short .rgb_ycc_cnv
+        alignx  16,7
+
+.columnloop:
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+        ; mmA=(00 10 20 30 01 11 21 31)
+        ; mmF=(02 12 22 32 03 13 23 33)
+        ; mmD=(04 14 24 34 05 15 25 35)
+        ; mmC=(06 16 26 36 07 17 27 37)
+
+        movq      mmB,mmA
+        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
+        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
+
+        movq      mmG,mmD
+        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
+        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
+
+        movq      mmE,mmA
+        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
+        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
+
+        movq      mmH,mmB
+        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
+        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
+
+        pxor      mmF,mmF
+
+        movq      mmC,mmA
+        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
+        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
+
+        movq      mmD,mmB
+        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
+        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
+
+        movq      mmG,mmE
+        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
+        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
+
+        punpcklbw mmF,mmH
+        punpckhbw mmH,mmH
+        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
+        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+        ; (Original)
+        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+        ;
+        ; (This implementation)
+        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+        movq      MMWORD [wk(0)], mm0   ; wk(0)=RE
+        movq      MMWORD [wk(1)], mm1   ; wk(1)=RO
+        movq      MMWORD [wk(2)], mm4   ; wk(2)=BE
+        movq      MMWORD [wk(3)], mm5   ; wk(3)=BO
+
+        movq      mm6,mm1
+        punpcklwd mm1,mm3
+        punpckhwd mm6,mm3
+        movq      mm7,mm1
+        movq      mm4,mm6
+        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+        pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+        movq      MMWORD [wk(4)], mm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+        movq      MMWORD [wk(5)], mm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        pxor      mm1,mm1
+        pxor      mm6,mm6
+        punpcklwd mm1,mm5               ; mm1=BOL
+        punpckhwd mm6,mm5               ; mm6=BOH
+        psrld     mm1,1                 ; mm1=BOL*FIX(0.500)
+        psrld     mm6,1                 ; mm6=BOH*FIX(0.500)
+
+        movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+        paddd     mm7,mm1
+        paddd     mm4,mm6
+        paddd     mm7,mm5
+        paddd     mm4,mm5
+        psrld     mm7,SCALEBITS         ; mm7=CbOL
+        psrld     mm4,SCALEBITS         ; mm4=CbOH
+        packssdw  mm7,mm4               ; mm7=CbO
+
+        movq      mm1, MMWORD [wk(2)]   ; mm1=BE
+
+        movq      mm6,mm0
+        punpcklwd mm0,mm2
+        punpckhwd mm6,mm2
+        movq      mm5,mm0
+        movq      mm4,mm6
+        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+        pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+        movq      MMWORD [wk(6)], mm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+        movq      MMWORD [wk(7)], mm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        pxor      mm0,mm0
+        pxor      mm6,mm6
+        punpcklwd mm0,mm1               ; mm0=BEL
+        punpckhwd mm6,mm1               ; mm6=BEH
+        psrld     mm0,1                 ; mm0=BEL*FIX(0.500)
+        psrld     mm6,1                 ; mm6=BEH*FIX(0.500)
+
+        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+        paddd     mm5,mm0
+        paddd     mm4,mm6
+        paddd     mm5,mm1
+        paddd     mm4,mm1
+        psrld     mm5,SCALEBITS         ; mm5=CbEL
+        psrld     mm4,SCALEBITS         ; mm4=CbEH
+        packssdw  mm5,mm4               ; mm5=CbE
+
+        psllw     mm7,BYTE_BIT
+        por       mm5,mm7               ; mm5=Cb
+        movq      MMWORD [ebx], mm5     ; Save Cb
+
+        movq      mm0, MMWORD [wk(3)]   ; mm0=BO
+        movq      mm6, MMWORD [wk(2)]   ; mm6=BE
+        movq      mm1, MMWORD [wk(1)]   ; mm1=RO
+
+        movq      mm4,mm0
+        punpcklwd mm0,mm3
+        punpckhwd mm4,mm3
+        movq      mm7,mm0
+        movq      mm5,mm4
+        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+        pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+        paddd     mm0, MMWORD [wk(4)]
+        paddd     mm4, MMWORD [wk(5)]
+        paddd     mm0,mm3
+        paddd     mm4,mm3
+        psrld     mm0,SCALEBITS         ; mm0=YOL
+        psrld     mm4,SCALEBITS         ; mm4=YOH
+        packssdw  mm0,mm4               ; mm0=YO
+
+        pxor      mm3,mm3
+        pxor      mm4,mm4
+        punpcklwd mm3,mm1               ; mm3=ROL
+        punpckhwd mm4,mm1               ; mm4=ROH
+        psrld     mm3,1                 ; mm3=ROL*FIX(0.500)
+        psrld     mm4,1                 ; mm4=ROH*FIX(0.500)
+
+        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+        paddd     mm7,mm3
+        paddd     mm5,mm4
+        paddd     mm7,mm1
+        paddd     mm5,mm1
+        psrld     mm7,SCALEBITS         ; mm7=CrOL
+        psrld     mm5,SCALEBITS         ; mm5=CrOH
+        packssdw  mm7,mm5               ; mm7=CrO
+
+        movq      mm3, MMWORD [wk(0)]   ; mm3=RE
+
+        movq      mm4,mm6
+        punpcklwd mm6,mm2
+        punpckhwd mm4,mm2
+        movq      mm1,mm6
+        movq      mm5,mm4
+        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+        pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
+
+        paddd     mm6, MMWORD [wk(6)]
+        paddd     mm4, MMWORD [wk(7)]
+        paddd     mm6,mm2
+        paddd     mm4,mm2
+        psrld     mm6,SCALEBITS         ; mm6=YEL
+        psrld     mm4,SCALEBITS         ; mm4=YEH
+        packssdw  mm6,mm4               ; mm6=YE
+
+        psllw     mm0,BYTE_BIT
+        por       mm6,mm0               ; mm6=Y
+        movq      MMWORD [edi], mm6     ; Save Y
+
+        pxor      mm2,mm2
+        pxor      mm4,mm4
+        punpcklwd mm2,mm3               ; mm2=REL
+        punpckhwd mm4,mm3               ; mm4=REH
+        psrld     mm2,1                 ; mm2=REL*FIX(0.500)
+        psrld     mm4,1                 ; mm4=REH*FIX(0.500)
+
+        movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+        paddd     mm1,mm2
+        paddd     mm5,mm4
+        paddd     mm1,mm0
+        paddd     mm5,mm0
+        psrld     mm1,SCALEBITS         ; mm1=CrEL
+        psrld     mm5,SCALEBITS         ; mm5=CrEH
+        packssdw  mm1,mm5               ; mm1=CrE
+
+        psllw     mm7,BYTE_BIT
+        por       mm1,mm7               ; mm1=Cr
+        movq      MMWORD [edx], mm1     ; Save Cr
+
+        sub     ecx, byte SIZEOF_MMWORD
+        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
+        add     edi, byte SIZEOF_MMWORD                 ; outptr0
+        add     ebx, byte SIZEOF_MMWORD                 ; outptr1
+        add     edx, byte SIZEOF_MMWORD                 ; outptr2
+        cmp     ecx, byte SIZEOF_MMWORD
+        jae     near .columnloop
+        test    ecx,ecx
+        jnz     near .column_ld1        ; leftover columns: partial load
+
+        pop     ecx                     ; col
+        pop     esi
+        pop     edi
+        pop     ebx
+        pop     edx
+        poppic  eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
+        add     edi, byte SIZEOF_JSAMPROW
+        add     ebx, byte SIZEOF_JSAMPROW
+        add     edx, byte SIZEOF_JSAMPROW
+        dec     eax                             ; num_rows
+        jg      near .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jccolext-sse2-64.asm b/simd/jccolext-sse2-64.asm
new file mode 100644
index 0000000..7ad4343
--- /dev/null
+++ b/simd/jccolext-sse2-64.asm
@@ -0,0 +1,485 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          8               ; number of xmmword scratch slots addressed through wk(i)
+
+        align   16
+
+        global  EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):               ; converts num_rows rows of RGB input to Y/Cb/Cr (see prototype above)
+        push    rbp
+        mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; keep the unaligned rsp at [aligned rbp] for the epilogue
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; reserve the WK_NUM xmmword scratch slots below rbp
+        collect_args                            ; macro defined elsewhere; presumably fills r10-r14 as listed above
+        push    rbx
+
+        mov     ecx, r10d                       ; ecx = img_width (32-bit write zero-extends into rcx)
+        test    rcx,rcx
+        jz      near .return                    ; zero-width image: nothing to convert
+
+        push    rcx                             ; park img_width while rcx holds output_row
+
+        mov rsi, r12                            ; rsi = output_buf
+        mov ecx, r13d                           ; rcx = output_row
+        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]
+        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &output_buf[1][output_row]
+        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &output_buf[2][output_row]
+
+        pop     rcx                             ; rcx = img_width again
+
+        mov rsi, r11                            ; rsi = input_buf
+        mov     eax, r14d                       ; eax = num_rows (zero-extended into rax)
+        test    eax,eax                         ; signed 32-bit test so that num_rows < 0 is rejected too
+        jle     near .return
+.rowloop:
+        push    rdx                     ; save the row-array cursors; popped after the column loop
+        push    rbx
+        push    rdi
+        push    rsi
+        push    rcx                     ; col
+
+        mov     rsi, JSAMPROW [rsi]     ; inptr
+        mov     rdi, JSAMPROW [rdi]     ; outptr0
+        mov     rbx, JSAMPROW [rbx]     ; outptr1
+        mov     rdx, JSAMPROW [rdx]     ; outptr2
+
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     near .columnloop        ; a full chunk of columns is available
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+        push    rax                     ; rax/rdx serve as scratch for the byte/word loads below
+        push    rdx
+        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
+        test    cl, SIZEOF_BYTE
+        jz      short .column_ld2
+        sub     rcx, byte SIZEOF_BYTE
+        movzx   rax, BYTE [rsi+rcx]
+.column_ld2:
+        test    cl, SIZEOF_WORD
+        jz      short .column_ld4
+        sub     rcx, byte SIZEOF_WORD
+        movzx   rdx, WORD [rsi+rcx]
+        shl     rax, WORD_BIT
+        or      rax,rdx
+.column_ld4:
+        movd    xmmA,eax
+        pop     rdx
+        pop     rax
+        test    cl, SIZEOF_DWORD
+        jz      short .column_ld8
+        sub     rcx, byte SIZEOF_DWORD
+        movd    xmmF, XMM_DWORD [rsi+rcx]
+        pslldq  xmmA, SIZEOF_DWORD
+        por     xmmA,xmmF
+.column_ld8:
+        test    cl, SIZEOF_MMWORD
+        jz      short .column_ld16
+        sub     rcx, byte SIZEOF_MMWORD
+        movq    xmmB, XMM_MMWORD [rsi+rcx]
+        pslldq  xmmA, SIZEOF_MMWORD
+        por     xmmA,xmmB
+.column_ld16:
+        test    cl, SIZEOF_XMMWORD
+        jz      short .column_ld32
+        movdqa  xmmF,xmmA
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        mov     rcx, SIZEOF_XMMWORD     ; count the partial chunk as a full one so the loop ends after it
+        jmp     short .rgb_ycc_cnv
+.column_ld32:
+        test    cl, 2*SIZEOF_XMMWORD
+        mov     rcx, SIZEOF_XMMWORD     ; (mov leaves the flags set by test untouched)
+        jz      short .rgb_ycc_cnv
+        movdqa  xmmB,xmmA
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_ycc_cnv
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        movdqa    xmmG,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+        movdqa    xmmD,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+        movdqa    xmmE,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+        pxor      xmmH,xmmH     ; xmmH=0, used to zero-extend bytes to words below
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmB,xmmE
+        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+        movdqa    xmmF,xmmD
+        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+        test    cl, SIZEOF_XMMWORD/16
+        jz      short .column_ld2
+        sub     rcx, byte SIZEOF_XMMWORD/16
+        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+        test    cl, SIZEOF_XMMWORD/8
+        jz      short .column_ld4
+        sub     rcx, byte SIZEOF_XMMWORD/8
+        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+        pslldq  xmmA, SIZEOF_MMWORD
+        por     xmmA,xmmE
+.column_ld4:
+        test    cl, SIZEOF_XMMWORD/4
+        jz      short .column_ld8
+        sub     rcx, byte SIZEOF_XMMWORD/4
+        movdqa  xmmE,xmmA
+        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+        test    cl, SIZEOF_XMMWORD/2
+        mov     rcx, SIZEOF_XMMWORD     ; (mov leaves the flags set by test untouched)
+        jz      short .rgb_ycc_cnv
+        movdqa  xmmF,xmmA
+        movdqa  xmmH,xmmE
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_ycc_cnv
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+        movdqa    xmmC,xmmF
+        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+        movdqa    xmmB,xmmA
+        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+        movdqa    xmmG,xmmD
+        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+        movdqa    xmmE,xmmA
+        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+        movdqa    xmmH,xmmB
+        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+        pxor      xmmF,xmmF     ; xmmF=0, used to zero-extend bytes to words below
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmD,xmmB
+        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+        movdqa    xmmG,xmmE
+        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+        punpcklbw xmmF,xmmH     ; bytes land in the high half of each word; shifted down below
+        punpckhbw xmmH,xmmH
+        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+        ; (Original)
+        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+        ;
+        ; (This implementation)
+        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+        movdqa    xmm6,xmm1
+        punpcklwd xmm1,xmm3             ; interleave RO with GO (low half) for pmaddwd
+        punpckhwd xmm6,xmm3             ; interleave RO with GO (high half)
+        movdqa    xmm7,xmm1
+        movdqa    xmm4,xmm6
+        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+        pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        pxor      xmm1,xmm1
+        pxor      xmm6,xmm6
+        punpcklwd xmm1,xmm5             ; xmm1=BOL
+        punpckhwd xmm6,xmm5             ; xmm6=BOH
+        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
+        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
+
+        movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm7,xmm1
+        paddd     xmm4,xmm6
+        paddd     xmm7,xmm5
+        paddd     xmm4,xmm5
+        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
+        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
+        packssdw  xmm7,xmm4             ; xmm7=CbO
+
+        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+        movdqa    xmm6,xmm0
+        punpcklwd xmm0,xmm2             ; interleave RE with GE (low half) for pmaddwd
+        punpckhwd xmm6,xmm2             ; interleave RE with GE (high half)
+        movdqa    xmm5,xmm0
+        movdqa    xmm4,xmm6
+        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+        pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        pxor      xmm0,xmm0
+        pxor      xmm6,xmm6
+        punpcklwd xmm0,xmm1             ; xmm0=BEL
+        punpckhwd xmm6,xmm1             ; xmm6=BEH
+        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
+        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
+
+        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm5,xmm0
+        paddd     xmm4,xmm6
+        paddd     xmm5,xmm1
+        paddd     xmm4,xmm1
+        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
+        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
+        packssdw  xmm5,xmm4             ; xmm5=CbE
+
+        psllw     xmm7,BYTE_BIT         ; move CbO into the high byte of each word
+        por       xmm5,xmm7             ; xmm5=Cb
+        movdqa    XMMWORD [rbx], xmm5   ; Save Cb
+
+        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
+        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
+        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+        movdqa    xmm4,xmm0
+        punpcklwd xmm0,xmm3             ; interleave BO with GO (low half) for pmaddwd
+        punpckhwd xmm4,xmm3             ; interleave BO with GO (high half)
+        movdqa    xmm7,xmm0
+        movdqa    xmm5,xmm4
+        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+        pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+        paddd     xmm0, XMMWORD [wk(4)]
+        paddd     xmm4, XMMWORD [wk(5)]
+        paddd     xmm0,xmm3
+        paddd     xmm4,xmm3
+        psrld     xmm0,SCALEBITS        ; xmm0=YOL
+        psrld     xmm4,SCALEBITS        ; xmm4=YOH
+        packssdw  xmm0,xmm4             ; xmm0=YO
+
+        pxor      xmm3,xmm3
+        pxor      xmm4,xmm4
+        punpcklwd xmm3,xmm1             ; xmm3=ROL
+        punpckhwd xmm4,xmm1             ; xmm4=ROH
+        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
+        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
+
+        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm7,xmm3
+        paddd     xmm5,xmm4
+        paddd     xmm7,xmm1
+        paddd     xmm5,xmm1
+        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
+        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
+        packssdw  xmm7,xmm5             ; xmm7=CrO
+
+        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+        movdqa    xmm4,xmm6
+        punpcklwd xmm6,xmm2             ; interleave BE with GE (low half) for pmaddwd
+        punpckhwd xmm4,xmm2             ; interleave BE with GE (high half)
+        movdqa    xmm1,xmm6
+        movdqa    xmm5,xmm4
+        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+        pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+        paddd     xmm6, XMMWORD [wk(6)]
+        paddd     xmm4, XMMWORD [wk(7)]
+        paddd     xmm6,xmm2
+        paddd     xmm4,xmm2
+        psrld     xmm6,SCALEBITS        ; xmm6=YEL
+        psrld     xmm4,SCALEBITS        ; xmm4=YEH
+        packssdw  xmm6,xmm4             ; xmm6=YE
+
+        psllw     xmm0,BYTE_BIT         ; move YO into the high byte of each word
+        por       xmm6,xmm0             ; xmm6=Y
+        movdqa    XMMWORD [rdi], xmm6   ; Save Y
+
+        pxor      xmm2,xmm2
+        pxor      xmm4,xmm4
+        punpcklwd xmm2,xmm3             ; xmm2=REL
+        punpckhwd xmm4,xmm3             ; xmm4=REH
+        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
+        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
+
+        movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm1,xmm2
+        paddd     xmm5,xmm4
+        paddd     xmm1,xmm0
+        paddd     xmm5,xmm0
+        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
+        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
+        packssdw  xmm1,xmm5             ; xmm1=CrE
+
+        psllw     xmm7,BYTE_BIT         ; move CrO into the high byte of each word
+        por       xmm1,xmm7             ; xmm1=Cr
+        movdqa    XMMWORD [rdx], xmm1   ; Save Cr
+
+        sub     rcx, byte SIZEOF_XMMWORD
+        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
+        add     rbx, byte SIZEOF_XMMWORD                ; outptr1
+        add     rdx, byte SIZEOF_XMMWORD                ; outptr2
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     near .columnloop
+        test    rcx,rcx
+        jnz     near .column_ld1        ; fewer than SIZEOF_XMMWORD columns remain: partial load
+
+        pop     rcx                     ; col
+        pop     rsi
+        pop     rdi
+        pop     rbx
+        pop     rdx
+
+        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
+        add     rdi, byte SIZEOF_JSAMPROW
+        add     rbx, byte SIZEOF_JSAMPROW
+        add     rdx, byte SIZEOF_JSAMPROW
+        dec     rax                             ; num_rows
+        jg      near .rowloop
+
+.return:
+        pop     rbx
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- original rbp
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jccolext-sse2.asm b/simd/jccolext-sse2.asm
new file mode 100644
index 0000000..cc38e98
--- /dev/null
+++ b/simd/jccolext-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION img_width
+%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
+%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
+%define output_row(b)   (b)+20          ; JDIMENSION output_row
+%define num_rows(b)     (b)+24          ; int num_rows
+
+%define original_ebp    ebp+0           ; slot at the aligned ebp that holds the pre-alignment esp
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          8               ; number of xmmword scratch slots addressed through wk(i)
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+        align   16
+
+        global  EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]
+        pushpic eax             ; make a room for GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [img_width(eax)]
+        test    ecx,ecx
+        jz      near .return
+
+        push    ecx
+
+        mov     esi, JSAMPIMAGE [output_buf(eax)]
+        mov     ecx, JDIMENSION [output_row(eax)]
+        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
+        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+        pop     ecx
+
+        mov     esi, JSAMPARRAY [input_buf(eax)]
+        mov     eax, INT [num_rows(eax)]
+        test    eax,eax
+        jle     near .return
+        alignx  16,7
+.rowloop:
+        pushpic eax
+        push    edx
+        push    ebx
+        push    edi
+        push    esi
+        push    ecx                     ; col
+
+        mov     esi, JSAMPROW [esi]     ; inptr
+        mov     edi, JSAMPROW [edi]     ; outptr0
+        mov     ebx, JSAMPROW [ebx]     ; outptr1
+        mov     edx, JSAMPROW [edx]     ; outptr2
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     near .columnloop
+        alignx  16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+        push    eax
+        push    edx
+        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
+        test    cl, SIZEOF_BYTE
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_BYTE
+        movzx   eax, BYTE [esi+ecx]
+.column_ld2:
+        test    cl, SIZEOF_WORD
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_WORD
+        movzx   edx, WORD [esi+ecx]
+        shl     eax, WORD_BIT
+        or      eax,edx
+.column_ld4:
+        movd    xmmA,eax
+        pop     edx
+        pop     eax
+        test    cl, SIZEOF_DWORD
+        jz      short .column_ld8
+        sub     ecx, byte SIZEOF_DWORD
+        movd    xmmF, XMM_DWORD [esi+ecx]
+        pslldq  xmmA, SIZEOF_DWORD
+        por     xmmA,xmmF
+.column_ld8:
+        test    cl, SIZEOF_MMWORD
+        jz      short .column_ld16
+        sub     ecx, byte SIZEOF_MMWORD
+        movq    xmmB, XMM_MMWORD [esi+ecx]
+        pslldq  xmmA, SIZEOF_MMWORD
+        por     xmmA,xmmB
+.column_ld16:
+        test    cl, SIZEOF_XMMWORD
+        jz      short .column_ld32
+        movdqa  xmmF,xmmA
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        mov     ecx, SIZEOF_XMMWORD
+        jmp     short .rgb_ycc_cnv
+.column_ld32:
+        test    cl, 2*SIZEOF_XMMWORD
+        mov     ecx, SIZEOF_XMMWORD
+        jz      short .rgb_ycc_cnv
+        movdqa  xmmB,xmmA
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_ycc_cnv
+        alignx  16,7
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        movdqa    xmmG,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+        movdqa    xmmD,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+        movdqa    xmmE,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+        pxor      xmmH,xmmH
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmB,xmmE
+        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+        movdqa    xmmF,xmmD
+        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+        test    cl, SIZEOF_XMMWORD/16
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_XMMWORD/16
+        movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+        test    cl, SIZEOF_XMMWORD/8
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_XMMWORD/8
+        movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+        pslldq  xmmA, SIZEOF_MMWORD
+        por     xmmA,xmmE
+.column_ld4:
+        test    cl, SIZEOF_XMMWORD/4
+        jz      short .column_ld8
+        sub     ecx, byte SIZEOF_XMMWORD/4
+        movdqa  xmmE,xmmA
+        movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+        test    cl, SIZEOF_XMMWORD/2
+        mov     ecx, SIZEOF_XMMWORD
+        jz      short .rgb_ycc_cnv
+        movdqa  xmmF,xmmA
+        movdqa  xmmH,xmmE
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_ycc_cnv
+        alignx  16,7
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+        movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+        movdqa    xmmC,xmmF
+        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+        movdqa    xmmB,xmmA
+        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+        movdqa    xmmG,xmmD
+        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+        movdqa    xmmE,xmmA
+        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+        movdqa    xmmH,xmmB
+        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+        pxor      xmmF,xmmF
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmD,xmmB
+        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+        movdqa    xmmG,xmmE
+        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+        punpcklbw xmmF,xmmH
+        punpckhbw xmmH,xmmH
+        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+        ; (Original)
+        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+        ;
+        ; (This implementation)
+        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+        movdqa    xmm6,xmm1
+        punpcklwd xmm1,xmm3
+        punpckhwd xmm6,xmm3
+        movdqa    xmm7,xmm1
+        movdqa    xmm4,xmm6
+        pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+        pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        pxor      xmm1,xmm1
+        pxor      xmm6,xmm6
+        punpcklwd xmm1,xmm5             ; xmm1=BOL
+        punpckhwd xmm6,xmm5             ; xmm6=BOH
+        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
+        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
+
+        movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm7,xmm1
+        paddd     xmm4,xmm6
+        paddd     xmm7,xmm5
+        paddd     xmm4,xmm5
+        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
+        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
+        packssdw  xmm7,xmm4             ; xmm7=CbO
+
+        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+        movdqa    xmm6,xmm0
+        punpcklwd xmm0,xmm2
+        punpckhwd xmm6,xmm2
+        movdqa    xmm5,xmm0
+        movdqa    xmm4,xmm6
+        pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+        pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        pxor      xmm0,xmm0
+        pxor      xmm6,xmm6
+        punpcklwd xmm0,xmm1             ; xmm0=BEL
+        punpckhwd xmm6,xmm1             ; xmm6=BEH
+        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
+        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
+
+        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm5,xmm0
+        paddd     xmm4,xmm6
+        paddd     xmm5,xmm1
+        paddd     xmm4,xmm1
+        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
+        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
+        packssdw  xmm5,xmm4             ; xmm5=CbE
+
+        psllw     xmm7,BYTE_BIT
+        por       xmm5,xmm7             ; xmm5=Cb
+        movdqa    XMMWORD [ebx], xmm5   ; Save Cb
+
+        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
+        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
+        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+        movdqa    xmm4,xmm0
+        punpcklwd xmm0,xmm3
+        punpckhwd xmm4,xmm3
+        movdqa    xmm7,xmm0
+        movdqa    xmm5,xmm4
+        pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+        pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+        movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+        paddd     xmm0, XMMWORD [wk(4)]
+        paddd     xmm4, XMMWORD [wk(5)]
+        paddd     xmm0,xmm3
+        paddd     xmm4,xmm3
+        psrld     xmm0,SCALEBITS        ; xmm0=YOL
+        psrld     xmm4,SCALEBITS        ; xmm4=YOH
+        packssdw  xmm0,xmm4             ; xmm0=YO
+
+        pxor      xmm3,xmm3
+        pxor      xmm4,xmm4
+        punpcklwd xmm3,xmm1             ; xmm3=ROL
+        punpckhwd xmm4,xmm1             ; xmm4=ROH
+        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
+        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
+
+        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm7,xmm3
+        paddd     xmm5,xmm4
+        paddd     xmm7,xmm1
+        paddd     xmm5,xmm1
+        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
+        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
+        packssdw  xmm7,xmm5             ; xmm7=CrO
+
+        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+        movdqa    xmm4,xmm6
+        punpcklwd xmm6,xmm2
+        punpckhwd xmm4,xmm2
+        movdqa    xmm1,xmm6
+        movdqa    xmm5,xmm4
+        pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+        pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+        movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+        paddd     xmm6, XMMWORD [wk(6)]
+        paddd     xmm4, XMMWORD [wk(7)]
+        paddd     xmm6,xmm2
+        paddd     xmm4,xmm2
+        psrld     xmm6,SCALEBITS        ; xmm6=YEL
+        psrld     xmm4,SCALEBITS        ; xmm4=YEH
+        packssdw  xmm6,xmm4             ; xmm6=YE
+
+        psllw     xmm0,BYTE_BIT
+        por       xmm6,xmm0             ; xmm6=Y
+        movdqa    XMMWORD [edi], xmm6   ; Save Y
+
+        pxor      xmm2,xmm2
+        pxor      xmm4,xmm4
+        punpcklwd xmm2,xmm3             ; xmm2=REL
+        punpckhwd xmm4,xmm3             ; xmm4=REH
+        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
+        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
+
+        movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+        paddd     xmm1,xmm2
+        paddd     xmm5,xmm4
+        paddd     xmm1,xmm0
+        paddd     xmm5,xmm0
+        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
+        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
+        packssdw  xmm1,xmm5             ; xmm1=CrE
+
+        psllw     xmm7,BYTE_BIT
+        por       xmm1,xmm7             ; xmm1=Cr
+        movdqa    XMMWORD [edx], xmm1   ; Save Cr
+
+        sub     ecx, byte SIZEOF_XMMWORD
+        add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+        add     edi, byte SIZEOF_XMMWORD                ; outptr0
+        add     ebx, byte SIZEOF_XMMWORD                ; outptr1
+        add     edx, byte SIZEOF_XMMWORD                ; outptr2
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     near .columnloop
+        test    ecx,ecx
+        jnz     near .column_ld1
+
+        pop     ecx                     ; col
+        pop     esi
+        pop     edi
+        pop     ebx
+        pop     edx
+        poppic  eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
+        add     edi, byte SIZEOF_JSAMPROW
+        add     ebx, byte SIZEOF_JSAMPROW
+        add     edx, byte SIZEOF_JSAMPROW
+        dec     eax                             ; num_rows
+        jg      near .rowloop
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jccolor-altivec.c b/simd/jccolor-altivec.c
new file mode 100644
index 0000000..04b8708
--- /dev/null
+++ b/simd/jccolor-altivec.c
@@ -0,0 +1,104 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+/* Fixed-point RGB->YCC weights: FIX(x) = x * 2^SCALEBITS, rounded. */
+#define F_0_081 5329                 /* FIX(0.08131) */
+#define F_0_114 7471                 /* FIX(0.11400) */
+#define F_0_168 11059                /* FIX(0.16874) */
+#define F_0_250 16384                /* FIX(0.25000) */
+#define F_0_299 19595                /* FIX(0.29900) */
+#define F_0_331 21709                /* FIX(0.33126) */
+#define F_0_418 27439                /* FIX(0.41869) */
+#define F_0_500 32768                /* FIX(0.50000) */
+#define F_0_587 38470                /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+/* NOTE(review): 38470 > 32767; presumably green is split to fit int16. */
+#define SCALEBITS 16                 /* fraction bits of the F_* values */
+#define ONE_HALF (1 << (SCALEBITS - 1))  /* 0.5 in the same fixed point */
+
+/* RGBG_INDEX*: byte offsets; by the names, R,G pairs then B,G pairs. */
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jccolext-altivec.c"  /* 1st pass: name not remapped in this file */
+#undef RGB_PIXELSIZE  /* presumably came from a header -- TODO confirm */
+/* Second pass reuses the four tables above under the EXT_RGB name. */
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE  /* stride 3; R=+0 G=+1 B=+2 */
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE  /* stride 4; R=+0 G=+1 B=+2 */
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE  /* stride 3; R=+2 G=+1 B=+0 */
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE  /* stride 4; R=+2 G=+1 B=+0 */
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE  /* stride 4; R=+3 G=+2 B=+1 */
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE  /* stride 4; R=+1 G=+2 B=+3 */
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/simd/jccolmmx.asm b/simd/jccolor-mmx.asm
similarity index 67%
rename from simd/jccolmmx.asm
rename to simd/jccolor-mmx.asm
index 1867abe..c5d3764 100644
--- a/simd/jccolmmx.asm
+++ b/simd/jccolor-mmx.asm
@@ -1,5 +1,5 @@
 ;
-; jccolmmx.asm - colorspace conversion (MMX)
+; jccolor-mmx.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2009 D. R. Commander
@@ -21,40 +21,40 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_081	equ	 5329			; FIX(0.08131)
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_168	equ	11059			; FIX(0.16874)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_331	equ	21709			; FIX(0.33126)
-F_0_418	equ	27439			; FIX(0.41869)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+F_0_081 equ      5329                   ; FIX(0.08131)
+F_0_114 equ      7471                   ; FIX(0.11400)
+F_0_168 equ     11059                   ; FIX(0.16874)
+F_0_250 equ     16384                   ; FIX(0.25000)
+F_0_299 equ     19595                   ; FIX(0.29900)
+F_0_331 equ     21709                   ; FIX(0.33126)
+F_0_418 equ     27439                   ; FIX(0.41869)
+F_0_587 equ     38470                   ; FIX(0.58700)
+F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE
+        alignz  16
+        global  EXTN(jconst_rgb_ycc_convert_mmx)
 
 EXTN(jconst_rgb_ycc_convert_mmx):
 
-PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
-PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331
-PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 2 dw  F_0_299, F_0_337
+PW_F0114_F0250  times 2 dw  F_0_114, F_0_250
+PW_MF016_MF033  times 2 dw -F_0_168,-F_0_331
+PW_MF008_MF041  times 2 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 2 dd  (1 << (SCALEBITS-1))
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jcclrmmx.asm"
+%include "jccolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -65,7 +65,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
-%include "jcclrmmx.asm"
+%include "jccolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -76,7 +76,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
-%include "jcclrmmx.asm"
+%include "jccolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -87,7 +87,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
-%include "jcclrmmx.asm"
+%include "jccolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -98,7 +98,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
-%include "jcclrmmx.asm"
+%include "jccolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -109,7 +109,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
-%include "jcclrmmx.asm"
+%include "jccolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -120,4 +120,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
-%include "jcclrmmx.asm"
+%include "jccolext-mmx.asm"
diff --git a/simd/jccolss2-64.asm b/simd/jccolor-sse2-64.asm
similarity index 66%
rename from simd/jccolss2-64.asm
rename to simd/jccolor-sse2-64.asm
index 6370293..55c7e12 100644
--- a/simd/jccolss2-64.asm
+++ b/simd/jccolor-sse2-64.asm
@@ -1,5 +1,5 @@
 ;
-; jccolss2-64.asm - colorspace conversion (64-bit SSE2)
+; jccolor-sse2-64.asm - colorspace conversion (64-bit SSE2)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,40 +18,40 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_081	equ	 5329			; FIX(0.08131)
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_168	equ	11059			; FIX(0.16874)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_331	equ	21709			; FIX(0.33126)
-F_0_418	equ	27439			; FIX(0.41869)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+F_0_081 equ      5329                   ; FIX(0.08131)
+F_0_114 equ      7471                   ; FIX(0.11400)
+F_0_168 equ     11059                   ; FIX(0.16874)
+F_0_250 equ     16384                   ; FIX(0.25000)
+F_0_299 equ     19595                   ; FIX(0.29900)
+F_0_331 equ     21709                   ; FIX(0.33126)
+F_0_418 equ     27439                   ; FIX(0.41869)
+F_0_587 equ     38470                   ; FIX(0.58700)
+F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_rgb_ycc_convert_sse2)
 
 EXTN(jconst_rgb_ycc_convert_sse2):
 
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
+        SECTION SEG_TEXT
+        BITS    64
 
-%include "jcclrss2-64.asm"
+%include "jccolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -62,7 +62,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jcclrss2-64.asm"
+%include "jccolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -73,7 +73,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jcclrss2-64.asm"
+%include "jccolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -84,7 +84,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jcclrss2-64.asm"
+%include "jccolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -95,7 +95,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jcclrss2-64.asm"
+%include "jccolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -106,7 +106,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jcclrss2-64.asm"
+%include "jccolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -117,4 +117,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jcclrss2-64.asm"
+%include "jccolext-sse2-64.asm"
diff --git a/simd/jccolss2.asm b/simd/jccolor-sse2.asm
similarity index 66%
rename from simd/jccolss2.asm
rename to simd/jccolor-sse2.asm
index abd6721..890e2a3 100644
--- a/simd/jccolss2.asm
+++ b/simd/jccolor-sse2.asm
@@ -1,5 +1,5 @@
 ;
-; jccolss2.asm - colorspace conversion (SSE2)
+; jccolor-sse2.asm - colorspace conversion (SSE2)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,40 +18,40 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_081	equ	 5329			; FIX(0.08131)
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_168	equ	11059			; FIX(0.16874)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_331	equ	21709			; FIX(0.33126)
-F_0_418	equ	27439			; FIX(0.41869)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+F_0_081 equ      5329                   ; FIX(0.08131)
+F_0_114 equ      7471                   ; FIX(0.11400)
+F_0_168 equ     11059                   ; FIX(0.16874)
+F_0_250 equ     16384                   ; FIX(0.25000)
+F_0_299 equ     19595                   ; FIX(0.29900)
+F_0_331 equ     21709                   ; FIX(0.33126)
+F_0_418 equ     27439                   ; FIX(0.41869)
+F_0_587 equ     38470                   ; FIX(0.58700)
+F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_rgb_ycc_convert_sse2)
 
 EXTN(jconst_rgb_ycc_convert_sse2):
 
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jcclrss2.asm"
+%include "jccolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -62,7 +62,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jcclrss2.asm"
+%include "jccolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -73,7 +73,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jcclrss2.asm"
+%include "jccolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -84,7 +84,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jcclrss2.asm"
+%include "jccolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -95,7 +95,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jcclrss2.asm"
+%include "jccolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -106,7 +106,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jcclrss2.asm"
+%include "jccolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -117,4 +117,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jcclrss2.asm"
+%include "jccolext-sse2.asm"
diff --git a/simd/jcgrass2.asm b/simd/jcgrass2.asm
deleted file mode 100644
index 4a32e66..0000000
--- a/simd/jcgrass2.asm
+++ /dev/null
@@ -1,113 +0,0 @@
-;
-; jcgrass2.asm - grayscale colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2011, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryss2.asm"
diff --git a/simd/jcgray-altivec.c b/simd/jcgray-altivec.c
new file mode 100644
index 0000000..b52fade
--- /dev/null
+++ b/simd/jcgray-altivec.c
@@ -0,0 +1,99 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+/* Fixed-point grayscale weights: FIX(x) = x * 2^SCALEBITS, rounded. */
+#define F_0_114 7471                 /* FIX(0.11400) */
+#define F_0_250 16384                /* FIX(0.25000) */
+#define F_0_299 19595                /* FIX(0.29900) */
+#define F_0_587 38470                /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+/* NOTE(review): 38470 > 32767; presumably green is split to fit int16. */
+#define SCALEBITS 16                 /* fraction bits of the F_* values */
+#define ONE_HALF (1 << (SCALEBITS - 1))  /* 0.5 in the same fixed point */
+
+/* RGBG_INDEX*: byte offsets; by the names, R,G pairs then B,G pairs. */
+#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
+#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
+#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
+#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#include "jcgryext-altivec.c"  /* 1st pass: name not remapped in this file */
+#undef RGB_PIXELSIZE  /* presumably came from a header -- TODO confirm */
+/* Second pass reuses the four tables above under the EXT_RGB name. */
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE  /* stride 3; R=+0 G=+1 B=+2 */
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE  /* stride 4; R=+0 G=+1 B=+2 */
+#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE  /* stride 3; R=+2 G=+1 B=+0 */
+#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
+#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
+#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
+#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE  /* stride 4; R=+2 G=+1 B=+0 */
+#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE  /* stride 4; R=+3 G=+2 B=+1 */
+#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE  /* stride 4; R=+1 G=+2 B=+3 */
+#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/simd/jcgrammx.asm b/simd/jcgray-mmx.asm
similarity index 74%
rename from simd/jcgrammx.asm
rename to simd/jcgray-mmx.asm
index 8553b23..b2708ad 100644
--- a/simd/jcgrammx.asm
+++ b/simd/jcgray-mmx.asm
@@ -1,5 +1,5 @@
 ;
-; jcgrammx.asm - grayscale colorspace conversion (MMX)
+; jcgray-mmx.asm - grayscale colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2011 D. R. Commander
@@ -21,33 +21,33 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ      7471                   ; FIX(0.11400)
+F_0_250 equ     16384                   ; FIX(0.25000)
+F_0_299 equ     19595                   ; FIX(0.29900)
+F_0_587 equ     38470                   ; FIX(0.58700)
+F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_rgb_gray_convert_mmx) PRIVATE
+        alignz  16
+        global  EXTN(jconst_rgb_gray_convert_mmx)
 
 EXTN(jconst_rgb_gray_convert_mmx):
 
-PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
-PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 2 dw  F_0_299, F_0_337
+PW_F0114_F0250  times 2 dw  F_0_114, F_0_250
+PD_ONEHALF      times 2 dd  (1 << (SCALEBITS-1))
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jcgrymmx.asm"
+%include "jcgryext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -58,7 +58,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
-%include "jcgrymmx.asm"
+%include "jcgryext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -69,7 +69,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
-%include "jcgrymmx.asm"
+%include "jcgryext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -80,7 +80,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
-%include "jcgrymmx.asm"
+%include "jcgryext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -91,7 +91,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
-%include "jcgrymmx.asm"
+%include "jcgryext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -102,7 +102,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
-%include "jcgrymmx.asm"
+%include "jcgryext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -113,4 +113,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
-%include "jcgrymmx.asm"
+%include "jcgryext-mmx.asm"
diff --git a/simd/jcgrass2-64.asm b/simd/jcgray-sse2-64.asm
similarity index 73%
copy from simd/jcgrass2-64.asm
copy to simd/jcgray-sse2-64.asm
index 7f025f9..dfc0577 100644
--- a/simd/jcgrass2-64.asm
+++ b/simd/jcgray-sse2-64.asm
@@ -1,5 +1,5 @@
 ;
-; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2)
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,33 +18,33 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ      7471                   ; FIX(0.11400)
+F_0_250 equ     16384                   ; FIX(0.25000)
+F_0_299 equ     19595                   ; FIX(0.29900)
+F_0_587 equ     38470                   ; FIX(0.58700)
+F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_rgb_gray_convert_sse2)
 
 EXTN(jconst_rgb_gray_convert_sse2):
 
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
+        SECTION SEG_TEXT
+        BITS    64
 
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -55,7 +55,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -66,7 +66,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -77,7 +77,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -88,7 +88,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -99,7 +99,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -110,4 +110,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2-64.asm"
diff --git a/simd/jcgrass2-64.asm b/simd/jcgray-sse2.asm
similarity index 73%
rename from simd/jcgrass2-64.asm
rename to simd/jcgray-sse2.asm
index 7f025f9..5fa7273 100644
--- a/simd/jcgrass2-64.asm
+++ b/simd/jcgray-sse2.asm
@@ -1,5 +1,5 @@
 ;
-; jcgrass2-64.asm - grayscale colorspace conversion (64-bit SSE2)
+; jcgray.asm - grayscale colorspace conversion (SSE2)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,33 +18,33 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_114	equ	 7471			; FIX(0.11400)
-F_0_250	equ	16384			; FIX(0.25000)
-F_0_299	equ	19595			; FIX(0.29900)
-F_0_587	equ	38470			; FIX(0.58700)
-F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ      7471                   ; FIX(0.11400)
+F_0_250 equ     16384                   ; FIX(0.25000)
+F_0_299 equ     19595                   ; FIX(0.29900)
+F_0_587 equ     38470                   ; FIX(0.58700)
+F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_rgb_gray_convert_sse2)
 
 EXTN(jconst_rgb_gray_convert_sse2):
 
-PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
-PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -55,7 +55,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -66,7 +66,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -77,7 +77,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -88,7 +88,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -99,7 +99,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -110,4 +110,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryss2-64.asm"
+%include "jcgryext-sse2.asm"
diff --git a/simd/jcgryext-altivec.c b/simd/jcgryext-altivec.c
new file mode 100644
index 0000000..c171615
--- /dev/null
+++ b/simd/jcgryext-altivec.c
@@ -0,0 +1,227 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014, Jay Foad.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-altivec.c, once per RGB pixel layout.  It
+ * converts rows of packed RGB(X) pixels to 8-bit luma, 16 pixels per pass. */
+
+void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
+                                     JSAMPARRAY input_buf,
+                                     JSAMPIMAGE output_buf,
+                                     JDIMENSION output_row, int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int pitch = img_width * RGB_PIXELSIZE, num_cols;  /* pitch: input bytes/row */
+#if __BIG_ENDIAN__
+  int offset;                   /* inptr misalignment within 16 bytes */
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+#endif
+  /* rgb1..rgb4 start zeroed because the loads that fill them are conditional */
+  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+    rgbg0, rgbg1, rgbg2, rgbg3, y;
+#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3 = {0};
+#endif
+#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
+  __vector unsigned char rgb4 = {0};
+#endif
+  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __vector unsigned short yl, yh;
+  __vector int y0, y1, y2, y3;
+
+  /* Constants: vec_msums() multiplier pairs, rounding term, permute indices */
+  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
+    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+  while (--num_rows >= 0) {     /* one iteration per input row */
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];         /* row of component plane 0 */
+    output_row++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
+         outptr += 16) {        /* num_cols counts remaining input bytes */
+
+#if __BIG_ENDIAN__
+      /* Load 16 pixels == 48 or 64 bytes */
+      offset = (size_t)inptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overread.  Since there is no way to
+           * read a partial AltiVec register, overread would occur on the last
+           * chunk of the last image row if the right edge is not on a 16-byte
+           * boundary.  It could also occur on other rows if the bytes per row
+           * is low enough.  Since we can't determine whether we're on the last
+           * image row, we have to assume every row is the last.
+           */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: load the covering vectors, then shift into place */
+          rgb0 = vec_ld(0, inptr);
+          if (bytes > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (bytes > 32)
+            rgb2 = vec_ld(32, inptr);
+          if (bytes > 48)
+            rgb3 = vec_ld(48, inptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            rgb4 = vec_ld(64, inptr);
+#endif
+          unaligned_shift_index = vec_lvsl(0, inptr);
+          rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+          rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
+#endif
+        }
+      } else {
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path: bounce through tmpbuf, as in the unaligned case */
+          memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
+          rgb0 = vec_ld(0, tmpbuf);
+          rgb1 = vec_ld(16, tmpbuf);
+          rgb2 = vec_ld(32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          rgb3 = vec_ld(48, tmpbuf);
+#endif
+        } else {
+          /* Fast path: inptr is 16-byte aligned (offset == 0) */
+          rgb0 = vec_ld(0, inptr);
+          if (num_cols > 16)
+            rgb1 = vec_ld(16, inptr);
+          if (num_cols > 32)
+            rgb2 = vec_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            rgb3 = vec_ld(48, inptr);
+#endif
+        }
+      }
+#else
+      /* Little endian: no alignment handling; loads past num_cols skipped */
+      rgb0 = vec_vsx_ld(0, inptr);
+      if (num_cols > 16)
+        rgb1 = vec_vsx_ld(16, inptr);
+      if (num_cols > 32)
+        rgb2 = vec_vsx_ld(32, inptr);
+#if RGB_PIXELSIZE == 4
+      if (num_cols > 48)
+        rgb3 = vec_vsx_ld(48, inptr);
+#endif
+#endif
+
+#if RGB_PIXELSIZE == 3
+      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
+      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
+      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
+      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
+#else
+      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       *
+       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+       */
+      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
+      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
+      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
+      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
+#endif
+
+      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+       * ...
+       *
+       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+       * support unsigned vectors.
+       */
+      rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
+      bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
+      rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
+      bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
+      rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
+      bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
+      rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
+      bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
+
+      /* (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      /* Calculate Y values: R/G terms plus ONE_HALF, then add the B/G terms */
+
+      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
+      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
+      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
+      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
+      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
+      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
+      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
+      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
+                    shift_pack_index);
+      yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
+                    shift_pack_index);
+      y = vec_pack(yl, yh);
+      vec_st(y, 0, outptr);     /* always stores a full 16-byte vector */
+    }
+  }
+}
diff --git a/simd/jcgryext-mmx.asm b/simd/jcgryext-mmx.asm
new file mode 100644
index 0000000..13b9600
--- /dev/null
+++ b/simd/jcgryext-mmx.asm
@@ -0,0 +1,357 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2011 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION img_width
+%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
+%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
+%define output_row(b)   (b)+20          ; JDIMENSION output_row
+%define num_rows(b)     (b)+24          ; int num_rows
+
+%define original_ebp    ebp+0           ; slot holding the pre-alignment esp
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2               ; number of mmword scratch slots
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+        align   16
+        global  EXTN(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):       ; RGB -> Y, SIZEOF_MMWORD pixels per pass
+        push    ebp
+        mov     eax,esp                         ; eax = esp after push ebp (arg base)
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; kept for the epilogue's "pop esp"
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area
+        pushpic eax             ; make room for GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
+        test    ecx,ecx
+        jz      near .return            ; zero-width image: nothing to do
+
+        push    ecx                     ; ecx is borrowed for output_row below
+
+        mov     esi, JSAMPIMAGE [output_buf(eax)]
+        mov     ecx, JDIMENSION [output_row(eax)]
+        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi = &output_buf[0][output_row]
+
+        pop     ecx                     ; ecx = num_cols again
+
+        mov     esi, JSAMPARRAY [input_buf(eax)]
+        mov     eax, INT [num_rows(eax)]
+        test    eax,eax
+        jle     near .return            ; num_rows <= 0: nothing to do
+        alignx  16,7
+.rowloop:
+        pushpic eax                     ; save remaining row count
+        push    edi
+        push    esi
+        push    ecx                     ; col
+
+        mov     esi, JSAMPROW [esi]     ; inptr
+        mov     edi, JSAMPROW [edi]     ; outptr0
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+        cmp     ecx, byte SIZEOF_MMWORD
+        jae     short .columnloop       ; a full group of pixels is available
+        alignx  16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:                            ; tail: fewer than SIZEOF_MMWORD pixels left
+        push    eax
+        push    edx
+        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
+        test    cl, SIZEOF_BYTE
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_BYTE
+        xor     eax,eax
+        mov     al, BYTE [esi+ecx]
+.column_ld2:
+        test    cl, SIZEOF_WORD
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_WORD
+        xor     edx,edx
+        mov     dx, WORD [esi+ecx]
+        shl     eax, WORD_BIT
+        or      eax,edx
+.column_ld4:
+        movd    mmA,eax
+        pop     edx
+        pop     eax
+        test    cl, SIZEOF_DWORD
+        jz      short .column_ld8
+        sub     ecx, byte SIZEOF_DWORD
+        movd    mmG, DWORD [esi+ecx]
+        psllq   mmA, DWORD_BIT
+        por     mmA,mmG
+.column_ld8:
+        test    cl, SIZEOF_MMWORD
+        jz      short .column_ld16
+        movq    mmG,mmA
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        mov     ecx, SIZEOF_MMWORD      ; process the tail as one full group
+        jmp     short .rgb_gray_cnv
+.column_ld16:
+        test    cl, 2*SIZEOF_MMWORD
+        mov     ecx, SIZEOF_MMWORD      ; (mov leaves the flags from test intact)
+        jz      short .rgb_gray_cnv
+        movq    mmF,mmA
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+        jmp     short .rgb_gray_cnv
+        alignx  16,7
+
+.columnloop:
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+        ; mmA=(00 10 20 01 11 21 02 12)
+        ; mmG=(22 03 13 23 04 14 24 05)
+        ; mmF=(15 25 06 16 26 07 17 27)
+
+        movq      mmD,mmA
+        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
+        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
+
+        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
+        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
+
+        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
+        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
+
+        movq      mmE,mmA
+        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
+        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
+
+        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
+        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
+
+        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
+        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
+
+        pxor      mmH,mmH               ; zero, used to widen bytes to words
+
+        movq      mmC,mmA
+        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
+        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
+
+        movq      mmB,mmE
+        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
+        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
+
+        movq      mmF,mmD
+        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
+        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:                            ; tail: fewer than SIZEOF_MMWORD pixels left
+        test    cl, SIZEOF_MMWORD/8
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_MMWORD/8
+        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+        test    cl, SIZEOF_MMWORD/4
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_MMWORD/4
+        movq    mmF,mmA
+        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+        test    cl, SIZEOF_MMWORD/2
+        mov     ecx, SIZEOF_MMWORD      ; (mov leaves the flags from test intact)
+        jz      short .rgb_gray_cnv
+        movq    mmD,mmA
+        movq    mmC,mmF
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+        jmp     short .rgb_gray_cnv
+        alignx  16,7
+
+.columnloop:
+        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+        ; mmA=(00 10 20 30 01 11 21 31)
+        ; mmF=(02 12 22 32 03 13 23 33)
+        ; mmD=(04 14 24 34 05 15 25 35)
+        ; mmC=(06 16 26 36 07 17 27 37)
+
+        movq      mmB,mmA
+        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
+        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
+
+        movq      mmG,mmD
+        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
+        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
+
+        movq      mmE,mmA
+        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
+        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
+
+        movq      mmH,mmB
+        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
+        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
+
+        pxor      mmF,mmF               ; zero, used to widen bytes to words
+
+        movq      mmC,mmA
+        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
+        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
+
+        movq      mmD,mmB
+        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
+        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
+
+        movq      mmG,mmE
+        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
+        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
+
+        punpcklbw mmF,mmH
+        punpckhbw mmH,mmH
+        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
+        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+        ; (Original)
+        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+        ;
+        ; (This implementation)
+        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+        movq      mm6,mm1
+        punpcklwd mm1,mm3
+        punpckhwd mm6,mm3
+        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        movq      mm7, mm6      ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        movq      mm6,mm0
+        punpcklwd mm0,mm2
+        punpckhwd mm6,mm2
+        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        movq      MMWORD [wk(0)], mm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+        movq      MMWORD [wk(1)], mm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        movq      mm0, mm5      ; mm0=BO
+        movq      mm6, mm4      ; mm6=BE
+
+        movq      mm4,mm0
+        punpcklwd mm0,mm3
+        punpckhwd mm4,mm3
+        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+        paddd     mm0, mm1
+        paddd     mm4, mm7
+        paddd     mm0,mm3
+        paddd     mm4,mm3
+        psrld     mm0,SCALEBITS         ; mm0=YOL
+        psrld     mm4,SCALEBITS         ; mm4=YOH
+        packssdw  mm0,mm4               ; mm0=YO
+
+        movq      mm4,mm6
+        punpcklwd mm6,mm2
+        punpckhwd mm4,mm2
+        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
+
+        paddd     mm6, MMWORD [wk(0)]
+        paddd     mm4, MMWORD [wk(1)]
+        paddd     mm6,mm2
+        paddd     mm4,mm2
+        psrld     mm6,SCALEBITS         ; mm6=YEL
+        psrld     mm4,SCALEBITS         ; mm4=YEH
+        packssdw  mm6,mm4               ; mm6=YE
+
+        psllw     mm0,BYTE_BIT          ; move odd samples into the high bytes
+        por       mm6,mm0               ; mm6=Y
+        movq      MMWORD [edi], mm6     ; Save Y (a full MMWORD, even for a tail)
+
+        sub     ecx, byte SIZEOF_MMWORD
+        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
+        add     edi, byte SIZEOF_MMWORD                 ; outptr0
+        cmp     ecx, byte SIZEOF_MMWORD
+        jae     near .columnloop
+        test    ecx,ecx
+        jnz     near .column_ld1        ; leftover columns: take the tail path
+
+        pop     ecx                     ; col
+        pop     esi
+        pop     edi
+        poppic  eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
+        add     edi, byte SIZEOF_JSAMPROW
+        dec     eax                             ; num_rows
+        jg      near .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcgryext-sse2-64.asm b/simd/jcgryext-sse2-64.asm
new file mode 100644
index 0000000..82c0fc8
--- /dev/null
+++ b/simd/jcgryext-sse2-64.asm
@@ -0,0 +1,364 @@
+;
+; jcgryext-sse2-64.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2011, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
+;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                              JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+; NOTE(review): presumably set up by collect_args (defined outside this file) -- confirm
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+
+        global  EXTN(jsimd_rgb_gray_convert_sse2)
+; Frame: rbp is XMMWORD-aligned, [rbp] holds the pre-alignment rsp, wk(0..WK_NUM-1) lie just below rbp.
+EXTN(jsimd_rgb_gray_convert_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax -> saved caller rbp (rsp after the push)
+        sub     rsp, byte 4                     ; move below the saved rbp before aligning
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; remember the unaligned rsp at the frame base
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; reserve the wk[] scratch area below rbp
+        collect_args
+        push    rbx
+
+        mov     ecx, r10d                       ; ecx = img_width (a 32-bit write clears the top half of rcx)
+        test    rcx,rcx
+        jz      near .return                    ; zero-width image: nothing to convert
+
+        push    rcx                             ; park img_width while rcx holds output_row
+
+        mov rsi, r12                            ; rsi = output_buf
+        mov ecx, r13d                           ; ecx = output_row
+        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]       ; rdi = output_buf[0]
+        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]
+
+        pop     rcx                             ; rcx = img_width again
+
+        mov rsi, r11                            ; rsi = input_buf
+        mov     eax, r14d                       ; eax = num_rows (zero-extended into rax)
+        test    eax,eax                         ; signed 32-bit test: a 64-bit test could never see a negative int
+        jle     near .return                    ; zero or negative row count: nothing to do
+.rowloop:
+        push    rdi
+        push    rsi
+        push    rcx                     ; col
+
+        mov     rsi, JSAMPROW [rsi]     ; inptr
+        mov     rdi, JSAMPROW [rdi]     ; outptr0
+
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     near .columnloop        ; at least SIZEOF_XMMWORD columns left: full-width load
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+        push    rax                     ; rax/rdx are scratch for the byte/word loads below
+        push    rdx
+        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
+        test    cl, SIZEOF_BYTE
+        jz      short .column_ld2
+        sub     rcx, byte SIZEOF_BYTE
+        movzx   rax, BYTE [rsi+rcx]     ; pick up the trailing odd byte
+.column_ld2:
+        test    cl, SIZEOF_WORD
+        jz      short .column_ld4
+        sub     rcx, byte SIZEOF_WORD
+        movzx   rdx, WORD [rsi+rcx]     ; pick up a trailing 2-byte piece
+        shl     rax, WORD_BIT
+        or      rax,rdx                 ; earlier byte sits above the newly loaded word
+.column_ld4:
+        movd    xmmA,eax                ; low dword of xmmA = bytes gathered so far
+        pop     rdx
+        pop     rax
+        test    cl, SIZEOF_DWORD
+        jz      short .column_ld8
+        sub     rcx, byte SIZEOF_DWORD
+        movd    xmmF, XMM_DWORD [rsi+rcx]
+        pslldq  xmmA, SIZEOF_DWORD      ; shift gathered bytes up; the new dword goes below them
+        por     xmmA,xmmF
+.column_ld8:
+        test    cl, SIZEOF_MMWORD
+        jz      short .column_ld16
+        sub     rcx, byte SIZEOF_MMWORD
+        movq    xmmB, XMM_MMWORD [rsi+rcx]
+        pslldq  xmmA, SIZEOF_MMWORD     ; same trick for an 8-byte piece
+        por     xmmA,xmmB
+.column_ld16:
+        test    cl, SIZEOF_XMMWORD
+        jz      short .column_ld32
+        movdqa  xmmF,xmmA               ; partial data becomes the 2nd input vector
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        mov     rcx, SIZEOF_XMMWORD     ; makes the loop tail count this as the final pass
+        jmp     short .rgb_gray_cnv
+.column_ld32:
+        test    cl, 2*SIZEOF_XMMWORD
+        mov     rcx, SIZEOF_XMMWORD     ; mov does not touch the flags set by the test above
+        jz      short .rgb_gray_cnv
+        movdqa  xmmB,xmmA               ; partial data becomes the 3rd input vector
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_gray_cnv
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        movdqa    xmmG,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+        movdqa    xmmD,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+        movdqa    xmmE,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+        pxor      xmmH,xmmH     ; xmmH = 0, used to zero-extend bytes to words
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmB,xmmE
+        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+        movdqa    xmmF,xmmD
+        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+        test    cl, SIZEOF_XMMWORD/16
+        jz      short .column_ld2
+        sub     rcx, byte SIZEOF_XMMWORD/16
+        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] ; pick up the trailing dword
+.column_ld2:
+        test    cl, SIZEOF_XMMWORD/8
+        jz      short .column_ld4
+        sub     rcx, byte SIZEOF_XMMWORD/8
+        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+        pslldq  xmmA, SIZEOF_MMWORD     ; shift gathered bytes up; the new qword goes below them
+        por     xmmA,xmmE
+.column_ld4:
+        test    cl, SIZEOF_XMMWORD/4
+        jz      short .column_ld8
+        sub     rcx, byte SIZEOF_XMMWORD/4
+        movdqa  xmmE,xmmA               ; partial data becomes the 2nd input vector
+        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+        test    cl, SIZEOF_XMMWORD/2
+        mov     rcx, SIZEOF_XMMWORD     ; mov does not touch the flags set by the test above
+        jz      short .rgb_gray_cnv
+        movdqa  xmmF,xmmA               ; data gathered so far becomes the 3rd and 4th vectors
+        movdqa  xmmH,xmmE
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_gray_cnv
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+        movdqa    xmmC,xmmF
+        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+        movdqa    xmmB,xmmA
+        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+        movdqa    xmmG,xmmD
+        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+        movdqa    xmmE,xmmA
+        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+        movdqa    xmmH,xmmB
+        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+        pxor      xmmF,xmmF     ; xmmF = 0, used to zero-extend bytes to words
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmD,xmmB
+        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+        movdqa    xmmG,xmmE
+        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+        punpcklbw xmmF,xmmH     ; xmmF is zero here, so xmmH bytes land in the high byte of each word
+        punpckhbw xmmH,xmmH
+        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+        ; (Original)
+        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+        ;
+        ; (This implementation)
+        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+        movdqa    xmm6,xmm1
+        punpcklwd xmm1,xmm3
+        punpckhwd xmm6,xmm3
+        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        movdqa    xmm6,xmm0
+        punpcklwd xmm0,xmm2
+        punpckhwd xmm6,xmm2
+        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        movdqa    xmm0, xmm5    ; xmm0=BO
+        movdqa    xmm6, xmm4    ; xmm6=BE
+
+        movdqa    xmm4,xmm0
+        punpcklwd xmm0,xmm3
+        punpckhwd xmm4,xmm3
+        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+        paddd     xmm0, xmm1    ; add the R/G partial sums for the odd columns
+        paddd     xmm4, xmm7
+        paddd     xmm0,xmm3     ; add PD_ONEHALF before the shift
+        paddd     xmm4,xmm3
+        psrld     xmm0,SCALEBITS        ; xmm0=YOL
+        psrld     xmm4,SCALEBITS        ; xmm4=YOH
+        packssdw  xmm0,xmm4             ; xmm0=YO
+
+        movdqa    xmm4,xmm6
+        punpcklwd xmm6,xmm2
+        punpckhwd xmm4,xmm2
+        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+        paddd     xmm6, XMMWORD [wk(0)] ; add the R/G partial sums saved for the even columns
+        paddd     xmm4, XMMWORD [wk(1)]
+        paddd     xmm6,xmm2
+        paddd     xmm4,xmm2
+        psrld     xmm6,SCALEBITS        ; xmm6=YEL
+        psrld     xmm4,SCALEBITS        ; xmm4=YEH
+        packssdw  xmm6,xmm4             ; xmm6=YE
+
+        psllw     xmm0,BYTE_BIT         ; odd-column Y moves to the high byte of each word
+        por       xmm6,xmm0             ; xmm6=Y
+        movdqa    XMMWORD [rdi], xmm6   ; Save Y (movdqa: rdi must be XMMWORD-aligned)
+
+        sub     rcx, byte SIZEOF_XMMWORD
+        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     near .columnloop
+        test    rcx,rcx
+        jnz     near .column_ld1        ; fewer than SIZEOF_XMMWORD columns left: partial load
+
+        pop     rcx                     ; col
+        pop     rsi
+        pop     rdi
+
+        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
+        add     rdi, byte SIZEOF_JSAMPROW
+        dec     rax                             ; num_rows
+        jg      near .rowloop
+
+.return:
+        pop     rbx
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- unaligned rsp saved at entry (points at the caller's rbp)
+        pop     rbp             ; restore the caller's rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcgryext-sse2.asm b/simd/jcgryext-sse2.asm
new file mode 100644
index 0000000..1097b29
--- /dev/null
+++ b/simd/jcgryext-sse2.asm
@@ -0,0 +1,383 @@
+;
+; jcgryext-sse2.asm - grayscale colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2011, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
+;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                              JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION img_width
+%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
+%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
+%define output_row(b)   (b)+20          ; JDIMENSION output_row
+%define num_rows(b)     (b)+24          ; int num_rows
+
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+        align   16
+
+        global  EXTN(jsimd_rgb_gray_convert_sse2)
+; Frame: ebp is XMMWORD-aligned, [ebp] holds the pre-alignment esp, wk(0..WK_NUM-1) lie just below ebp.
+EXTN(jsimd_rgb_gray_convert_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax -> saved caller ebp (esp after the push)
+        sub     esp, byte 4                     ; move below the saved ebp before aligning
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; remember the unaligned esp at the frame base
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area below ebp
+        pushpic eax             ; make a room for GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [img_width(eax)]        ; arguments are addressed relative to eax (set at entry)
+        test    ecx,ecx
+        jz      near .return                    ; zero-width image: nothing to convert
+
+        push    ecx                             ; park img_width while ecx holds output_row
+
+        mov     esi, JSAMPIMAGE [output_buf(eax)]
+        mov     ecx, JDIMENSION [output_row(eax)]
+        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]       ; edi = output_buf[0]
+        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi = &output_buf[0][output_row]
+
+        pop     ecx                             ; ecx = img_width again
+
+        mov     esi, JSAMPARRAY [input_buf(eax)]
+        mov     eax, INT [num_rows(eax)]        ; last argument read; eax becomes the row counter
+        test    eax,eax
+        jle     near .return                    ; zero or negative row count: nothing to do
+        alignx  16,7
+.rowloop:
+        pushpic eax
+        push    edi
+        push    esi
+        push    ecx                     ; col
+
+        mov     esi, JSAMPROW [esi]     ; inptr
+        mov     edi, JSAMPROW [edi]     ; outptr0
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     near .columnloop        ; at least SIZEOF_XMMWORD columns left: full-width load
+        alignx  16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+        push    eax                     ; eax/edx are scratch for the byte/word loads below
+        push    edx
+        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
+        test    cl, SIZEOF_BYTE
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_BYTE
+        movzx   eax, BYTE [esi+ecx]     ; pick up the trailing odd byte
+.column_ld2:
+        test    cl, SIZEOF_WORD
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_WORD
+        movzx   edx, WORD [esi+ecx]     ; pick up a trailing 2-byte piece
+        shl     eax, WORD_BIT
+        or      eax,edx                 ; earlier byte sits above the newly loaded word
+.column_ld4:
+        movd    xmmA,eax                ; low dword of xmmA = bytes gathered so far
+        pop     edx
+        pop     eax
+        test    cl, SIZEOF_DWORD
+        jz      short .column_ld8
+        sub     ecx, byte SIZEOF_DWORD
+        movd    xmmF, XMM_DWORD [esi+ecx]
+        pslldq  xmmA, SIZEOF_DWORD      ; shift gathered bytes up; the new dword goes below them
+        por     xmmA,xmmF
+.column_ld8:
+        test    cl, SIZEOF_MMWORD
+        jz      short .column_ld16
+        sub     ecx, byte SIZEOF_MMWORD
+        movq    xmmB, XMM_MMWORD [esi+ecx]
+        pslldq  xmmA, SIZEOF_MMWORD     ; same trick for an 8-byte piece
+        por     xmmA,xmmB
+.column_ld16:
+        test    cl, SIZEOF_XMMWORD
+        jz      short .column_ld32
+        movdqa  xmmF,xmmA               ; partial data becomes the 2nd input vector
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        mov     ecx, SIZEOF_XMMWORD     ; makes the loop tail count this as the final pass
+        jmp     short .rgb_gray_cnv
+.column_ld32:
+        test    cl, 2*SIZEOF_XMMWORD
+        mov     ecx, SIZEOF_XMMWORD     ; mov does not touch the flags set by the test above
+        jz      short .rgb_gray_cnv
+        movdqa  xmmB,xmmA               ; partial data becomes the 3rd input vector
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_gray_cnv
+        alignx  16,7
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        movdqa    xmmG,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+        movdqa    xmmD,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+        movdqa    xmmE,xmmA
+        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+        pxor      xmmH,xmmH     ; xmmH = 0, used to zero-extend bytes to words
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmB,xmmE
+        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+        movdqa    xmmF,xmmD
+        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+        test    cl, SIZEOF_XMMWORD/16
+        jz      short .column_ld2
+        sub     ecx, byte SIZEOF_XMMWORD/16
+        movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] ; pick up the trailing dword
+.column_ld2:
+        test    cl, SIZEOF_XMMWORD/8
+        jz      short .column_ld4
+        sub     ecx, byte SIZEOF_XMMWORD/8
+        movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+        pslldq  xmmA, SIZEOF_MMWORD     ; shift gathered bytes up; the new qword goes below them
+        por     xmmA,xmmE
+.column_ld4:
+        test    cl, SIZEOF_XMMWORD/4
+        jz      short .column_ld8
+        sub     ecx, byte SIZEOF_XMMWORD/4
+        movdqa  xmmE,xmmA               ; partial data becomes the 2nd input vector
+        movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+        test    cl, SIZEOF_XMMWORD/2
+        mov     ecx, SIZEOF_XMMWORD     ; mov does not touch the flags set by the test above
+        jz      short .rgb_gray_cnv
+        movdqa  xmmF,xmmA               ; data gathered so far becomes the 3rd and 4th vectors
+        movdqa  xmmH,xmmE
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        jmp     short .rgb_gray_cnv
+        alignx  16,7
+
+.columnloop:
+        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+        movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+        movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+        movdqa    xmmC,xmmF
+        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+        movdqa    xmmB,xmmA
+        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+        movdqa    xmmG,xmmD
+        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+        movdqa    xmmE,xmmA
+        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+        movdqa    xmmH,xmmB
+        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+        pxor      xmmF,xmmF     ; xmmF = 0, used to zero-extend bytes to words
+
+        movdqa    xmmC,xmmA
+        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+        movdqa    xmmD,xmmB
+        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+        movdqa    xmmG,xmmE
+        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+        punpcklbw xmmF,xmmH     ; xmmF is zero here, so xmmH bytes land in the high byte of each word
+        punpckhbw xmmH,xmmH
+        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+        ; (Original)
+        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+        ;
+        ; (This implementation)
+        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+        movdqa    xmm6,xmm1
+        punpcklwd xmm1,xmm3
+        punpckhwd xmm6,xmm3
+        pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+        movdqa    xmm6,xmm0
+        punpcklwd xmm0,xmm2
+        punpckhwd xmm6,xmm2
+        pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+        movdqa    xmm0, xmm5    ; xmm0=BO
+        movdqa    xmm6, xmm4    ; xmm6=BE
+
+        movdqa    xmm4,xmm0
+        punpcklwd xmm0,xmm3
+        punpckhwd xmm4,xmm3
+        pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+        movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+        paddd     xmm0, xmm1    ; add the R/G partial sums for the odd columns
+        paddd     xmm4, xmm7
+        paddd     xmm0,xmm3     ; add PD_ONEHALF before the shift
+        paddd     xmm4,xmm3
+        psrld     xmm0,SCALEBITS        ; xmm0=YOL
+        psrld     xmm4,SCALEBITS        ; xmm4=YOH
+        packssdw  xmm0,xmm4             ; xmm0=YO
+
+        movdqa    xmm4,xmm6
+        punpcklwd xmm6,xmm2
+        punpckhwd xmm4,xmm2
+        pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+        movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+        paddd     xmm6, XMMWORD [wk(0)] ; add the R/G partial sums saved for the even columns
+        paddd     xmm4, XMMWORD [wk(1)]
+        paddd     xmm6,xmm2
+        paddd     xmm4,xmm2
+        psrld     xmm6,SCALEBITS        ; xmm6=YEL
+        psrld     xmm4,SCALEBITS        ; xmm4=YEH
+        packssdw  xmm6,xmm4             ; xmm6=YE
+
+        psllw     xmm0,BYTE_BIT         ; odd-column Y moves to the high byte of each word
+        por       xmm6,xmm0             ; xmm6=Y
+        movdqa    XMMWORD [edi], xmm6   ; Save Y (movdqa: edi must be XMMWORD-aligned)
+
+        sub     ecx, byte SIZEOF_XMMWORD
+        add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+        add     edi, byte SIZEOF_XMMWORD                ; outptr0
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     near .columnloop
+        test    ecx,ecx
+        jnz     near .column_ld1        ; fewer than SIZEOF_XMMWORD columns left: partial load
+
+        pop     ecx                     ; col
+        pop     esi
+        pop     edi
+        poppic  eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
+        add     edi, byte SIZEOF_JSAMPROW
+        dec     eax                             ; num_rows
+        jg      near .rowloop
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- unaligned esp saved at entry (points at the caller's ebp)
+        pop     ebp             ; restore the caller's ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcgrymmx.asm b/simd/jcgrymmx.asm
deleted file mode 100644
index c85a5cb..0000000
--- a/simd/jcgrymmx.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-; jcgrymmx.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2011 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE
-
-EXTN(jsimd_rgb_gray_convert_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	pushpic	eax
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	short .columnloop
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	eax
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	xor	eax,eax
-	mov	al, BYTE [esi+ecx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	xor	edx,edx
-	mov	dx, WORD [esi+ecx]
-	shl	eax, WORD_BIT
-	or	eax,edx
-.column_ld4:
-	movd	mmA,eax
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	mmG, DWORD [esi+ecx]
-	psllq	mmA, DWORD_BIT
-	por	mmA,mmG
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	movq	mmG,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	mov	ecx, SIZEOF_MMWORD
-	jmp	short .rgb_gray_cnv
-.column_ld16:
-	test	cl, 2*SIZEOF_MMWORD
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_gray_cnv
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-	; mmA=(00 10 20 01 11 21 02 12)
-	; mmG=(22 03 13 23 04 14 24 05)
-	; mmF=(15 25 06 16 26 07 17 27)
-
-	movq      mmD,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
-	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
-
-	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
-	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
-
-	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
-	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
-
-	movq      mmE,mmA
-	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
-	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
-
-	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
-
-	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
-	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
-
-	pxor      mmH,mmH
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
-
-	movq      mmB,mmE
-	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
-	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
-
-	movq      mmF,mmD
-	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
-	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_MMWORD/8
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_MMWORD/8
-	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_MMWORD/4
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_MMWORD/4
-	movq	mmF,mmA
-	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-	test	cl, SIZEOF_MMWORD/2
-	mov	ecx, SIZEOF_MMWORD
-	jz	short .rgb_gray_cnv
-	movq	mmD,mmA
-	movq	mmC,mmF
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-	; mmA=(00 10 20 30 01 11 21 31)
-	; mmF=(02 12 22 32 03 13 23 33)
-	; mmD=(04 14 24 34 05 15 25 35)
-	; mmC=(06 16 26 36 07 17 27 37)
-
-	movq      mmB,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
-	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
-
-	movq      mmG,mmD
-	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
-	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
-
-	movq      mmE,mmA
-	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
-	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
-
-	movq      mmH,mmB
-	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
-	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
-
-	pxor      mmF,mmF
-
-	movq      mmC,mmA
-	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
-	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
-
-	movq      mmD,mmB
-	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
-	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
-
-	movq      mmG,mmE
-	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
-	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
-
-	punpcklbw mmF,mmH
-	punpckhbw mmH,mmH
-	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
-	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-	movq      mm6,mm1
-	punpcklwd mm1,mm3
-	punpckhwd mm6,mm3
-	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movq      mm7, mm6	; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movq      mm6,mm0
-	punpcklwd mm0,mm2
-	punpckhwd mm6,mm2
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movq      MMWORD [wk(0)], mm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movq      MMWORD [wk(1)], mm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movq      mm0, mm5	; mm0=BO
-	movq      mm6, mm4	; mm6=BE
-
-	movq      mm4,mm0
-	punpcklwd mm0,mm3
-	punpckhwd mm4,mm3
-	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
-
-	paddd     mm0, mm1
-	paddd     mm4, mm7
-	paddd     mm0,mm3
-	paddd     mm4,mm3
-	psrld     mm0,SCALEBITS		; mm0=YOL
-	psrld     mm4,SCALEBITS		; mm4=YOH
-	packssdw  mm0,mm4		; mm0=YO
-
-	movq      mm4,mm6
-	punpcklwd mm6,mm2
-	punpckhwd mm4,mm2
-	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
-
-	paddd     mm6, MMWORD [wk(0)]
-	paddd     mm4, MMWORD [wk(1)]
-	paddd     mm6,mm2
-	paddd     mm4,mm2
-	psrld     mm6,SCALEBITS		; mm6=YEL
-	psrld     mm4,SCALEBITS		; mm4=YEH
-	packssdw  mm6,mm4		; mm6=YE
-
-	psllw     mm0,BYTE_BIT
-	por       mm6,mm0		; mm6=Y
-	movq      MMWORD [edi], mm6	; Save Y
-
-	sub	ecx, byte SIZEOF_MMWORD
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
-	add	edi, byte SIZEOF_MMWORD			; outptr0
-	cmp	ecx, byte SIZEOF_MMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .column_ld1
-
-	pop	ecx			; col
-	pop	esi
-	pop	edi
-	poppic	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcgryss2-64.asm b/simd/jcgryss2-64.asm
deleted file mode 100644
index 103a920..0000000
--- a/simd/jcgryss2-64.asm
+++ /dev/null
@@ -1,364 +0,0 @@
-;
-; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2011, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-
-	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	ecx, r10d
-	test	rcx,rcx
-	jz	near .return
-
-	push	rcx
-
-	mov rsi, r12
-	mov ecx, r13d
-	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-
-	pop	rcx
-
-	mov rsi, r11
-	mov	eax, r14d
-	test	rax,rax
-	jle	near .return
-.rowloop:
-	push	rdi
-	push	rsi
-	push	rcx			; col
-
-	mov	rsi, JSAMPROW [rsi]	; inptr
-	mov	rdi, JSAMPROW [rdi]	; outptr0
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	rax
-	push	rdx
-	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_BYTE
-	movzx	rax, BYTE [rsi+rcx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_WORD
-	movzx	rdx, WORD [rsi+rcx]
-	shl	rax, WORD_BIT
-	or	rax,rdx
-.column_ld4:
-	movd	xmmA,eax
-	pop	rdx
-	pop	rax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_DWORD
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	rcx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [rsi+rcx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	mov	rcx, SIZEOF_XMMWORD
-	jmp	short .rgb_gray_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	rcx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmB,xmmA
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_XMMWORD/16
-	jz	short .column_ld2
-	sub	rcx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	rcx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA
-	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	rcx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmF,xmmA
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH
-	punpckhbw xmmH,xmmH
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3
-	punpckhwd xmm6,xmm3
-	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm7, xmm6	; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2
-	punpckhwd xmm6,xmm2
-	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(1)], xmm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    xmm0, xmm5	; xmm0=BO
-	movdqa    xmm6, xmm4	; xmm6=BE
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm4,xmm3
-	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, xmm1
-	paddd     xmm4, xmm7
-	paddd     xmm0,xmm3
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2
-	punpckhwd xmm4,xmm2
-	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(0)]
-	paddd     xmm4, XMMWORD [wk(1)]
-	paddd     xmm6,xmm2
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [rdi], xmm6	; Save Y
-
-	sub	rcx, byte SIZEOF_XMMWORD
-	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte SIZEOF_XMMWORD		; outptr0
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	rcx,rcx
-	jnz	near .column_ld1
-
-	pop	rcx			; col
-	pop	rsi
-	pop	rdi
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
-	add	rdi, byte SIZEOF_JSAMPROW
-	dec	rax				; num_rows
-	jg	near .rowloop
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcgryss2.asm b/simd/jcgryss2.asm
deleted file mode 100644
index 53d5f94..0000000
--- a/simd/jcgryss2.asm
+++ /dev/null
@@ -1,383 +0,0 @@
-;
-; jcgryss2.asm - grayscale colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; Copyright (C) 2011, D. R. Commander.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION img_width
-%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
-%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
-%define output_row(b)	(b)+20		; JDIMENSION output_row
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-
-	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [img_width(eax)]
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	esi, JSAMPIMAGE [output_buf(eax)]
-	mov	ecx, JDIMENSION [output_row(eax)]
-	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	esi, JSAMPARRAY [input_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	pushpic	eax
-	push	edi
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr0
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	alignx	16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-	push	eax
-	push	edx
-	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
-	test	cl, SIZEOF_BYTE
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_BYTE
-	movzx	eax, BYTE [esi+ecx]
-.column_ld2:
-	test	cl, SIZEOF_WORD
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_WORD
-	movzx	edx, WORD [esi+ecx]
-	shl	eax, WORD_BIT
-	or	eax,edx
-.column_ld4:
-	movd	xmmA,eax
-	pop	edx
-	pop	eax
-	test	cl, SIZEOF_DWORD
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_DWORD
-	movd	xmmF, XMM_DWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_DWORD
-	por	xmmA,xmmF
-.column_ld8:
-	test	cl, SIZEOF_MMWORD
-	jz	short .column_ld16
-	sub	ecx, byte SIZEOF_MMWORD
-	movq	xmmB, XMM_MMWORD [esi+ecx]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmB
-.column_ld16:
-	test	cl, SIZEOF_XMMWORD
-	jz	short .column_ld32
-	movdqa	xmmF,xmmA
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	mov	ecx, SIZEOF_XMMWORD
-	jmp	short .rgb_gray_cnv
-.column_ld32:
-	test	cl, 2*SIZEOF_XMMWORD
-	mov	ecx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmB,xmmA
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	movdqa    xmmG,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-	movdqa    xmmD,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-	movdqa    xmmE,xmmA
-	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-	pxor      xmmH,xmmH
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmB,xmmE
-	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-	movdqa    xmmF,xmmD
-	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-	test	cl, SIZEOF_XMMWORD/16
-	jz	short .column_ld2
-	sub	ecx, byte SIZEOF_XMMWORD/16
-	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-	test	cl, SIZEOF_XMMWORD/8
-	jz	short .column_ld4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-	pslldq	xmmA, SIZEOF_MMWORD
-	por	xmmA,xmmE
-.column_ld4:
-	test	cl, SIZEOF_XMMWORD/4
-	jz	short .column_ld8
-	sub	ecx, byte SIZEOF_XMMWORD/4
-	movdqa	xmmE,xmmA
-	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-	test	cl, SIZEOF_XMMWORD/2
-	mov	ecx, SIZEOF_XMMWORD
-	jz	short .rgb_gray_cnv
-	movdqa	xmmF,xmmA
-	movdqa	xmmH,xmmE
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	jmp	short .rgb_gray_cnv
-	alignx	16,7
-
-.columnloop:
-	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-	movdqa    xmmC,xmmF
-	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-	movdqa    xmmB,xmmA
-	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-	movdqa    xmmG,xmmD
-	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-	movdqa    xmmE,xmmA
-	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-	movdqa    xmmH,xmmB
-	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-	pxor      xmmF,xmmF
-
-	movdqa    xmmC,xmmA
-	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
-	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-	movdqa    xmmD,xmmB
-	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
-	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-	movdqa    xmmG,xmmE
-	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
-	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-	punpcklbw xmmF,xmmH
-	punpckhbw xmmH,xmmH
-	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
-	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-	; (Original)
-	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-	;
-	; (This implementation)
-	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-	movdqa    xmm6,xmm1
-	punpcklwd xmm1,xmm3
-	punpckhwd xmm6,xmm3
-	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm7, xmm6	; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-	movdqa    xmm6,xmm0
-	punpcklwd xmm0,xmm2
-	punpckhwd xmm6,xmm2
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-	movdqa    XMMWORD [wk(1)], xmm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-	movdqa    xmm0, xmm5	; xmm0=BO
-	movdqa    xmm6, xmm4	; xmm6=BE
-
-	movdqa    xmm4,xmm0
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm4,xmm3
-	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
-
-	paddd     xmm0, xmm1
-	paddd     xmm4, xmm7
-	paddd     xmm0,xmm3
-	paddd     xmm4,xmm3
-	psrld     xmm0,SCALEBITS	; xmm0=YOL
-	psrld     xmm4,SCALEBITS	; xmm4=YOH
-	packssdw  xmm0,xmm4		; xmm0=YO
-
-	movdqa    xmm4,xmm6
-	punpcklwd xmm6,xmm2
-	punpckhwd xmm4,xmm2
-	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
-
-	paddd     xmm6, XMMWORD [wk(0)]
-	paddd     xmm4, XMMWORD [wk(1)]
-	paddd     xmm6,xmm2
-	paddd     xmm4,xmm2
-	psrld     xmm6,SCALEBITS	; xmm6=YEL
-	psrld     xmm4,SCALEBITS	; xmm4=YEH
-	packssdw  xmm6,xmm4		; xmm6=YE
-
-	psllw     xmm0,BYTE_BIT
-	por       xmm6,xmm0		; xmm6=Y
-	movdqa    XMMWORD [edi], xmm6	; Save Y
-
-	sub	ecx, byte SIZEOF_XMMWORD
-	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
-	add	edi, byte SIZEOF_XMMWORD		; outptr0
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .column_ld1
-
-	pop	ecx			; col
-	pop	esi
-	pop	edi
-	poppic	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_buf
-	add	edi, byte SIZEOF_JSAMPROW
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm
new file mode 100644
index 0000000..84eaeeb
--- /dev/null
+++ b/simd/jchuff-sse2-64.asm
@@ -0,0 +1,361 @@
+;
+; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright 2009-2011, 2014-2016 D. R. Commander.
+; Copyright 2015 Matthieu Darbois
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based directly on jchuff.c; see jchuff.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+%include "jpeg_nbits_table.inc"
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+
+; These macros perform the same task as the emit_bits() function in the
+; original libjpeg code.  In addition to reducing overhead by explicitly
+; inlining the code, additional performance is achieved by taking into
+; account the size of the bit buffer and waiting until it is almost full
+; before emptying it.  This mostly benefits 64-bit platforms, since 6
+; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+
+%macro EMIT_BYTE 0  ; store the next 8 pending bits of put_buffer at [buffer]; clobbers rcx, rdx
+        sub put_bits, 8  ; put_bits -= 8;
+        mov rdx, put_buffer  ; work on a copy so put_buffer itself is left untouched
+        mov ecx, put_bits  ; cl = shift count for the shr below
+        shr rdx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+        mov byte [buffer], dl  ; *buffer++ = c;
+        add buffer, 1
+        cmp dl, 0xFF  ; need to stuff a zero byte?
+        jne %%.EMIT_BYTE_END
+        mov byte [buffer], 0  ; *buffer++ = 0;
+        add buffer, 1
+%%.EMIT_BYTE_END:
+%endmacro
+
+%macro PUT_BITS 1  ; append %1 to put_buffer; the bit count must already be in ecx (cl)
+        add put_bits, ecx  ; put_bits += size;
+        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
+        or  put_buffer, %1  ; put_buffer |= code;
+%endmacro
+
+%macro CHECKBUF31 0  ; if 32 or more bits are pending, write four bytes out (clobbers rcx, rdx)
+        cmp put_bits, 32  ; if (put_bits > 31) {
+        jl %%.CHECKBUF31_END
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+%%.CHECKBUF31_END:  ; }
+%endmacro
+
+%macro CHECKBUF47 0  ; if 48 or more bits are pending, write six bytes out (clobbers rcx, rdx)
+        cmp put_bits, 48  ; if (put_bits > 47) {
+        jl %%.CHECKBUF47_END
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+        EMIT_BYTE
+%%.CHECKBUF47_END:  ; }
+%endmacro
+
+%macro EMIT_BITS 2  ; (code, size): flush the bit buffer if needed, then append size bits of code
+        CHECKBUF47
+        mov ecx, %2  ; load size only after CHECKBUF47, because EMIT_BYTE overwrites ecx
+        PUT_BITS %1
+%endmacro
+
+%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
+    pxor xmm8, xmm8  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm9, xmm9  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm10, xmm10  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm11, xmm11  ; __m128i neg = _mm_setzero_si128();
+    pinsrw %34, word [r12 + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
+    pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
+    pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
+    pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
+    pinsrw %34, word [r12 + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
+    pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
+    pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
+    pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
+    pinsrw %34, word [r12 + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
+    pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
+    pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
+    pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
+    pinsrw %34, word [r12 + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
+    pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
+    pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
+    pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
+    pinsrw %34, word [r12 + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
+    pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
+    pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
+    pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
+    pinsrw %34, word [r12 + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
+    pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
+    pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
+    pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
+    pinsrw %34, word [r12 + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
+    pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
+    pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
+    pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
+    pinsrw %34, word [r12 + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
+    pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
+    pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
+%if %1 != 32
+    pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
+%else
+    pinsrw %37, ebx, 7  ; xmm_shadow[31] = 0;  (ko == 32: jno31 is ignored, ebx was zeroed by the caller)
+%endif
+    pcmpgtw xmm8, %34  ; neg = _mm_cmpgt_epi16(neg, x1);  (0xFFFF in each lane where x1 < 0)
+    pcmpgtw xmm9, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm10, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm11, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
+    paddw %34, xmm8   ; x1 = _mm_add_epi16(x1, neg);
+    paddw %35, xmm9   ; x1 = _mm_add_epi16(x1, neg);
+    paddw %36, xmm10  ; x1 = _mm_add_epi16(x1, neg);
+    paddw %37, xmm11  ; x1 = _mm_add_epi16(x1, neg);
+    pxor %34, xmm8    ; x1 = _mm_xor_si128(x1, neg);  (x1 now holds |x1|)
+    pxor %35, xmm9    ; x1 = _mm_xor_si128(x1, neg);
+    pxor %36, xmm10   ; x1 = _mm_xor_si128(x1, neg);
+    pxor %37, xmm11   ; x1 = _mm_xor_si128(x1, neg);
+    pxor xmm8, %34    ; neg = _mm_xor_si128(neg, x1);  (original value if >= 0, else ~|x1|)
+    pxor xmm9, %35    ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm10, %36   ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm11, %37   ; neg = _mm_xor_si128(neg, x1);
+    movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+    movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+    movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+    movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+    movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+    movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+    movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+    movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET*)
+; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
+;                                   JCOEFPTR block, int last_dc_val,
+;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+
+; r10 = working_state *state
+; r11 = JOCTET *buffer
+; r12 = JCOEFPTR block
+; r13 = int last_dc_val
+; r14 = c_derived_tbl *dctbl
+; r15 = c_derived_tbl *actbl
+
+%define t1              rbp-(DCTSIZE2*SIZEOF_WORD)  ; magnitudes written by kloop_prepare
+%define t2              t1-(DCTSIZE2*SIZEOF_WORD)  ; value / complemented magnitude written by kloop_prepare
+%define put_buffer      r8
+%define put_bits        r9d
+%define buffer          rax
+
+        align   16
+        global  EXTN(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = original rbp
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; kept at [rbp] for the epilogue
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [t2]                       ; reserve the t1 and t2 arrays below rbp
+        collect_args                            ; defined in jsimdext.inc; presumably sets up r10-r15 as listed above
+%ifdef WIN64
+        movaps  XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8  ; xmm8-xmm11 are overwritten by kloop_prepare
+        movaps  XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
+        movaps  XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
+        movaps  XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
+        sub     rsp, 4*SIZEOF_XMMWORD
+%endif
+        push rbx
+
+        mov buffer, r11  ; r11 is now scratch
+
+        mov put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
+        mov put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
+        push r10  ; r10 is now scratch
+
+        ; Encode the DC coefficient difference per section F.1.2.1
+        movsx edi, word [r12]  ; temp = temp2 = block[0] - last_dc_val;
+        sub   edi, r13d  ; r13 is not used anymore
+        mov   ebx, edi
+
+        ; This is a well-known technique for obtaining the absolute value
+        ; without a branch.  It is derived from an assembly language technique
+        ; presented in "How to Optimize for the Pentium Processors",
+        ; Copyright (c) 1996, 1997 by Agner Fog.
+        mov esi, edi
+        sar esi, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+        xor edi, esi  ; temp ^= temp3;
+        sub edi, esi  ; temp -= temp3;
+
+        ; For a negative input, want temp2 = bitwise complement of abs(input)
+        ; This code assumes we are on a two's complement machine
+        add ebx, esi  ; temp2 += temp3;
+
+        ; Find the number of bits needed for the magnitude of the coefficient
+        lea   r11, [rel jpeg_nbits_table]
+        movzx rdi, byte [r11 + rdi]  ; nbits = JPEG_NBITS(temp);
+        ; Emit the Huffman-coded symbol for the number of bits
+        mov   r11d,  INT [r14 + rdi * 4]  ; code = dctbl->ehufco[nbits];
+        movzx  esi, byte [r14 + rdi + 1024]  ; size = dctbl->ehufsi[nbits];
+        EMIT_BITS r11, esi  ; EMIT_BITS(code, size)
+
+        ; Mask off any extra bits in code
+        mov esi, 1
+        mov ecx, edi
+        shl esi, cl
+        dec esi
+        and ebx, esi  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+
+        ; Emit that number of bits of the value, if positive,
+        ; or the complement of its magnitude, if negative.
+        EMIT_BITS rbx, edi  ; EMIT_BITS(temp2, nbits)
+
+        ; Prepare data
+        xor ebx, ebx  ; ebx = 0: kloop_prepare inserts it as the last word when ko == 32
+        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
+                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
+                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
+                       xmm0, xmm1, xmm2, xmm3
+        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
+                       xmm4, xmm5, xmm6, xmm7
+
+        pxor xmm8, xmm8  ; zero = _mm_setzero_si128();
+        pcmpeqw xmm0, xmm8  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+        pcmpeqw xmm1, xmm8  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+        pcmpeqw xmm2, xmm8  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+        pcmpeqw xmm3, xmm8  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+        pcmpeqw xmm4, xmm8  ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
+        pcmpeqw xmm5, xmm8  ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
+        pcmpeqw xmm6, xmm8  ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
+        pcmpeqw xmm7, xmm8  ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
+        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+        packsswb xmm4, xmm5  ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
+        packsswb xmm6, xmm7  ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
+        pmovmskb r11d, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+        pmovmskb r12d, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+        pmovmskb r13d, xmm4  ; index  = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
+        pmovmskb r14d, xmm6  ; index  = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
+        shl r12, 16
+        shl r14, 16
+        or  r11, r12
+        or  r13, r14
+        shl r13, 32
+        or  r11, r13
+        not r11  ; index = ~index;  (bit k is now set exactly when t1[k] != 0)
+
+        ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
+        ;jmp .EFN
+
+        mov   r13d,  INT [r15 + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
+        movzx r14d, byte [r15 + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+        lea rsi, [t1]  ; rsi = &t1[k], starting at k = 0
+.BLOOP:
+        bsf r12, r11  ; r = __builtin_ctzl(index);
+        jz .ELOOP  ; index == 0: no nonzero entries left
+        mov rcx, r12  ; cl = r, the shift count for the shr below
+        lea rsi, [rsi+r12*2]  ; k += r;
+        shr r11, cl  ; index >>= r;
+        movzx rdi, word [rsi]  ; temp = t1[k];
+        lea   rbx, [rel jpeg_nbits_table]
+        movzx rdi, byte [rbx + rdi]  ; nbits = JPEG_NBITS(temp);
+.BRLOOP:
+        cmp r12, 16  ; while (r > 15) {
+        jl .ERLOOP
+        EMIT_BITS r13, r14d  ; EMIT_BITS(code_0xf0, size_0xf0)
+        sub r12, 16  ; r -= 16;
+        jmp .BRLOOP
+.ERLOOP:
+        ; Emit Huffman symbol for run length / number of bits
+        CHECKBUF31  ; uses rcx, rdx
+
+        shl r12, 4  ; temp3 = (r << 4) + nbits;
+        add r12, rdi
+        mov   ebx,  INT [r15 + r12 * 4]  ; code = actbl->ehufco[temp3];
+        movzx ecx, byte [r15 + r12 + 1024]  ; size = actbl->ehufsi[temp3];
+        PUT_BITS rbx
+
+        ;EMIT_CODE(code, size)
+
+        movsx ebx, word [rsi-DCTSIZE2*2]  ; temp2 = t2[k];  (t2 lies DCTSIZE2 words below t1)
+        ; Mask off any extra bits in code
+        mov rcx, rdi
+        mov rdx, 1
+        shl rdx, cl
+        dec rdx
+        and rbx, rdx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+        PUT_BITS rbx  ; PUT_BITS(temp2, nbits)
+
+        shr r11, 1  ; index >>= 1;
+        add rsi, 2  ; ++k;
+        jmp .BLOOP
+.ELOOP:
+        ; If the last coef(s) were zero, emit an end-of-block code
+        lea rdi, [t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
+        cmp rdi, rsi  ; if (r > 0) {
+        je .EFN
+        mov   ebx,  INT [r15]  ; code = actbl->ehufco[0];
+        movzx r12d, byte [r15 + 1024]  ; size = actbl->ehufsi[0];
+        EMIT_BITS rbx, r12d
+.EFN:
+        pop r10  ; r10 = state again (pushed after loading put_buffer/put_bits)
+        ; Save put_buffer & put_bits
+        mov MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
+        mov DWORD  [r10+24], put_bits  ; state->cur.put_bits = put_bits;
+
+        pop rbx
+%ifdef WIN64
+        movaps  xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+        movaps  xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+        movaps  xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+        movaps  xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+        add     rsp, 4*SIZEOF_XMMWORD
+%endif
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- original rbp
+        pop     rbp
+        ret                     ; rax (buffer) holds the advanced output pointer
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jchuff-sse2.asm b/simd/jchuff-sse2.asm
new file mode 100644
index 0000000..1d82273
--- /dev/null
+++ b/simd/jchuff-sse2.asm
@@ -0,0 +1,427 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright 2009-2011, 2014-2016 D. R. Commander.
+; Copyright 2015 Matthieu Darbois
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based directly on jchuff.c; see jchuff.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+%include "jpeg_nbits_table.inc"
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+
+; These macros perform the same task as the emit_bits() function in the
+; original libjpeg code.  In addition to reducing overhead by explicitly
+; inlining the code, additional performance is achieved by taking into
+; account the size of the bit buffer and waiting until it is almost full
+; before emptying it.  This mostly benefits 64-bit platforms, since 6
+; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+
+%macro EMIT_BYTE 0  ; store the next 8 pending bits of put_buffer at [eax]; clobbers ecx, edx
+        sub put_bits, 8  ; put_bits -= 8;
+        mov edx, put_buffer  ; work on a copy so put_buffer itself is left untouched
+        mov ecx, put_bits  ; cl = shift count for the shr below
+        shr edx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+        mov byte [eax], dl  ; *buffer++ = c;
+        add eax, 1
+        cmp dl, 0xFF  ; need to stuff a zero byte?
+        jne %%.EMIT_BYTE_END
+        mov byte [eax], 0  ; *buffer++ = 0;
+        add eax, 1
+%%.EMIT_BYTE_END:
+%endmacro
+
+%macro PUT_BITS 1  ; append %1 to put_buffer; the bit count must already be in ecx (cl)
+        add put_bits, ecx  ; put_bits += size;
+        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
+        or  put_buffer, %1  ; put_buffer |= code;
+%endmacro
+
+%macro CHECKBUF15 0  ; if 16 or more bits are pending, write two bytes out (clobbers eax, ecx, edx)
+        cmp put_bits, 16  ; if (put_bits > 15) {
+        jl %%.CHECKBUF15_END
+        mov eax, POINTER [esp+buffer]  ; load the output pointer from its stack slot
+        EMIT_BYTE
+        EMIT_BYTE
+        mov POINTER [esp+buffer], eax  ; store the advanced output pointer back
+%%.CHECKBUF15_END:  ; }
+%endmacro
+
+%macro EMIT_BITS 1  ; (code): append ecx bits of code, then flush if 16 or more bits are pending
+        PUT_BITS %1
+        CHECKBUF15
+%endmacro
+
+%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
+    pxor xmm4, xmm4  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm5, xmm5  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm6, xmm6  ; __m128i neg = _mm_setzero_si128();
+    pxor xmm7, xmm7  ; __m128i neg = _mm_setzero_si128();
+    pinsrw %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
+    pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
+    pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
+    pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
+    pinsrw %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
+    pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
+    pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
+    pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
+    pinsrw %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
+    pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
+    pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
+    pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
+    pinsrw %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
+    pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
+    pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
+    pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
+    pinsrw %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
+    pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
+    pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
+    pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
+    pinsrw %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
+    pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
+    pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
+    pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
+    pinsrw %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
+    pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
+    pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
+    pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
+    pinsrw %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
+    pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
+    pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
+%if %1 != 32
+    pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
+%else
+    pinsrw %37, ecx, 7  ; xmm_shadow[31] = 0;  (ko == 32: jno31 is ignored, ecx was zeroed by the caller)
+%endif
+    pcmpgtw xmm4, %34  ; neg = _mm_cmpgt_epi16(neg, x1);  (0xFFFF in each lane where x1 < 0)
+    pcmpgtw xmm5, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm6, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw xmm7, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
+    paddw %34, xmm4   ; x1 = _mm_add_epi16(x1, neg);
+    paddw %35, xmm5   ; x1 = _mm_add_epi16(x1, neg);
+    paddw %36, xmm6  ; x1 = _mm_add_epi16(x1, neg);
+    paddw %37, xmm7  ; x1 = _mm_add_epi16(x1, neg);
+    pxor %34, xmm4    ; x1 = _mm_xor_si128(x1, neg);  (x1 now holds |x1|)
+    pxor %35, xmm5    ; x1 = _mm_xor_si128(x1, neg);
+    pxor %36, xmm6   ; x1 = _mm_xor_si128(x1, neg);
+    pxor %37, xmm7   ; x1 = _mm_xor_si128(x1, neg);
+    pxor xmm4, %34    ; neg = _mm_xor_si128(neg, x1);  (original value if >= 0, else ~|x1|)
+    pxor xmm5, %35    ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm6, %36   ; neg = _mm_xor_si128(neg, x1);
+    pxor xmm7, %37   ; neg = _mm_xor_si128(neg, x1);
+    movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+    movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+    movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+    movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+    movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+    movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+    movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+    movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET*)
+; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
+;                                   JCOEFPTR block, int last_dc_val,
+;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+
+; eax + 8 = working_state *state
+; eax + 12 = JOCTET *buffer
+; eax + 16 = JCOEFPTR block
+; eax + 20 = int last_dc_val
+; eax + 24 = c_derived_tbl *dctbl
+; eax + 28 = c_derived_tbl *actbl
+
+%define pad             6*SIZEOF_DWORD  ; Align to 16 bytes (skips the six dwords pushed after the frame is reserved)
+%define t1              pad  ; magnitudes written by kloop_prepare
+%define t2              t1+(DCTSIZE2*SIZEOF_WORD)  ; value / complemented magnitude written by kloop_prepare
+%define block           t2+(DCTSIZE2*SIZEOF_WORD)
+%define actbl           block+SIZEOF_DWORD
+%define buffer          actbl+SIZEOF_DWORD
+%define temp            buffer+SIZEOF_DWORD
+%define temp2           temp+SIZEOF_DWORD
+%define temp3           temp2+SIZEOF_DWORD
+%define temp4           temp3+SIZEOF_DWORD
+%define temp5           temp4+SIZEOF_DWORD
+%define gotptr          temp5+SIZEOF_DWORD  ; void *gotptr
+%define put_buffer      ebx
+%define put_bits        edi
+
+        align   16
+        global  EXTN(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; kept at [ebp] for the epilogue
+        mov     ebp,esp                         ; ebp = aligned ebp
+        sub     esp, temp5+9*SIZEOF_DWORD-pad   ; reserve t1, t2 and the scalar stack slots
+        push    ebx
+        push    ecx
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+        push    ebp
+
+        mov esi, POINTER [eax+8]        ; (working_state *state)
+        mov put_buffer,  DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
+        mov put_bits,    DWORD [esi+12]  ; put_bits = state->cur.put_bits;
+        push esi  ; esi is now scratch
+
+        get_GOT edx                       ; get GOT address
+        movpic POINTER [esp+gotptr], edx  ; save GOT address
+
+        mov ecx, POINTER [eax+28]  ; actbl argument
+        mov edx, POINTER [eax+16]  ; block argument
+        mov esi, POINTER [eax+12]  ; buffer argument
+        mov POINTER [esp+actbl],  ecx
+        mov POINTER [esp+block],  edx
+        mov POINTER [esp+buffer], esi
+
+        ; Encode the DC coefficient difference per section F.1.2.1
+        mov esi, POINTER [esp+block]        ; block
+        movsx ecx, word [esi]  ; temp = temp2 = block[0] - last_dc_val;
+        sub   ecx, DWORD [eax+20]
+        mov   esi, ecx
+
+        ; This is a well-known technique for obtaining the absolute value
+        ; without a branch.  It is derived from an assembly language technique
+        ; presented in "How to Optimize for the Pentium Processors",
+        ; Copyright (c) 1996, 1997 by Agner Fog.
+        mov edx, ecx
+        sar edx, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+        xor ecx, edx ; temp ^= temp3;
+        sub ecx, edx ; temp -= temp3;
+
+        ; For a negative input, want temp2 = bitwise complement of abs(input)
+        ; This code assumes we are on a two's complement machine
+        add esi, edx  ; temp2 += temp3;
+        mov DWORD [esp+temp], esi  ; backup temp2 in temp
+
+        ; Find the number of bits needed for the magnitude of the coefficient
+        movpic ebp, POINTER [esp+gotptr]   ; load GOT address (ebp)
+        movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
+        mov DWORD [esp+temp2], edx  ; backup nbits in temp2
+
+        ; Emit the Huffman-coded symbol for the number of bits
+        mov    ebp, POINTER [eax+24]  ; After this point, arguments are not accessible anymore
+        mov    eax,  INT [ebp + edx * 4]  ; code = dctbl->ehufco[nbits];
+        movzx  ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
+        EMIT_BITS eax  ; EMIT_BITS(code, size)
+
+        mov ecx, DWORD [esp+temp2]  ; restore nbits
+
+        ; Mask off any extra bits in code
+        mov eax, 1
+        shl eax, cl
+        dec eax
+        and eax, DWORD [esp+temp]  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+
+        ; Emit that number of bits of the value, if positive,
+        ; or the complement of its magnitude, if negative.
+        EMIT_BITS eax  ; EMIT_BITS(temp2, nbits)
+
+        ; Prepare data
+        xor ecx, ecx  ; ecx = 0: kloop_prepare inserts it as the last word when ko == 32
+        mov esi, POINTER [esp+block]
+        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
+                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
+                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
+                       xmm0, xmm1, xmm2, xmm3
+        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
+                       xmm0, xmm1, xmm2, xmm3
+
+        pxor xmm7, xmm7  ; zero = _mm_setzero_si128();  (also used by the compares at .ELOOP)
+        movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
+        movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
+        movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
+        movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
+        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+        pmovmskb ecx, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+        shl ecx, 16
+        or  edx, ecx
+        not edx  ; index = ~index;  (bit k is now set exactly when t1[k] != 0, for k < 32)
+
+        lea esi, [esp+t1]  ; esi = &t1[k], starting at k = 0
+        mov ebp, POINTER [esp+actbl]  ; ebp = actbl
+
+.BLOOP:
+        bsf ecx, edx  ; r = __builtin_ctzl(index);
+        jz .ELOOP  ; index == 0: no nonzero entries left in the first half
+        lea esi, [esi+ecx*2]  ; k += r;
+        shr edx, cl  ; index >>= r;
+        mov DWORD [esp+temp3], edx  ; save index (edx is overwritten below)
+.BRLOOP:
+        cmp ecx, 16  ; while (r > 15) {
+        jl .ERLOOP
+        sub ecx, 16 ; r -= 16;
+        mov DWORD [esp+temp], ecx  ; save r (ecx is overwritten below)
+        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
+        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
+        mov ecx, DWORD [esp+temp]  ; restore r
+        jmp .BRLOOP
+.ERLOOP:
+        movsx eax, word [esi]  ; temp = t1[k];
+        movpic edx, POINTER [esp+gotptr]   ; load GOT address (edx)
+        movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
+        mov DWORD [esp+temp2], eax  ; backup nbits in temp2
+        ; Emit Huffman symbol for run length / number of bits
+        shl ecx, 4  ; temp3 = (r << 4) + nbits;
+        add ecx, eax
+        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
+        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
+        EMIT_BITS eax  ; EMIT_BITS(code, size)
+
+        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
+        ; Mask off any extra bits in code
+        mov ecx, DWORD [esp+temp2]  ; restore nbits
+        mov eax, 1
+        shl eax, cl
+        dec eax
+        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+        EMIT_BITS eax  ; EMIT_BITS(temp2, nbits)
+        mov edx, DWORD [esp+temp3]  ; restore index
+        add esi, 2  ; ++k;
+        shr edx, 1  ; index >>= 1;
+
+        jmp .BLOOP
+.ELOOP:
+        movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
+        movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
+        movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
+        movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
+        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+        pmovmskb ecx, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+        shl ecx, 16
+        or  edx, ecx
+        not edx  ; index = ~index;  (bit j is now set exactly when t1[32 + j] != 0)
+
+        lea eax, [esp + t1 + (DCTSIZE2/2) * 2]  ; eax = &t1[32]
+        sub eax, esi
+        shr eax, 1  ; eax = number of words from t1[k] up to t1[32]
+        bsf ecx, edx  ; r = __builtin_ctzl(index);
+        jz .ELOOP2  ; second half is all zero
+        shr edx, cl  ; index >>= r;
+        add ecx, eax  ; r += distance from k to t1[32]
+        lea esi, [esi+ecx*2]  ; k += r;
+        mov DWORD [esp+temp3], edx  ; save index (edx is overwritten below)
+        jmp .BRLOOP2
+.BLOOP2:
+        bsf ecx, edx  ; r = __builtin_ctzl(index);
+        jz .ELOOP2  ; index == 0: no nonzero entries left
+        lea esi, [esi+ecx*2]  ; k += r;
+        shr edx, cl  ; index >>= r;
+        mov DWORD [esp+temp3], edx  ; save index (edx is overwritten below)
+.BRLOOP2:
+        cmp ecx, 16  ; while (r > 15) {
+        jl .ERLOOP2
+        sub ecx, 16  ; r -= 16;
+        mov DWORD [esp+temp], ecx  ; save r (ecx is overwritten below)
+        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
+        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
+        mov ecx, DWORD [esp+temp]  ; restore r
+        jmp .BRLOOP2
+.ERLOOP2:
+        movsx eax, word [esi]  ; temp = t1[k];
+        bsr eax, eax  ; nbits = 32 - __builtin_clz(temp);
+        inc eax
+        mov DWORD [esp+temp2], eax  ; backup nbits in temp2
+        ; Emit Huffman symbol for run length / number of bits
+        shl ecx, 4  ; temp3 = (r << 4) + nbits;
+        add ecx, eax
+        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
+        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
+        EMIT_BITS eax  ; EMIT_BITS(code, size)
+
+        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
+        ; Mask off any extra bits in code
+        mov ecx, DWORD [esp+temp2]  ; restore nbits
+        mov eax, 1
+        shl eax, cl
+        dec eax
+        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+        EMIT_BITS eax  ; EMIT_BITS(temp2, nbits)
+        mov edx, DWORD [esp+temp3]  ; restore index
+        add esi, 2  ; ++k;
+        shr edx, 1  ; index >>= 1;
+
+        jmp .BLOOP2
+.ELOOP2:
+        ; If the last coef(s) were zero, emit an end-of-block code
+        lea edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
+        cmp edx, esi  ; if (r > 0) {
+        je .EFN
+        mov   eax,  INT [ebp]  ; code = actbl->ehufco[0];
+        movzx ecx, byte [ebp + 1024]  ; size = actbl->ehufsi[0];
+        EMIT_BITS eax
+.EFN:
+        mov eax, [esp+buffer]  ; eax = advanced output pointer
+        pop esi  ; esi = state again (pushed after loading put_buffer/put_bits)
+        ; Save put_buffer & put_bits
+        mov DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
+        mov DWORD [esi+12], put_bits  ; state->cur.put_bits = put_bits;
+
+        pop     ebp
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+        pop     ecx
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcqnt3dn.asm b/simd/jcqnt3dn.asm
deleted file mode 100644
index 480777d..0000000
--- a/simd/jcqnt3dn.asm
+++ /dev/null
@@ -1,233 +0,0 @@
-;
-; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                             FAST_FLOAT * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_3dnow) PRIVATE
-
-EXTN(jsimd_convsamp_float_3dnow):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pcmpeqw  mm7,mm7
-	psllw    mm7,7
-	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-	psubb	mm0,mm7				; mm0=(01234567)
-	psubb	mm1,mm7				; mm1=(89ABCDEF)
-
-	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
-	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
-	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
-	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
-
-	punpcklwd mm4,mm2			; mm4=(***0***1)
-	punpckhwd mm2,mm2			; mm2=(***2***3)
-	punpcklwd mm5,mm0			; mm5=(***4***5)
-	punpckhwd mm0,mm0			; mm0=(***6***7)
-
-	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
-	psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
-	pi2fd	mm4,mm4
-	pi2fd	mm2,mm2
-	psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
-	psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
-	pi2fd	mm5,mm5
-	pi2fd	mm0,mm0
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
-	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-
-	punpcklwd mm6,mm3			; mm6=(***8***9)
-	punpckhwd mm3,mm3			; mm3=(***A***B)
-	punpcklwd mm4,mm1			; mm4=(***C***D)
-	punpckhwd mm1,mm1			; mm1=(***E***F)
-
-	psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
-	psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
-	pi2fd	mm6,mm6
-	pi2fd	mm3,mm3
-	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
-	psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
-	pi2fd	mm4,mm4
-	pi2fd	mm1,mm1
-
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
-	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-
-	add	esi, byte 2*SIZEOF_JSAMPROW
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .convloop
-
-	femms		; empty MMX/3DNow! state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                             FAST_FLOAT * workspace);
-;
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; FAST_FLOAT * divisors
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_3dnow) PRIVATE
-
-EXTN(jsimd_quantize_float_3dnow):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov       eax, 0x4B400000	; (float)0x00C00000 (rndint_magic)
-	movd      mm7,eax
-	punpckldq mm7,mm7		; mm7={12582912.0F 12582912.0F}
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/16
-	alignx	16,7
-.quantloop:
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-
-	pfadd	mm0,mm7			; mm0=(00 ** 01 **)
-	pfadd	mm1,mm7			; mm1=(02 ** 03 **)
-	pfadd	mm2,mm7			; mm0=(04 ** 05 **)
-	pfadd	mm3,mm7			; mm1=(06 ** 07 **)
-
-	movq      mm4,mm0
-	punpcklwd mm0,mm1		; mm0=(00 02 ** **)
-	punpckhwd mm4,mm1		; mm4=(01 03 ** **)
-	movq      mm5,mm2
-	punpcklwd mm2,mm3		; mm2=(04 06 ** **)
-	punpckhwd mm5,mm3		; mm5=(05 07 ** **)
-
-	punpcklwd mm0,mm4		; mm0=(00 01 02 03)
-	punpcklwd mm2,mm5		; mm2=(04 05 06 07)
-
-	movq	mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
-	pfmul	mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-	pfmul	mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-	pfadd	mm6,mm7			; mm0=(10 ** 11 **)
-	pfadd	mm1,mm7			; mm4=(12 ** 13 **)
-	pfadd	mm3,mm7			; mm0=(14 ** 15 **)
-	pfadd	mm4,mm7			; mm4=(16 ** 17 **)
-
-	movq      mm5,mm6
-	punpcklwd mm6,mm1		; mm6=(10 12 ** **)
-	punpckhwd mm5,mm1		; mm5=(11 13 ** **)
-	movq      mm1,mm3
-	punpcklwd mm3,mm4		; mm3=(14 16 ** **)
-	punpckhwd mm1,mm4		; mm1=(15 17 ** **)
-
-	punpcklwd mm6,mm5		; mm6=(10 11 12 13)
-	punpcklwd mm3,mm1		; mm3=(14 15 16 17)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-	add	esi, byte 16*SIZEOF_FAST_FLOAT
-	add	edx, byte 16*SIZEOF_FAST_FLOAT
-	add	edi, byte 16*SIZEOF_JCOEF
-	dec	eax
-	jnz	near .quantloop
-
-	femms		; empty MMX/3DNow! state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqntmmx.asm b/simd/jcqntmmx.asm
deleted file mode 100644
index 62e00b6..0000000
--- a/simd/jcqntmmx.asm
+++ /dev/null
@@ -1,274 +0,0 @@
-;
-; jcqntmmx.asm - sample data conversion and quantization (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                     DCTELEM * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_mmx) PRIVATE
-
-EXTN(jsimd_convsamp_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pxor	mm6,mm6			; mm6=(all 0's)
-	pcmpeqw	mm7,mm7
-	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
-	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
-
-	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
-	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
-
-	movq      mm4,mm0
-	punpcklbw mm0,mm6		; mm0=(0123)
-	punpckhbw mm4,mm6		; mm4=(4567)
-	movq      mm5,mm1
-	punpcklbw mm1,mm6		; mm1=(89AB)
-	punpckhbw mm5,mm6		; mm5=(CDEF)
-
-	paddw	mm0,mm7
-	paddw	mm4,mm7
-	paddw	mm1,mm7
-	paddw	mm5,mm7
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
-
-	movq      mm0,mm2
-	punpcklbw mm2,mm6		; mm2=(GHIJ)
-	punpckhbw mm0,mm6		; mm0=(KLMN)
-	movq      mm4,mm3
-	punpcklbw mm3,mm6		; mm3=(OPQR)
-	punpckhbw mm4,mm6		; mm4=(STUV)
-
-	paddw	mm2,mm7
-	paddw	mm0,mm7
-	paddw	mm3,mm7
-	paddw	mm4,mm7
-
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
-
-	add	esi, byte 4*SIZEOF_JSAMPROW
-	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	short .convloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
-;                     DCTELEM * workspace);
-;
-
-%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; DCTELEM * divisors
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_mmx) PRIVATE
-
-EXTN(jsimd_quantize_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	ah, 2
-	alignx	16,7
-.quantloop1:
-	mov	al, DCTSIZE2/8/2
-	alignx	16,7
-.quantloop2:
-	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
-
-	movq	mm0,mm2
-	movq	mm1,mm3
-
-	psraw	mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
-	psraw	mm3,(WORD_BIT-1)
-
-	pxor	mm0,mm2   ; val = -val
-	pxor	mm1,mm3
-	psubw	mm0,mm2
-	psubw	mm1,mm3
-
-	;
-	; MMX is an annoyingly crappy instruction set. It has two
-	; misfeatures that are causing problems here:
-	;
-	; - All multiplications are signed.
-	;
-	; - The second operand for the shifts is not treated as packed.
-	;
-	;
-	; We work around the first problem by implementing this algorithm:
-	;
-	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
-	; {
-	;   enum { SHORT_BIT = 16 };
-	;   signed short sx = (signed short) x;
-	;   signed short sy = (signed short) y;
-	;   signed long sz;
-	; 
-	;   sz = (long) sx * (long) sy;     /* signed multiply */
-	; 
-	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
-	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
-	; 
-	;   return (unsigned long) sz;
-	; }
-	;
-	; (note that a negative sx adds _sy_ and vice versa)
-	;
-	; For the second problem, we replace the shift by a multiplication.
-	; Unfortunately that means we have to deal with the signed issue again.
-	;
-
-	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
-	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
-
-	movq	mm4,mm0   ; store current value for later
-	movq	mm5,mm1
-	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
-	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
-	paddw	mm0,mm4		; reciprocal is always negative (MSB=1),
-	paddw	mm1,mm5   ; so we always need to add the initial value
-	                ; (input value is never negative as we
-	                ; inverted it at the start of this routine)
-
-	; here it gets a bit tricky as both scale
-	; and mm0/mm1 can be negative
-	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
-	movq	mm7, MMWORD [SCALE(0,1,edx)]
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pmulhw	mm0,mm6
-	pmulhw	mm1,mm7
-
-	psraw	mm6,(WORD_BIT-1)    ; determine if scale is negative
-	psraw	mm7,(WORD_BIT-1)
-
-	pand	mm6,mm4             ; and add input if it is
-	pand	mm7,mm5
-	paddw	mm0,mm6
-	paddw	mm1,mm7
-
-	psraw	mm4,(WORD_BIT-1)    ; then check if negative input 
-	psraw	mm5,(WORD_BIT-1)
-
-	pand	mm4, MMWORD [SCALE(0,0,edx)]	; and add scale if it is
-	pand	mm5, MMWORD [SCALE(0,1,edx)]
-	paddw	mm0,mm4
-	paddw	mm1,mm5
-
-	pxor	mm0,mm2   ; val = -val
-	pxor	mm1,mm3
-	psubw	mm0,mm2
-	psubw	mm1,mm3
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
-
-	add	esi, byte 8*SIZEOF_DCTELEM
-	add	edx, byte 8*SIZEOF_DCTELEM
-	add	edi, byte 8*SIZEOF_JCOEF
-	dec	al
-	jnz	near .quantloop2
-	dec	ah
-	jnz	near .quantloop1	; to avoid branch misprediction
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2f-64.asm b/simd/jcqnts2f-64.asm
deleted file mode 100644
index 0bc73bc..0000000
--- a/simd/jcqnts2f-64.asm
+++ /dev/null
@@ -1,158 +0,0 @@
-;
-; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT * workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_float_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	pcmpeqw  xmm7,xmm7
-	psllw    xmm7,7
-	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov rsi, r10
-	mov	eax, r11d
-	mov rdi, r12
-	mov	rcx, DCTSIZE/2
-.convloop:
-	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
-	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
-
-	psubb	xmm0,xmm7			; xmm0=(01234567)
-	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
-
-	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
-	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
-
-	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
-	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
-	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
-	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
-
-	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
-	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
-	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
-	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
-	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
-	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
-	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
-	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-
-	add	rsi, byte 2*SIZEOF_JSAMPROW
-	add	rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	rcx
-	jnz	short .convloop
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                         FAST_FLOAT * workspace);
-;
-
-; r10 = JCOEFPTR coef_block
-; r11 = FAST_FLOAT * divisors
-; r12 = FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
-
-EXTN(jsimd_quantize_float_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov rsi, r12
-	mov rdx, r11
-	mov rdi, r10
-	mov	rax, DCTSIZE2/16
-.quantloop:
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-	cvtps2dq xmm0,xmm0
-	cvtps2dq xmm1,xmm1
-	cvtps2dq xmm2,xmm2
-	cvtps2dq xmm3,xmm3
-
-	packssdw xmm0,xmm1
-	packssdw xmm2,xmm3
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
-
-	add	rsi, byte 16*SIZEOF_FAST_FLOAT
-	add	rdx, byte 16*SIZEOF_FAST_FLOAT
-	add	rdi, byte 16*SIZEOF_JCOEF
-	dec	rax
-	jnz	short .quantloop
-
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2f.asm b/simd/jcqnts2f.asm
deleted file mode 100644
index e5f5793..0000000
--- a/simd/jcqnts2f.asm
+++ /dev/null
@@ -1,171 +0,0 @@
-;
-; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_float_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pcmpeqw  xmm7,xmm7
-	psllw    xmm7,7
-	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-	psubb	xmm0,xmm7			; xmm0=(01234567)
-	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
-
-	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
-	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
-
-	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
-	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
-	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
-	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
-
-	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
-	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
-	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
-	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
-	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
-	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
-	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
-	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-
-	add	esi, byte 2*SIZEOF_JSAMPROW
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	short .convloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                         FAST_FLOAT * workspace);
-;
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; FAST_FLOAT * divisors
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
-
-EXTN(jsimd_quantize_float_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/16
-	alignx	16,7
-.quantloop:
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-	cvtps2dq xmm0,xmm0
-	cvtps2dq xmm1,xmm1
-	cvtps2dq xmm2,xmm2
-	cvtps2dq xmm3,xmm3
-
-	packssdw xmm0,xmm1
-	packssdw xmm2,xmm3
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
-
-	add	esi, byte 16*SIZEOF_FAST_FLOAT
-	add	edx, byte 16*SIZEOF_FAST_FLOAT
-	add	edi, byte 16*SIZEOF_JCOEF
-	dec	eax
-	jnz	short .quantloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2i-64.asm b/simd/jcqnts2i-64.asm
deleted file mode 100644
index bd84c6a..0000000
--- a/simd/jcqnts2i-64.asm
+++ /dev/null
@@ -1,187 +0,0 @@
-;
-; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM * workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	pxor	xmm6,xmm6		; xmm6=(all 0's)
-	pcmpeqw	xmm7,xmm7
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	mov rsi, r10
-	mov eax, r11d
-	mov rdi, r12
-	mov	rcx, DCTSIZE/4
-.convloop:
-	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
-	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
-
-	mov	rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
-	movq	xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
-
-	punpcklbw xmm0,xmm6		; xmm0=(01234567)
-	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
-	paddw     xmm0,xmm7
-	paddw     xmm1,xmm7
-	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
-	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
-	paddw     xmm2,xmm7
-	paddw     xmm3,xmm7
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-	add	rsi, byte 4*SIZEOF_JSAMPROW
-	add	rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	rcx
-	jnz	short .convloop
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
-;                      DCTELEM * workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-; r10 = JCOEFPTR coef_block
-; r11 = DCTELEM * divisors
-; r12 = DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_sse2) PRIVATE
-
-EXTN(jsimd_quantize_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov rsi, r12
-	mov rdx, r11
-	mov rdi, r10
-	mov	rax, DCTSIZE2/32
-.quantloop:
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
-	movdqa	xmm0,xmm4
-	movdqa	xmm1,xmm5
-	movdqa	xmm2,xmm6
-	movdqa	xmm3,xmm7
-	psraw	xmm4,(WORD_BIT-1)
-	psraw	xmm5,(WORD_BIT-1)
-	psraw	xmm6,(WORD_BIT-1)
-	psraw	xmm7,(WORD_BIT-1)
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
-	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
-	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
-	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
-
-	paddw	xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
-	paddw	xmm1, XMMWORD [CORRECTION(1,0,rdx)]
-	paddw	xmm2, XMMWORD [CORRECTION(2,0,rdx)]
-	paddw	xmm3, XMMWORD [CORRECTION(3,0,rdx)]
-	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
-	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
-	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
-	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
-	pmulhuw	xmm0, XMMWORD [SCALE(0,0,rdx)]	; scale
-	pmulhuw	xmm1, XMMWORD [SCALE(1,0,rdx)]
-	pmulhuw	xmm2, XMMWORD [SCALE(2,0,rdx)]
-	pmulhuw	xmm3, XMMWORD [SCALE(3,0,rdx)]
-
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4
-	psubw	xmm1,xmm5
-	psubw	xmm2,xmm6
-	psubw	xmm3,xmm7
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-	add	rsi, byte 32*SIZEOF_DCTELEM
-	add	rdx, byte 32*SIZEOF_DCTELEM
-	add	rdi, byte 32*SIZEOF_JCOEF
-	dec	rax
-	jnz	near .quantloop
-
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqnts2i.asm b/simd/jcqnts2i.asm
deleted file mode 100644
index 412032b..0000000
--- a/simd/jcqnts2i.asm
+++ /dev/null
@@ -1,200 +0,0 @@
-;
-; jcqnts2i.asm - sample data conversion and quantization (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_sse2) PRIVATE
-
-EXTN(jsimd_convsamp_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pxor	xmm6,xmm6		; xmm6=(all 0's)
-	pcmpeqw	xmm7,xmm7
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
-	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
-
-	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
-	movq	xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
-
-	punpcklbw xmm0,xmm6		; xmm0=(01234567)
-	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
-	paddw     xmm0,xmm7
-	paddw     xmm1,xmm7
-	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
-	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
-	paddw     xmm2,xmm7
-	paddw     xmm3,xmm7
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-	add	esi, byte 4*SIZEOF_JSAMPROW
-	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	short .convloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
-;                      DCTELEM * workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; DCTELEM * divisors
-%define workspace	ebp+16		; DCTELEM * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_sse2) PRIVATE
-
-EXTN(jsimd_quantize_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/32
-	alignx	16,7
-.quantloop:
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
-	movdqa	xmm0,xmm4
-	movdqa	xmm1,xmm5
-	movdqa	xmm2,xmm6
-	movdqa	xmm3,xmm7
-	psraw	xmm4,(WORD_BIT-1)
-	psraw	xmm5,(WORD_BIT-1)
-	psraw	xmm6,(WORD_BIT-1)
-	psraw	xmm7,(WORD_BIT-1)
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
-	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
-	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
-	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
-
-	paddw	xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
-	paddw	xmm1, XMMWORD [CORRECTION(1,0,edx)]
-	paddw	xmm2, XMMWORD [CORRECTION(2,0,edx)]
-	paddw	xmm3, XMMWORD [CORRECTION(3,0,edx)]
-	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
-	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
-	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
-	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
-	pmulhuw	xmm0, XMMWORD [SCALE(0,0,edx)]	; scale
-	pmulhuw	xmm1, XMMWORD [SCALE(1,0,edx)]
-	pmulhuw	xmm2, XMMWORD [SCALE(2,0,edx)]
-	pmulhuw	xmm3, XMMWORD [SCALE(3,0,edx)]
-
-	pxor	xmm0,xmm4
-	pxor	xmm1,xmm5
-	pxor	xmm2,xmm6
-	pxor	xmm3,xmm7
-	psubw	xmm0,xmm4
-	psubw	xmm1,xmm5
-	psubw	xmm2,xmm6
-	psubw	xmm3,xmm7
-	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-	add	esi, byte 32*SIZEOF_DCTELEM
-	add	edx, byte 32*SIZEOF_DCTELEM
-	add	edi, byte 32*SIZEOF_JCOEF
-	dec	eax
-	jnz	near .quantloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcqntsse.asm b/simd/jcqntsse.asm
deleted file mode 100644
index df7243e..0000000
--- a/simd/jcqntsse.asm
+++ /dev/null
@@ -1,211 +0,0 @@
-;
-; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                           FAST_FLOAT * workspace);
-;
-
-%define sample_data	ebp+8		; JSAMPARRAY sample_data
-%define start_col	ebp+12		; JDIMENSION start_col
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_convsamp_float_sse) PRIVATE
-
-EXTN(jsimd_convsamp_float_sse):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	pcmpeqw  mm7,mm7
-	psllw    mm7,7
-	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [start_col]
-	mov	edi, POINTER [workspace]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.convloop:
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
-
-	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-	psubb	mm0,mm7				; mm0=(01234567)
-	psubb	mm1,mm7				; mm1=(89ABCDEF)
-
-	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
-	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
-	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
-	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
-
-	punpcklwd mm4,mm2			; mm4=(***0***1)
-	punpckhwd mm2,mm2			; mm2=(***2***3)
-	punpcklwd mm5,mm0			; mm5=(***4***5)
-	punpckhwd mm0,mm0			; mm0=(***6***7)
-
-	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
-	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
-	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
-	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
-	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
-	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
-	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
-	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
-
-	punpcklwd mm6,mm3			; mm6=(***8***9)
-	punpckhwd mm3,mm3			; mm3=(***A***B)
-	punpcklwd mm4,mm1			; mm4=(***C***D)
-	punpckhwd mm1,mm1			; mm1=(***E***F)
-
-	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
-	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
-	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
-	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
-	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
-	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
-	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
-	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
-
-	movlhps   xmm0,xmm1			; xmm0=(0123)
-	movlhps   xmm2,xmm3			; xmm2=(4567)
-	movlhps   xmm4,xmm5			; xmm4=(89AB)
-	movlhps   xmm6,xmm7			; xmm6=(CDEF)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-
-	add	esi, byte 2*SIZEOF_JSAMPROW
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .convloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-;                           FAST_FLOAT * workspace);
-;
-
-%define coef_block	ebp+8		; JCOEFPTR coef_block
-%define divisors	ebp+12		; FAST_FLOAT * divisors
-%define workspace	ebp+16		; FAST_FLOAT * workspace
-
-	align	16
-	global	EXTN(jsimd_quantize_float_sse) PRIVATE
-
-EXTN(jsimd_quantize_float_sse):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	esi, POINTER [workspace]
-	mov	edx, POINTER [divisors]
-	mov	edi, JCOEFPTR [coef_block]
-	mov	eax, DCTSIZE2/16
-	alignx	16,7
-.quantloop:
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-	movhlps  xmm4,xmm0
-	movhlps  xmm5,xmm1
-
-	cvtps2pi mm0,xmm0
-	cvtps2pi mm1,xmm1
-	cvtps2pi mm4,xmm4
-	cvtps2pi mm5,xmm5
-
-	movhlps  xmm6,xmm2
-	movhlps  xmm7,xmm3
-
-	cvtps2pi mm2,xmm2
-	cvtps2pi mm3,xmm3
-	cvtps2pi mm6,xmm6
-	cvtps2pi mm7,xmm7
-
-	packssdw mm0,mm4
-	packssdw mm1,mm5
-	packssdw mm2,mm6
-	packssdw mm3,mm7
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-	add	esi, byte 16*SIZEOF_FAST_FLOAT
-	add	edx, byte 16*SIZEOF_FAST_FLOAT
-	add	edi, byte 16*SIZEOF_JCOEF
-	dec	eax
-	jnz	short .quantloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcsammmx.asm b/simd/jcsammmx.asm
deleted file mode 100644
index e5e2d23..0000000
--- a/simd/jcsammmx.asm
+++ /dev/null
@@ -1,324 +0,0 @@
-;
-; jcsammmx.asm - downsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28	; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v1_downsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_downsample_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return
-
-	mov	edx, JDIMENSION [img_width(ebp)]
-
-	; -- expand_right_edge
-
-	push	ecx
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx
-	jle	short .expand_end
-
-	mov	eax, INT [max_v_samp(ebp)]
-	test	eax,eax
-	jle	short .expand_end
-
-	cld
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax
-	push	ecx
-
-	mov	edi, JSAMPROW [esi]
-	add	edi,edx
-	mov	al, JSAMPLE [edi-1]
-
-	rep stosb
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v1_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov       edx, 0x00010000	; bias pattern
-	movd      mm7,edx
-	pcmpeqw   mm6,mm6
-	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
-	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
-	movq	mm2,mm0
-	movq	mm3,mm1
-
-	pand	mm0,mm6
-	psrlw	mm2,BYTE_BIT
-	pand	mm1,mm6
-	psrlw	mm3,BYTE_BIT
-
-	paddw	mm0,mm2
-	paddw	mm1,mm3
-	paddw	mm0,mm7
-	paddw	mm1,mm7
-	psrlw	mm0,1
-	psrlw	mm1,1
-
-	packuswb mm0,mm1
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr
-	add	edi, byte 1*SIZEOF_MMWORD	; outptr
-	sub	ecx, byte SIZEOF_MMWORD		; outcol
-	jnz	short .columnloop
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	short .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28	; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v2_downsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_downsample_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return
-
-	mov	edx, JDIMENSION [img_width(ebp)]
-
-	; -- expand_right_edge
-
-	push	ecx
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx
-	jle	short .expand_end
-
-	mov	eax, INT [max_v_samp(ebp)]
-	test	eax,eax
-	jle	short .expand_end
-
-	cld
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax
-	push	ecx
-
-	mov	edi, JSAMPROW [esi]
-	add	edi,edx
-	mov	al, JSAMPLE [edi-1]
-
-	rep stosb
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v2_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov       edx, 0x00020001	; bias pattern
-	movd      mm7,edx
-	pcmpeqw   mm6,mm6
-	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
-	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx
-	push	edi
-	push	esi
-
-	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
-	mov	edi, JSAMPROW [edi]			; outptr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
-	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
-	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pand	mm0,mm6
-	psrlw	mm4,BYTE_BIT
-	pand	mm1,mm6
-	psrlw	mm5,BYTE_BIT
-	paddw	mm0,mm4
-	paddw	mm1,mm5
-
-	movq	mm4,mm2
-	movq	mm5,mm3
-	pand	mm2,mm6
-	psrlw	mm4,BYTE_BIT
-	pand	mm3,mm6
-	psrlw	mm5,BYTE_BIT
-	paddw	mm2,mm4
-	paddw	mm3,mm5
-
-	paddw	mm0,mm1
-	paddw	mm2,mm3
-	paddw	mm0,mm7
-	paddw	mm2,mm7
-	psrlw	mm0,2
-	psrlw	mm2,2
-
-	packuswb mm0,mm2
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
-	add	edi, byte 1*SIZEOF_MMWORD	; outptr
-	sub	ecx, byte SIZEOF_MMWORD		; outcol
-	jnz	near .columnloop
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcsample-altivec.c b/simd/jcsample-altivec.c
new file mode 100644
index 0000000..603492d
--- /dev/null
+++ b/simd/jcsample-altivec.c
@@ -0,0 +1,158 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_altivec.h"
+#include "jcsample.h"
+
+
+void
+jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int outrow, outcol;
+  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JSAMPROW inptr, outptr;
+  /* h2v1: each output sample = (2 adjacent input samples + bias) >> 1. */
+  __vector unsigned char this0, next0, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(0, 1) },  /* presumably {0,1,0,1,..}; TODO confirm __4X2 */
+    pw_one = { __8X(1) };  /* shift count used for the divide by 2 */
+  __vector unsigned char even_odd_index =
+    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},  /* even-indexed bytes first, then odd-indexed */
+    pb_zero = { __16X(0) };  /* NOTE(review): not referenced directly; presumably used by VEC_UNPACK*U */
+  /* Presumably pads each input row out to output_cols * 2 samples (see jcsample.h). */
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+  /* One output row is produced from one input row. */
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+    /* Each pass reads up to 32 input bytes and stores 16 output bytes. */
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr += 32, outptr += 16) {
+      /* First 16 input bytes -> 8 output samples (outl). */
+      this0 = vec_ld(0, inptr);
+      this0 = vec_perm(this0, this0, even_odd_index);  /* split into even half / odd half */
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);  /* presumably even bytes widened to 16 bits */
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);  /* presumably odd bytes widened to 16 bits */
+      outl = vec_add(this0e, this0o);  /* even + odd neighbour */
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_one);  /* >> 1 */
+      /* The second 16 input bytes are only read when more than 8 columns remain. */
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        outh = vec_add(next0e, next0o);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_one);
+      } else
+        outh = vec_splat_u16(0);  /* no second half: upper 8 output bytes become zero */
+      /* Narrow both halves back to bytes; a full 16-byte vector is always stored. */
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
+
+
+void
+jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+  /* h2v2: each output sample = (2x2 block of input samples + bias) >> 2. */
+  __vector unsigned char this0, next0, this1, next1, out;
+  __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
+    next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+  /* Constants */
+  __vector unsigned short pw_bias = { __4X2(1, 2) },  /* presumably {1,2,1,2,..}; TODO confirm __4X2 */
+    pw_two = { __8X(2) };  /* shift count used for the divide by 4 */
+  __vector unsigned char even_odd_index =
+    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },  /* even-indexed bytes first, then odd */
+    pb_zero = { __16X(0) };  /* NOTE(review): not referenced directly; presumably used by VEC_UNPACK*U */
+  /* Presumably pads each input row out to output_cols * 2 samples (see jcsample.h). */
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+  /* One output row is produced from two consecutive input rows. */
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+    /* Each pass reads up to 32 bytes per input row and stores 16 output bytes. */
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
+      /* Upper row, first 16 bytes: horizontal pair sums. */
+      this0 = vec_ld(0, inptr0);
+      this0 = vec_perm(this0, this0, even_odd_index);  /* split into even half / odd half */
+      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);  /* presumably even bytes widened to 16 bits */
+      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);  /* presumably odd bytes widened to 16 bits */
+      out0l = vec_add(this0e, this0o);
+      /* Lower row, first 16 bytes: horizontal pair sums. */
+      this1 = vec_ld(0, inptr1);
+      this1 = vec_perm(this1, this1, even_odd_index);
+      this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
+      this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
+      out1l = vec_add(this1e, this1o);
+      /* Combine the two rows, add the bias and divide by 4. */
+      outl = vec_add(out0l, out1l);
+      outl = vec_add(outl, pw_bias);
+      outl = vec_sr(outl, pw_two);
+      /* The second 16 input bytes are only read when more than 8 columns remain. */
+      if (outcol > 8) {
+        next0 = vec_ld(16, inptr0);
+        next0 = vec_perm(next0, next0, even_odd_index);
+        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
+        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
+        out0h = vec_add(next0e, next0o);
+
+        next1 = vec_ld(16, inptr1);
+        next1 = vec_perm(next1, next1, even_odd_index);
+        next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
+        next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
+        out1h = vec_add(next1e, next1o);
+
+        outh = vec_add(out0h, out1h);
+        outh = vec_add(outh, pw_bias);
+        outh = vec_sr(outh, pw_two);
+      } else
+        outh = vec_splat_u16(0);  /* no second half: upper 8 output bytes become zero */
+      /* Narrow both halves back to bytes; a full 16-byte vector is always stored. */
+      out = vec_pack(outl, outh);
+      vec_st(out, 0, outptr);
+    }
+  }
+}
diff --git a/simd/jcsample-mmx.asm b/simd/jcsample-mmx.asm
new file mode 100644
index 0000000..6881a56
--- /dev/null
+++ b/simd/jcsample-mmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsample-mmx.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION image_width
+%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
+%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
+%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
+%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
+
+        align   16
+        global  EXTN(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+        ; Overview: pad the input rows on the right, then average each horizontal sample pair.
+        mov     ecx, JDIMENSION [width_blks(ebp)]
+        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
+        jz      near .return            ; result is zero: no output columns to produce
+
+        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
+
+        ; -- expand_right_edge
+
+        push    ecx                             ; save output_cols for the downsample pass
+        shl     ecx,1                           ; output_cols * 2
+        sub     ecx,edx                         ; ecx = columns to pad (output_cols*2 - image_width)
+        jle     short .expand_end               ; no padding needed
+
+        mov     eax, INT [max_v_samp(ebp)]      ; eax = number of input rows to pad
+        test    eax,eax
+        jle     short .expand_end               ; no rows
+
+        cld                                     ; make stosb advance edi upward
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        alignx  16,7
+.expandloop:
+        push    eax                             ; al is overwritten below
+        push    ecx                             ; rep stosb counts ecx down to zero
+
+        mov     edi, JSAMPROW [esi]             ; edi = start of this input row
+        add     edi,edx                         ; edi = one past the last real sample
+        mov     al, JSAMPLE [edi-1]             ; al = last real sample of the row
+
+        rep stosb                               ; replicate it into the ecx padding columns
+
+        pop     ecx
+        pop     eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
+        dec     eax
+        jg      short .expandloop
+
+.expand_end:
+        pop     ecx                             ; output_cols
+
+        ; -- h2v1_downsample
+
+        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
+        test    eax,eax
+        jle     near .return                    ; no rows to produce
+
+        mov       edx, 0x00010000       ; bias pattern
+        movd      mm7,edx
+        pcmpeqw   mm6,mm6               ; mm6 = all ones
+        punpckldq mm7,mm7               ; mm7={0, 1, 0, 1}
+        psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
+        alignx  16,7
+.rowloop:
+        push    ecx                             ; the column loop consumes ecx, esi and edi
+        push    edi
+        push    esi
+
+        mov     esi, JSAMPROW [esi]             ; inptr
+        mov     edi, JSAMPROW [edi]             ; outptr
+        alignx  16,7
+.columnloop:
+        ; 16 input samples in, 8 output samples out per iteration
+        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]       ; input samples 0-7
+        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]       ; input samples 8-15
+        movq    mm2,mm0
+        movq    mm3,mm1
+
+        pand    mm0,mm6                 ; mm0 = even-indexed samples as words
+        psrlw   mm2,BYTE_BIT            ; mm2 = odd-indexed samples as words
+        pand    mm1,mm6
+        psrlw   mm3,BYTE_BIT
+
+        paddw   mm0,mm2                 ; even + odd neighbour
+        paddw   mm1,mm3
+        paddw   mm0,mm7                 ; + bias
+        paddw   mm1,mm7
+        psrlw   mm0,1                   ; / 2
+        psrlw   mm1,1
+
+        packuswb mm0,mm1                ; 4 words + 4 words -> 8 output bytes
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
+        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
+        sub     ecx, byte SIZEOF_MMWORD         ; outcol
+        jnz     short .columnloop
+
+        pop     esi
+        pop     edi
+        pop     ecx
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_data
+        add     edi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     eax                             ; rowctr
+        jg      short .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION image_width
+%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
+%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
+%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
+%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
+
+        align   16
+        global  EXTN(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+        ; Overview: pad the input rows on the right, then average each 2x2 block of samples.
+        mov     ecx, JDIMENSION [width_blks(ebp)]
+        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
+        jz      near .return            ; result is zero: no output columns to produce
+
+        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
+
+        ; -- expand_right_edge
+
+        push    ecx                             ; save output_cols for the downsample pass
+        shl     ecx,1                           ; output_cols * 2
+        sub     ecx,edx                         ; ecx = columns to pad (output_cols*2 - image_width)
+        jle     short .expand_end               ; no padding needed
+
+        mov     eax, INT [max_v_samp(ebp)]      ; eax = number of input rows to pad
+        test    eax,eax
+        jle     short .expand_end               ; no rows
+
+        cld                                     ; make stosb advance edi upward
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        alignx  16,7
+.expandloop:
+        push    eax                             ; al is overwritten below
+        push    ecx                             ; rep stosb counts ecx down to zero
+
+        mov     edi, JSAMPROW [esi]             ; edi = start of this input row
+        add     edi,edx                         ; edi = one past the last real sample
+        mov     al, JSAMPLE [edi-1]             ; al = last real sample of the row
+
+        rep stosb                               ; replicate it into the ecx padding columns
+
+        pop     ecx
+        pop     eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
+        dec     eax
+        jg      short .expandloop
+
+.expand_end:
+        pop     ecx                             ; output_cols
+
+        ; -- h2v2_downsample
+
+        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
+        test    eax,eax
+        jle     near .return                    ; no rows to produce
+
+        mov       edx, 0x00020001       ; bias pattern
+        movd      mm7,edx
+        pcmpeqw   mm6,mm6               ; mm6 = all ones
+        punpckldq mm7,mm7               ; mm7={1, 2, 1, 2}
+        psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
+        alignx  16,7
+.rowloop:
+        push    ecx                             ; the column loop consumes ecx, esi and edi
+        push    edi
+        push    esi
+
+        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
+        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
+        mov     edi, JSAMPROW [edi]                     ; outptr
+        alignx  16,7
+.columnloop:
+        ; 16 samples from each of two input rows in, 8 output samples out per iteration
+        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]       ; row 0, samples 0-7
+        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]       ; row 1, samples 0-7
+        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]       ; row 0, samples 8-15
+        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]       ; row 1, samples 8-15
+
+        movq    mm4,mm0
+        movq    mm5,mm1
+        pand    mm0,mm6                 ; even-indexed samples as words
+        psrlw   mm4,BYTE_BIT            ; odd-indexed samples as words
+        pand    mm1,mm6
+        psrlw   mm5,BYTE_BIT
+        paddw   mm0,mm4                 ; row 0 horizontal pair sums (samples 0-7)
+        paddw   mm1,mm5                 ; row 1 horizontal pair sums (samples 0-7)
+
+        movq    mm4,mm2
+        movq    mm5,mm3
+        pand    mm2,mm6
+        psrlw   mm4,BYTE_BIT
+        pand    mm3,mm6
+        psrlw   mm5,BYTE_BIT
+        paddw   mm2,mm4                 ; row 0 horizontal pair sums (samples 8-15)
+        paddw   mm3,mm5                 ; row 1 horizontal pair sums (samples 8-15)
+
+        paddw   mm0,mm1                 ; add the two rows: 2x2 block sums
+        paddw   mm2,mm3
+        paddw   mm0,mm7                 ; + bias
+        paddw   mm2,mm7
+        psrlw   mm0,2                   ; / 4
+        psrlw   mm2,2
+
+        packuswb mm0,mm2                ; 4 words + 4 words -> 8 output bytes
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
+        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
+        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
+        sub     ecx, byte SIZEOF_MMWORD         ; outcol
+        jnz     near .columnloop
+
+        pop     esi
+        pop     edi
+        pop     ecx
+
+        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
+        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
+        dec     eax                             ; rowctr
+        jg      near .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcsample-sse2-64.asm b/simd/jcsample-sse2-64.asm
new file mode 100644
index 0000000..7693285
--- /dev/null
+++ b/simd/jcsample-sse2-64.asm
@@ -0,0 +1,330 @@
+;
+; jcsample-sse2-64.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+        align   16
+        global  EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+        push    rbp
+        mov     rax,rsp         ; NOTE(review): presumably read by collect_args
+        mov     rbp,rsp
+        collect_args
+
+        mov ecx, r13d                   ; width_blocks (32-bit, zero-extended)
+        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
+        jz      near .return            ; zero blocks: nothing to do
+
+        mov edx, r10d                   ; image_width (32-bit, zero-extended)
+
+        ; -- expand_right_edge: pad every row up to output_cols*2 samples
+
+        push    rcx                     ; save output_cols
+        shl     rcx,1                           ; output_cols * 2
+        sub     rcx,rdx                 ; rcx = samples to append per row
+        jle     short .expand_end       ; rows are already wide enough
+
+        mov     eax, r11d       ; max_v_samp_factor is an int: take only the
+        test    eax,eax         ; low 32 bits and sign-test them as 32-bit
+        jle     short .expand_end
+
+        cld                             ; rep stosb must store forward
+        mov     rsi, r14        ; input_data
+.expandloop:
+        push    rax                     ; row counter (al is reused below)
+        push    rcx                     ; pad count (rep stosb consumes rcx)
+
+        mov     rdi, JSAMPROW [rsi]     ; start of this row
+        add     rdi,rdx                 ; first sample past image_width
+        mov     al, JSAMPLE [rdi-1]     ; last real sample of the row
+
+        rep stosb                       ; replicate it rcx times
+
+        pop     rcx
+        pop     rax
+
+        add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
+        dec     rax
+        jg      short .expandloop
+
+.expand_end:
+        pop     rcx                             ; output_cols
+
+        ; -- h2v1_downsample
+
+        mov     eax, r12d        ; rowctr
+        test    eax,eax
+        jle     near .return
+
+        mov     rdx, 0x00010000         ; bias pattern
+        movd    xmm7,edx
+        pcmpeqw xmm6,xmm6
+        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+        mov     rsi, r14        ; input_data
+        mov     rdi, r15        ; output_data
+.rowloop:
+        push    rcx                     ; output_cols, reused for every row
+        push    rdi                     ; position in output_data
+        push    rsi                     ; position in input_data
+
+        mov     rsi, JSAMPROW [rsi]             ; inptr
+        mov rdi, JSAMPROW [rdi]         ; outptr
+
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     short .columnloop
+
+.columnloop_r8:                 ; fewer than SIZEOF_XMMWORD columns remain
+        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        pxor    xmm1,xmm1               ; second input XMMWORD is not read
+        mov     rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx = 0
+        jmp     short .downsample
+
+.columnloop:
+        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+        movdqa  xmm2,xmm0
+        movdqa  xmm3,xmm1
+
+        pand    xmm0,xmm6               ; even-numbered samples as words
+        psrlw   xmm2,BYTE_BIT           ; odd-numbered samples as words
+        pand    xmm1,xmm6
+        psrlw   xmm3,BYTE_BIT
+
+        paddw   xmm0,xmm2               ; sum of each horizontal pair
+        paddw   xmm1,xmm3
+        paddw   xmm0,xmm7               ; add the alternating 0/1 bias
+        paddw   xmm1,xmm7
+        psrlw   xmm0,1                  ; divide by 2
+        psrlw   xmm1,1
+
+        packuswb xmm0,xmm1              ; words back to bytes
+
+        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0    ; always a full XMMWORD
+
+        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
+        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
+        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     short .columnloop
+        test    rcx,rcx
+        jnz     short .columnloop_r8
+
+        pop     rsi
+        pop     rdi
+        pop     rcx
+
+        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
+        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     rax                             ; rowctr
+        jg      near .rowloop
+
+.return:
+        uncollect_args
+        pop     rbp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+        align   16
+        global  EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+        push    rbp
+        mov     rax,rsp         ; NOTE(review): presumably read by collect_args
+        mov     rbp,rsp
+        collect_args
+
+        mov     ecx, r13d               ; width_blocks (32-bit, zero-extended)
+        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
+        jz      near .return            ; zero blocks: nothing to do
+
+        mov     edx, r10d               ; image_width (32-bit, zero-extended)
+
+        ; -- expand_right_edge: pad every row up to output_cols*2 samples
+
+        push    rcx                     ; save output_cols
+        shl     rcx,1                           ; output_cols * 2
+        sub     rcx,rdx                 ; rcx = samples to append per row
+        jle     short .expand_end       ; rows are already wide enough
+
+        mov     eax, r11d       ; max_v_samp_factor is an int: take only the
+        test    eax,eax         ; low 32 bits and sign-test them as 32-bit
+        jle     short .expand_end
+
+        cld                             ; rep stosb must store forward
+        mov     rsi, r14        ; input_data
+.expandloop:
+        push    rax                     ; row counter (al is reused below)
+        push    rcx                     ; pad count (rep stosb consumes rcx)
+
+        mov     rdi, JSAMPROW [rsi]     ; start of this row
+        add     rdi,rdx                 ; first sample past image_width
+        mov     al, JSAMPLE [rdi-1]     ; last real sample of the row
+
+        rep stosb                       ; replicate it rcx times
+
+        pop     rcx
+        pop     rax
+
+        add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
+        dec     rax
+        jg      short .expandloop
+
+.expand_end:
+        pop     rcx                             ; output_cols
+
+        ; -- h2v2_downsample
+
+        mov     eax, r12d        ; rowctr
+        test    eax,eax         ; 32-bit test, matching the h2v1 routine
+        jle     near .return
+
+        mov     rdx, 0x00020001         ; bias pattern
+        movd    xmm7,edx
+        pcmpeqw xmm6,xmm6
+        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+        mov     rsi, r14        ; input_data
+        mov     rdi, r15        ; output_data
+.rowloop:
+        push    rcx                     ; output_cols, reused for every row
+        push    rdi                     ; position in output_data
+        push    rsi                     ; position in input_data
+
+        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
+        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
+        mov     rdi, JSAMPROW [rdi]                     ; outptr
+
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     short .columnloop
+
+.columnloop_r8:                 ; fewer than SIZEOF_XMMWORD columns remain
+        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        pxor    xmm2,xmm2               ; second XMMWORD of each row is
+        pxor    xmm3,xmm3               ; not read; use zeros instead
+        mov     rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx = 0
+        jmp     short .downsample
+
+.columnloop:
+        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+        movdqa  xmm4,xmm0
+        movdqa  xmm5,xmm1
+        pand    xmm0,xmm6               ; even-numbered samples as words
+        psrlw   xmm4,BYTE_BIT           ; odd-numbered samples as words
+        pand    xmm1,xmm6
+        psrlw   xmm5,BYTE_BIT
+        paddw   xmm0,xmm4               ; horizontal pair sums, row 0
+        paddw   xmm1,xmm5               ; horizontal pair sums, row 1
+
+        movdqa  xmm4,xmm2               ; same again for the second XMMWORD
+        movdqa  xmm5,xmm3
+        pand    xmm2,xmm6
+        psrlw   xmm4,BYTE_BIT
+        pand    xmm3,xmm6
+        psrlw   xmm5,BYTE_BIT
+        paddw   xmm2,xmm4
+        paddw   xmm3,xmm5
+
+        paddw   xmm0,xmm1               ; add the two rows: 2x2 sums
+        paddw   xmm2,xmm3
+        paddw   xmm0,xmm7               ; add the alternating 1/2 bias
+        paddw   xmm2,xmm7
+        psrlw   xmm0,2                  ; divide by 4
+        psrlw   xmm2,2
+
+        packuswb xmm0,xmm2              ; words back to bytes
+
+        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0    ; always a full XMMWORD
+
+        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
+        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
+        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
+        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jae     near .columnloop
+        test    rcx,rcx
+        jnz     near .columnloop_r8
+
+        pop     rsi
+        pop     rdi
+        pop     rcx
+
+        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
+        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
+        dec     rax                             ; rowctr
+        jg      near .rowloop
+
+.return:
+        uncollect_args
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcsample-sse2.asm b/simd/jcsample-sse2.asm
new file mode 100644
index 0000000..11202db
--- /dev/null
+++ b/simd/jcsample-sse2.asm
@@ -0,0 +1,351 @@
+;
+; jcsample-sse2.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION image_width
+%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
+%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
+%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
+%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
+
+        align   16
+        global  EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     ecx, JDIMENSION [width_blks(ebp)]
+        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
+        jz      near .return            ; zero blocks: nothing to do
+
+        mov     edx, JDIMENSION [img_width(ebp)]
+
+        ; -- expand_right_edge
+
+        push    ecx                     ; save output_cols
+        shl     ecx,1                           ; output_cols * 2
+        sub     ecx,edx                 ; ecx = samples to append per row
+        jle     short .expand_end       ; rows are already wide enough
+
+        mov     eax, INT [max_v_samp(ebp)]      ; number of rows to pad
+        test    eax,eax
+        jle     short .expand_end
+
+        cld                             ; rep stosb must store forward
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        alignx  16,7
+.expandloop:
+        push    eax                     ; row counter (al is reused below)
+        push    ecx                     ; pad count (rep stosb consumes ecx)
+
+        mov     edi, JSAMPROW [esi]     ; start of this row
+        add     edi,edx                 ; first sample past image_width
+        mov     al, JSAMPLE [edi-1]     ; last real sample of the row
+
+        rep stosb                       ; replicate it ecx times
+
+        pop     ecx
+        pop     eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
+        dec     eax
+        jg      short .expandloop
+
+.expand_end:
+        pop     ecx                             ; output_cols
+
+        ; -- h2v1_downsample
+
+        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
+        test    eax,eax
+        jle     near .return
+
+        mov     edx, 0x00010000         ; bias pattern
+        movd    xmm7,edx
+        pcmpeqw xmm6,xmm6
+        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
+        alignx  16,7
+.rowloop:
+        push    ecx                     ; output_cols, reused for every row
+        push    edi                     ; position in output_data
+        push    esi                     ; position in input_data
+
+        mov     esi, JSAMPROW [esi]             ; inptr
+        mov     edi, JSAMPROW [edi]             ; outptr
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     short .columnloop
+        alignx  16,7
+
+.columnloop_r8:                 ; fewer than SIZEOF_XMMWORD columns remain
+        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        pxor    xmm1,xmm1               ; second input XMMWORD is not read
+        mov     ecx, SIZEOF_XMMWORD     ; so the sub below leaves ecx = 0
+        jmp     short .downsample
+        alignx  16,7
+
+.columnloop:
+        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+        movdqa  xmm2,xmm0
+        movdqa  xmm3,xmm1
+
+        pand    xmm0,xmm6               ; even-numbered samples as words
+        psrlw   xmm2,BYTE_BIT           ; odd-numbered samples as words
+        pand    xmm1,xmm6
+        psrlw   xmm3,BYTE_BIT
+
+        paddw   xmm0,xmm2               ; sum of each horizontal pair
+        paddw   xmm1,xmm3
+        paddw   xmm0,xmm7               ; add the alternating 0/1 bias
+        paddw   xmm1,xmm7
+        psrlw   xmm0,1                  ; divide by 2
+        psrlw   xmm1,1
+
+        packuswb xmm0,xmm1              ; words back to bytes
+
+        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0    ; always a full XMMWORD
+
+        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
+        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
+        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     short .columnloop
+        test    ecx,ecx
+        jnz     short .columnloop_r8
+
+        pop     esi
+        pop     edi
+        pop     ecx
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_data
+        add     edi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     eax                             ; rowctr
+        jg      near .rowloop
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b)+8           ; JDIMENSION image_width
+%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
+%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
+%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
+%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
+
+        align   16
+        global  EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     ecx, JDIMENSION [width_blks(ebp)]
+        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
+        jz      near .return            ; zero blocks: nothing to do
+
+        mov     edx, JDIMENSION [img_width(ebp)]
+
+        ; -- expand_right_edge
+
+        push    ecx                     ; save output_cols
+        shl     ecx,1                           ; output_cols * 2
+        sub     ecx,edx                 ; ecx = samples to append per row
+        jle     short .expand_end       ; rows are already wide enough
+
+        mov     eax, INT [max_v_samp(ebp)]      ; number of rows to pad
+        test    eax,eax
+        jle     short .expand_end
+
+        cld                             ; rep stosb must store forward
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        alignx  16,7
+.expandloop:
+        push    eax                     ; row counter (al is reused below)
+        push    ecx                     ; pad count (rep stosb consumes ecx)
+
+        mov     edi, JSAMPROW [esi]     ; start of this row
+        add     edi,edx                 ; first sample past image_width
+        mov     al, JSAMPLE [edi-1]     ; last real sample of the row
+
+        rep stosb                       ; replicate it ecx times
+
+        pop     ecx
+        pop     eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
+        dec     eax
+        jg      short .expandloop
+
+.expand_end:
+        pop     ecx                             ; output_cols
+
+        ; -- h2v2_downsample
+
+        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
+        test    eax,eax
+        jle     near .return
+
+        mov     edx, 0x00020001         ; bias pattern
+        movd    xmm7,edx
+        pcmpeqw xmm6,xmm6
+        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
+        alignx  16,7
+.rowloop:
+        push    ecx                     ; output_cols, reused for every row
+        push    edi                     ; position in output_data
+        push    esi                     ; position in input_data
+
+        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
+        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
+        mov     edi, JSAMPROW [edi]                     ; outptr
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     short .columnloop
+        alignx  16,7
+
+.columnloop_r8:                 ; fewer than SIZEOF_XMMWORD columns remain
+        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        pxor    xmm2,xmm2               ; second XMMWORD of each row is
+        pxor    xmm3,xmm3               ; not read; use zeros instead
+        mov     ecx, SIZEOF_XMMWORD     ; so the sub below leaves ecx = 0
+        jmp     short .downsample
+        alignx  16,7
+
+.columnloop:
+        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+        movdqa  xmm4,xmm0
+        movdqa  xmm5,xmm1
+        pand    xmm0,xmm6               ; even-numbered samples as words
+        psrlw   xmm4,BYTE_BIT           ; odd-numbered samples as words
+        pand    xmm1,xmm6
+        psrlw   xmm5,BYTE_BIT
+        paddw   xmm0,xmm4               ; horizontal pair sums, row 0
+        paddw   xmm1,xmm5               ; horizontal pair sums, row 1
+
+        movdqa  xmm4,xmm2               ; same again for the second XMMWORD
+        movdqa  xmm5,xmm3
+        pand    xmm2,xmm6
+        psrlw   xmm4,BYTE_BIT
+        pand    xmm3,xmm6
+        psrlw   xmm5,BYTE_BIT
+        paddw   xmm2,xmm4
+        paddw   xmm3,xmm5
+
+        paddw   xmm0,xmm1               ; add the two rows: 2x2 sums
+        paddw   xmm2,xmm3
+        paddw   xmm0,xmm7               ; add the alternating 1/2 bias
+        paddw   xmm2,xmm7
+        psrlw   xmm0,2                  ; divide by 4
+        psrlw   xmm2,2
+
+        packuswb xmm0,xmm2              ; words back to bytes
+
+        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0    ; always a full XMMWORD
+
+        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
+        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
+        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
+        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jae     near .columnloop
+        test    ecx,ecx
+        jnz     near .columnloop_r8
+
+        pop     esi
+        pop     edi
+        pop     ecx
+
+        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
+        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
+        dec     eax                             ; rowctr
+        jg      near .rowloop
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jcsample.h b/simd/jcsample.h
new file mode 100644
index 0000000..2a50544
--- /dev/null
+++ b/simd/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Widen num_rows rows to output_cols by repeating each row's last sample. */
+LOCAL(void)
+expand_right_edge (JSAMPARRAY image_data, int num_rows,
+                   JDIMENSION input_cols, JDIMENSION output_cols)
+{
+  int pad = (int) (output_cols - input_cols);
+  int r, k;
+
+  if (pad <= 0)
+    return;
+
+  for (r = 0; r < num_rows; r++) {
+    JSAMPROW dst = image_data[r] + input_cols;
+    JSAMPLE fill = dst[-1];     /* don't need GETJSAMPLE() here */
+
+    for (k = 0; k < pad; k++)
+      dst[k] = fill;
+  }
+}
diff --git a/simd/jcsamss2-64.asm b/simd/jcsamss2-64.asm
deleted file mode 100644
index 9cd4d1c..0000000
--- a/simd/jcsamss2-64.asm
+++ /dev/null
@@ -1,330 +0,0 @@
-;
-; jcsamss2-64.asm - downsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_downsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov ecx, r13d
-	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
-	jz	near .return
-
-	mov edx, r10d
-
-	; -- expand_right_edge
-
-	push	rcx
-	shl	rcx,1				; output_cols * 2
-	sub	rcx,rdx
-	jle	short .expand_end
-
-	mov	rax, r11
-	test	rax,rax
-	jle	short .expand_end
-
-	cld
-	mov	rsi, r14	; input_data
-.expandloop:
-	push	rax
-	push	rcx
-
-	mov	rdi, JSAMPROW [rsi]
-	add	rdi,rdx
-	mov	al, JSAMPLE [rdi-1]
-
-	rep stosb
-
-	pop	rcx
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW
-	dec	rax
-	jg	short .expandloop
-
-.expand_end:
-	pop	rcx				; output_cols
-
-	; -- h2v1_downsample
-
-	mov	eax, r12d	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov	rdx, 0x00010000		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6
-	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	rsi, r14	; input_data
-	mov	rdi, r15	; output_data
-.rowloop:
-	push	rcx
-	push	rdi
-	push	rsi
-
-	mov	rsi, JSAMPROW [rsi]		; inptr
-	mov rdi, JSAMPROW [rdi]		; outptr
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	short .columnloop
-
-.columnloop_r8:
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	pxor	xmm1,xmm1
-	mov	rcx, SIZEOF_XMMWORD
-	jmp	short .downsample
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
-	movdqa	xmm2,xmm0
-	movdqa	xmm3,xmm1
-
-	pand	xmm0,xmm6
-	psrlw	xmm2,BYTE_BIT
-	pand	xmm1,xmm6
-	psrlw	xmm3,BYTE_BIT
-
-	paddw	xmm0,xmm2
-	paddw	xmm1,xmm3
-	paddw	xmm0,xmm7
-	paddw	xmm1,xmm7
-	psrlw	xmm0,1
-	psrlw	xmm1,1
-
-	packuswb xmm0,xmm1
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	rcx, byte SIZEOF_XMMWORD	; outcol
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	short .columnloop
-	test	rcx,rcx
-	jnz	short .columnloop_r8
-
-	pop	rsi
-	pop	rdi
-	pop	rcx
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte SIZEOF_JSAMPROW	; output_data
-	dec	rax				; rowctr
-	jg	near .rowloop
-
-.return:
-	uncollect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_downsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov	ecx, r13d
-	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
-	jz	near .return
-
-	mov	edx, r10d
-
-	; -- expand_right_edge
-
-	push	rcx
-	shl	rcx,1				; output_cols * 2
-	sub	rcx,rdx
-	jle	short .expand_end
-
-	mov	rax, r11
-	test	rax,rax
-	jle	short .expand_end
-
-	cld
-	mov	rsi, r14	; input_data
-.expandloop:
-	push	rax
-	push	rcx
-
-	mov	rdi, JSAMPROW [rsi]
-	add	rdi,rdx
-	mov	al, JSAMPLE [rdi-1]
-
-	rep stosb
-
-	pop	rcx
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW
-	dec	rax
-	jg	short .expandloop
-
-.expand_end:
-	pop	rcx				; output_cols
-
-	; -- h2v2_downsample
-
-	mov	eax, r12d	; rowctr
-	test	rax,rax
-	jle	near .return
-
-	mov	rdx, 0x00020001		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6
-	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	rsi, r14	; input_data
-	mov	rdi, r15	; output_data
-.rowloop:
-	push	rcx
-	push	rdi
-	push	rsi
-
-	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
-	mov	rdi, JSAMPROW [rdi]			; outptr
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	short .columnloop
-
-.columnloop_r8:
-	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	pxor	xmm2,xmm2
-	pxor	xmm3,xmm3
-	mov	rcx, SIZEOF_XMMWORD
-	jmp	short .downsample
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	pand	xmm0,xmm6
-	psrlw	xmm4,BYTE_BIT
-	pand	xmm1,xmm6
-	psrlw	xmm5,BYTE_BIT
-	paddw	xmm0,xmm4
-	paddw	xmm1,xmm5
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm5,xmm3
-	pand	xmm2,xmm6
-	psrlw	xmm4,BYTE_BIT
-	pand	xmm3,xmm6
-	psrlw	xmm5,BYTE_BIT
-	paddw	xmm2,xmm4
-	paddw	xmm3,xmm5
-
-	paddw	xmm0,xmm1
-	paddw	xmm2,xmm3
-	paddw	xmm0,xmm7
-	paddw	xmm2,xmm7
-	psrlw	xmm0,2
-	psrlw	xmm2,2
-
-	packuswb xmm0,xmm2
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	rcx, byte SIZEOF_XMMWORD	; outcol
-	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
-	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	rcx,rcx
-	jnz	near .columnloop_r8
-
-	pop	rsi
-	pop	rdi
-	pop	rcx
-
-	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
-	dec	rax				; rowctr
-	jg	near .rowloop
-
-.return:
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jcsamss2.asm b/simd/jcsamss2.asm
deleted file mode 100644
index feb979d..0000000
--- a/simd/jcsamss2.asm
+++ /dev/null
@@ -1,351 +0,0 @@
-;
-; jcsamss2.asm - downsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.  Rows narrower than 2*output_cols are first right-padded.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28		; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_downsample_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return		; no output columns: nothing to do
-
-	mov	edx, JDIMENSION [img_width(ebp)]	; edx = image_width
-
-	; -- expand_right_edge
-
-	push	ecx				; save output_cols
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx				; ecx = padding samples needed per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
-	test	eax,eax
-	jle	short .expand_end
-
-	cld					; make stosb advance edi upward
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax				; save row counter (al is reused below)
-	push	ecx				; save pad count (consumed by rep)
-
-	mov	edi, JSAMPROW [esi]		; edi = start of the current row
-	add	edi,edx				; edi = first column past image_width
-	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it ecx times
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v1_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov	edx, 0x00010000		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6		; xmm6 = all ones
-	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-
-	cmp	ecx, byte SIZEOF_XMMWORD	; a full XMMWORD of output left?
-	jae	short .columnloop
-	alignx	16,7
-
-.columnloop_r8:
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	pxor	xmm1,xmm1			; second input XMMWORD is taken as zero
-	mov	ecx, SIZEOF_XMMWORD		; makes this the final pass of the row
-	jmp	short .downsample
-	alignx	16,7
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
-	movdqa	xmm2,xmm0
-	movdqa	xmm3,xmm1
-
-	pand	xmm0,xmm6		; xmm0 = even-numbered samples, as words
-	psrlw	xmm2,BYTE_BIT		; xmm2 = odd-numbered samples, as words
-	pand	xmm1,xmm6
-	psrlw	xmm3,BYTE_BIT
-
-	paddw	xmm0,xmm2		; sum of each horizontal pair
-	paddw	xmm1,xmm3
-	paddw	xmm0,xmm7		; add the alternating 0/1 bias
-	paddw	xmm1,xmm7
-	psrlw	xmm0,1			; divide by 2
-	psrlw	xmm1,1
-
-	packuswb xmm0,xmm1		; pack the 16 word results back to bytes
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	ecx, byte SIZEOF_XMMWORD	; outcol
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	short .columnloop
-	test	ecx,ecx
-	jnz	short .columnloop_r8
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.  Rows narrower than 2*output_cols are first right-padded.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)	(b)+8			; JDIMENSION image_width
-%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
-%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
-%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
-%define input_data(b)	(b)+24		; JSAMPARRAY input_data
-%define output_data(b)	(b)+28	; JSAMPARRAY output_data
-
-	align	16
-	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_downsample_sse2):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	ecx, JDIMENSION [width_blks(ebp)]
-	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
-	jz	near .return		; no output columns: nothing to do
-
-	mov	edx, JDIMENSION [img_width(ebp)]	; edx = image_width
-
-	; -- expand_right_edge
-
-	push	ecx				; save output_cols
-	shl	ecx,1				; output_cols * 2
-	sub	ecx,edx				; ecx = padding samples needed per row
-	jle	short .expand_end		; rows are already wide enough
-
-	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
-	test	eax,eax
-	jle	short .expand_end
-
-	cld					; make stosb advance edi upward
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	alignx	16,7
-.expandloop:
-	push	eax				; save row counter (al is reused below)
-	push	ecx				; save pad count (consumed by rep)
-
-	mov	edi, JSAMPROW [esi]		; edi = start of the current row
-	add	edi,edx				; edi = first column past image_width
-	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
-
-	rep stosb				; replicate it ecx times
-
-	pop	ecx
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
-	dec	eax
-	jg	short .expandloop
-
-.expand_end:
-	pop	ecx				; output_cols
-
-	; -- h2v2_downsample
-
-	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
-	test	eax,eax
-	jle	near .return
-
-	mov	edx, 0x00020001		; bias pattern
-	movd	xmm7,edx
-	pcmpeqw	xmm6,xmm6		; xmm6 = all ones
-	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
-	alignx	16,7
-.rowloop:
-	push	ecx
-	push	edi
-	push	esi
-
-	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
-	mov	edi, JSAMPROW [edi]			; outptr
-
-	cmp	ecx, byte SIZEOF_XMMWORD	; a full XMMWORD of output left?
-	jae	short .columnloop
-	alignx	16,7
-
-.columnloop_r8:
-	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	pxor	xmm2,xmm2			; second XMMWORD of row 0 taken as zero
-	pxor	xmm3,xmm3			; second XMMWORD of row 1 taken as zero
-	mov	ecx, SIZEOF_XMMWORD		; makes this the final pass of the row
-	jmp	short .downsample
-	alignx	16,7
-
-.columnloop:
-	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
-	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
-	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	pand	xmm0,xmm6		; even-numbered samples of row 0, as words
-	psrlw	xmm4,BYTE_BIT		; odd-numbered samples of row 0, as words
-	pand	xmm1,xmm6		; even-numbered samples of row 1
-	psrlw	xmm5,BYTE_BIT		; odd-numbered samples of row 1
-	paddw	xmm0,xmm4		; horizontal pair sums, row 0
-	paddw	xmm1,xmm5		; horizontal pair sums, row 1
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm5,xmm3
-	pand	xmm2,xmm6
-	psrlw	xmm4,BYTE_BIT
-	pand	xmm3,xmm6
-	psrlw	xmm5,BYTE_BIT
-	paddw	xmm2,xmm4
-	paddw	xmm3,xmm5
-
-	paddw	xmm0,xmm1		; sum of each 2x2 group
-	paddw	xmm2,xmm3
-	paddw	xmm0,xmm7		; add the alternating 1/2 bias
-	paddw	xmm2,xmm7
-	psrlw	xmm0,2			; divide by 4
-	psrlw	xmm2,2
-
-	packuswb xmm0,xmm2		; pack the 16 word results back to bytes
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-	sub	ecx, byte SIZEOF_XMMWORD	; outcol
-	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
-	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jae	near .columnloop
-	test	ecx,ecx
-	jnz	near .columnloop_r8
-
-	pop	esi
-	pop	edi
-	pop	ecx
-
-	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
-	dec	eax				; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm
deleted file mode 100644
index d2aa165..0000000
--- a/simd/jdclrmmx.asm
+++ /dev/null
@@ -1,405 +0,0 @@
-;
-; jdclrmmx.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
-;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                            JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)	(b)+8			; JDIMENSION out_width
-%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
-%define input_row(b)	(b)+16		; JDIMENSION input_row
-%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE
-
-EXTN(jsimd_ycc_rgb_convert_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx			; save num_cols (ecx is reused for input_row)
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [input_row(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; input_buf[0]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; input_buf[1]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; input_buf[2]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; esi = &input_buf[0][input_row]
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &input_buf[1][input_row]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &input_buf[2][input_row]
-
-	pop	ecx			; ecx = num_cols again
-
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	push	eax
-	push	edi
-	push	edx
-	push	ebx
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr0
-	mov	ebx, JSAMPROW [ebx]	; inptr1
-	mov	edx, JSAMPROW [edx]	; inptr2
-	mov	edi, JSAMPROW [edi]	; outptr
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-	alignx	16,7
-.columnloop:
-
-	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
-	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
-
-	pcmpeqw	mm4,mm4
-	pcmpeqw	mm7,mm7
-	psrlw	mm4,BYTE_BIT
-	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
-
-	pand	mm4,mm5			; mm4=Cb(0246)=CbE
-	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
-	pand	mm0,mm1			; mm0=Cr(0246)=CrE
-	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
-
-	paddw	mm4,mm7			; adding 0xFF80 subtracts 128 from each word
-	paddw	mm5,mm7
-	paddw	mm0,mm7
-	paddw	mm1,mm7
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movq	mm2,mm4			; mm2=CbE
-	movq	mm3,mm5			; mm3=CbO
-	paddw	mm4,mm4			; mm4=2*CbE
-	paddw	mm5,mm5			; mm5=2*CbO
-	movq	mm6,mm0			; mm6=CrE
-	movq	mm7,mm1			; mm7=CrO
-	paddw	mm0,mm0			; mm0=2*CrE
-	paddw	mm1,mm1			; mm1=2*CrO
-
-	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
-	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
-	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
-	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
-
-	paddw	mm4,[GOTOFF(eax,PW_ONE)]
-	paddw	mm5,[GOTOFF(eax,PW_ONE)]
-	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
-	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
-	paddw	mm0,[GOTOFF(eax,PW_ONE)]
-	paddw	mm1,[GOTOFF(eax,PW_ONE)]
-	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
-	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
-
-	paddw	mm4,mm2
-	paddw	mm5,mm3
-	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
-	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
-	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
-	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
-	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
-
-	movq      mm4,mm2
-	movq      mm5,mm3
-	punpcklwd mm2,mm6
-	punpckhwd mm4,mm6
-	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd mm3,mm7
-	punpckhwd mm5,mm7
-	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm2,SCALEBITS
-	psrad     mm4,SCALEBITS
-	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm3,SCALEBITS
-	psrad     mm5,SCALEBITS
-
-	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
-
-	pcmpeqw   mm4,mm4
-	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
-	pand      mm4,mm5		; mm4=Y(0246)=YE
-	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
-
-	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
-	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
-	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
-	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
-	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
-	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
-	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
-	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
-	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
-	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
-	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
-
-	movq      mmG,mmA
-	movq      mmH,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
-	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
-
-	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
-	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
-
-	movq      mmC,mmD
-	movq      mmB,mmD
-	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
-	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
-
-	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
-
-	movq      mmF,mmE
-	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
-	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
-
-	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
-	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
-	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
-
-	cmp	ecx, byte SIZEOF_MMWORD	; fewer than a full MMWORD of pixels left?
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_MMWORD
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	mmA,mmC			; remaining bytes now live in mmA
-	sub	ecx, byte 2*SIZEOF_MMWORD
-	add	edi, byte 2*SIZEOF_MMWORD
-	jmp	short .column_st4
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmE			; remaining bytes now live in mmA
-	sub	ecx, byte SIZEOF_MMWORD
-	add	edi, byte SIZEOF_MMWORD
-.column_st4:
-	movd	eax,mmA			; eax = low dword of mmA (GOT address in eax is dropped)
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st2
-	mov	DWORD [edi+0*SIZEOF_DWORD], eax
-	psrlq	mmA,DWORD_BIT
-	movd	eax,mmA
-	sub	ecx, byte SIZEOF_DWORD
-	add	edi, byte SIZEOF_DWORD
-.column_st2:
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi+0*SIZEOF_WORD], ax
-	shr	eax,WORD_BIT
-	sub	ecx, byte SIZEOF_WORD
-	add	edi, byte SIZEOF_WORD
-.column_st1:
-	cmp	ecx, byte SIZEOF_BYTE
-	jb	short .nextrow
-	mov	BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
-	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
-	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
-
-	movq      mmC,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
-	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
-	movq      mmG,mmB
-	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
-	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
-
-	movq      mmD,mmA
-	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
-	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
-	movq      mmH,mmC
-	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
-	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
-
-	cmp	ecx, byte SIZEOF_MMWORD	; fewer than a full MMWORD of pixels left?
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	cmp	ecx, byte SIZEOF_MMWORD/2
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	mmA,mmC
-	movq	mmD,mmH
-	sub	ecx, byte SIZEOF_MMWORD/2
-	add	edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD/4
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmD
-	sub	ecx, byte SIZEOF_MMWORD/4
-	add	edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-	cmp	ecx, byte SIZEOF_MMWORD/8
-	jb	short .nextrow
-	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	alignx	16,7
-
-.nextrow:
-	pop	ecx
-	pop	esi
-	pop	ebx
-	pop	edx
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW
-	add	ebx, byte SIZEOF_JSAMPROW
-	add	edx, byte SIZEOF_JSAMPROW
-	add	edi, byte SIZEOF_JSAMPROW	; output_buf
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
deleted file mode 100644
index 37f7468..0000000
--- a/simd/jdclrss2-64.asm
+++ /dev/null
@@ -1,441 +0,0 @@
-;
-; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009, 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-; r10 = JDIMENSION out_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION input_row
-; r13 = JSAMPARRAY output_buf
-; r14 = int num_rows
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args		; macro defined elsewhere; presumably fills r10-r14 as listed above -- verify
-	push	rbx
-
-	mov	ecx, r10d	; num_cols
-	test	rcx,rcx
-	jz	near .return
-
-	push	rcx			; save num_cols (rcx is reused for input_row)
-
-	mov	rdi, r11
-	mov	ecx, r12d
-	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]	; input_buf[0]
-	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]	; input_buf[1]
-	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]	; input_buf[2]
-	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]	; rsi = &input_buf[0][input_row]
-	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]	; rbx = &input_buf[1][input_row]
-	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]	; rdx = &input_buf[2][input_row]
-
-	pop	rcx			; rcx = num_cols again
-
-	mov	rdi, r13
-	mov	eax, r14d
-	test	rax,rax
-	jle	near .return
-.rowloop:
-	push	rax
-	push	rdi
-	push	rdx
-	push	rbx
-	push	rsi
-	push	rcx			; col
-
-	mov	rsi, JSAMPROW [rsi]	; inptr0
-	mov	rbx, JSAMPROW [rbx]	; inptr1
-	mov	rdx, JSAMPROW [rdx]	; inptr2
-	mov	rdi, JSAMPROW [rdi]	; outptr
-.columnloop:
-
-	movdqa	xmm5, XMMWORD [rbx]	; xmm5=Cb(0123456789ABCDEF)
-	movdqa	xmm1, XMMWORD [rdx]	; xmm1=Cr(0123456789ABCDEF)
-
-	pcmpeqw	xmm4,xmm4
-	pcmpeqw	xmm7,xmm7
-	psrlw	xmm4,BYTE_BIT
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
-	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
-	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
-	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
-
-	paddw	xmm4,xmm7		; adding 0xFF80 subtracts 128 from each word
-	paddw	xmm5,xmm7
-	paddw	xmm0,xmm7
-	paddw	xmm1,xmm7
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm2,xmm4		; xmm2=CbE
-	movdqa	xmm3,xmm5		; xmm3=CbO
-	paddw	xmm4,xmm4		; xmm4=2*CbE
-	paddw	xmm5,xmm5		; xmm5=2*CbO
-	movdqa	xmm6,xmm0		; xmm6=CrE
-	movdqa	xmm7,xmm1		; xmm7=CrO
-	paddw	xmm0,xmm0		; xmm0=2*CrE
-	paddw	xmm1,xmm1		; xmm1=2*CrO
-
-	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbE * -FIX(0.22800))
-	pmulhw	xmm5,[rel PW_MF0228]	; xmm5=(2*CbO * -FIX(0.22800))
-	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrE * FIX(0.40200))
-	pmulhw	xmm1,[rel PW_F0402]	; xmm1=(2*CrO * FIX(0.40200))
-
-	paddw	xmm4,[rel PW_ONE]
-	paddw	xmm5,[rel PW_ONE]
-	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
-	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
-	paddw	xmm0,[rel PW_ONE]
-	paddw	xmm1,[rel PW_ONE]
-	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
-	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
-
-	paddw	xmm4,xmm2
-	paddw	xmm5,xmm3
-	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm5,xmm3
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm4,xmm6
-	pmaddwd   xmm2,[rel PW_MF0344_F0285]
-	pmaddwd   xmm4,[rel PW_MF0344_F0285]
-	punpcklwd xmm3,xmm7
-	punpckhwd xmm5,xmm7
-	pmaddwd   xmm3,[rel PW_MF0344_F0285]
-	pmaddwd   xmm5,[rel PW_MF0344_F0285]
-
-	paddd     xmm2,[rel PD_ONEHALF]
-	paddd     xmm4,[rel PD_ONEHALF]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm4,SCALEBITS
-	paddd     xmm3,[rel PD_ONEHALF]
-	paddd     xmm5,[rel PD_ONEHALF]
-	psrad     xmm3,SCALEBITS
-	psrad     xmm5,SCALEBITS
-
-	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-	movdqa    xmm5, XMMWORD [rsi]	; xmm5=Y(0123456789ABCDEF)
-
-	pcmpeqw   xmm4,xmm4
-	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
-	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
-	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
-
-	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD	; fewer than a full XMMWORD of pixels left?
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1		; is outptr XMMWORD-aligned?
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	lea	rcx, [rcx+rcx*2]		; imul rcx, RGB_PIXELSIZE
-	cmp	rcx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	rcx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_MMWORD
-	sub	rcx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_DWORD
-	sub	rcx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of rax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	rcx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [rdi], ax
-	add	rdi, byte SIZEOF_WORD
-	sub	rcx, byte SIZEOF_WORD
-	shr	rax, 16
-.column_st1:
-	; Store the lower 1 byte of rax to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .nextrow
-	mov	BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD	; fewer than a full XMMWORD of pixels left?
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1		; is outptr XMMWORD-aligned?
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	cmp	rcx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD/8*4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .nextrow
-	movd	XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.nextrow:
-	pop	rcx
-	pop	rsi
-	pop	rbx
-	pop	rdx
-	pop	rdi
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW
-	add	rbx, byte SIZEOF_JSAMPROW
-	add	rdx, byte SIZEOF_JSAMPROW
-	add	rdi, byte SIZEOF_JSAMPROW	; output_buf
-	dec	rax				; num_rows
-	jg	near .rowloop
-
-	sfence		; flush the write buffer (the aligned path uses movntdq stores)
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
deleted file mode 100644
index 98402c6..0000000
--- a/simd/jdclrss2.asm
+++ /dev/null
@@ -1,460 +0,0 @@
-;
-; jdclrss2.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)	(b)+8			; JDIMENSION out_width
-%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
-%define input_row(b)	(b)+16		; JDIMENSION input_row
-%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
-%define num_rows(b)	(b)+24		; int num_rows
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [input_row(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
-	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-	pop	ecx
-
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	eax, INT [num_rows(eax)]
-	test	eax,eax
-	jle	near .return
-	alignx	16,7
-.rowloop:
-	push	eax
-	push	edi
-	push	edx
-	push	ebx
-	push	esi
-	push	ecx			; col
-
-	mov	esi, JSAMPROW [esi]	; inptr0
-	mov	ebx, JSAMPROW [ebx]	; inptr1
-	mov	edx, JSAMPROW [edx]	; inptr2
-	mov	edi, JSAMPROW [edi]	; outptr
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-	alignx	16,7
-.columnloop:
-
-	movdqa	xmm5, XMMWORD [ebx]	; xmm5=Cb(0123456789ABCDEF)
-	movdqa	xmm1, XMMWORD [edx]	; xmm1=Cr(0123456789ABCDEF)
-
-	pcmpeqw	xmm4,xmm4
-	pcmpeqw	xmm7,xmm7
-	psrlw	xmm4,BYTE_BIT
-	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
-	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
-	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
-	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
-
-	paddw	xmm4,xmm7
-	paddw	xmm5,xmm7
-	paddw	xmm0,xmm7
-	paddw	xmm1,xmm7
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm2,xmm4		; xmm2=CbE
-	movdqa	xmm3,xmm5		; xmm3=CbO
-	paddw	xmm4,xmm4		; xmm4=2*CbE
-	paddw	xmm5,xmm5		; xmm5=2*CbO
-	movdqa	xmm6,xmm0		; xmm6=CrE
-	movdqa	xmm7,xmm1		; xmm7=CrO
-	paddw	xmm0,xmm0		; xmm0=2*CrE
-	paddw	xmm1,xmm1		; xmm1=2*CrO
-
-	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbE * -FIX(0.22800))
-	pmulhw	xmm5,[GOTOFF(eax,PW_MF0228)]	; xmm5=(2*CbO * -FIX(0.22800))
-	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrE * FIX(0.40200))
-	pmulhw	xmm1,[GOTOFF(eax,PW_F0402)]	; xmm1=(2*CrO * FIX(0.40200))
-
-	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm5,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
-	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
-	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm1,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
-	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
-
-	paddw	xmm4,xmm2
-	paddw	xmm5,xmm3
-	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm5,xmm3
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm4,xmm6
-	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd xmm3,xmm7
-	punpckhwd xmm5,xmm7
-	pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm4,SCALEBITS
-	paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm3,SCALEBITS
-	psrad     xmm5,SCALEBITS
-
-	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-	movdqa    xmm5, XMMWORD [esi]	; xmm5=Y(0123456789ABCDEF)
-
-	pcmpeqw   xmm4,xmm4
-	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
-	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
-	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
-
-	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	ecx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_MMWORD
-	sub	ecx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [edi], xmmA
-	add	edi, byte SIZEOF_DWORD
-	sub	ecx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of eax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi], ax
-	add	edi, byte SIZEOF_WORD
-	sub	ecx, byte SIZEOF_WORD
-	shr	eax, 16
-.column_st1:
-	; Store the lower 1 byte of eax to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .nextrow
-	mov	BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .nextrow
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	cmp	ecx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD/8*4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .nextrow
-	movd	XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-	alignx	16,7
-
-.nextrow:
-	pop	ecx
-	pop	esi
-	pop	ebx
-	pop	edx
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW
-	add	ebx, byte SIZEOF_JSAMPROW
-	add	edx, byte SIZEOF_JSAMPROW
-	add	edi, byte SIZEOF_JSAMPROW	; output_buf
-	dec	eax				; num_rows
-	jg	near .rowloop
-
-	sfence		; flush the write buffer
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdcolext-altivec.c b/simd/jdcolext-altivec.c
new file mode 100644
index 0000000..1ae91b9
--- /dev/null
+++ b/simd/jdcolext-altivec.c
@@ -0,0 +1,274 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-altivec.c */
+
+
+void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
+                                    JDIMENSION input_row,
+                                    JSAMPARRAY output_buf, int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = out_width * RGB_PIXELSIZE, num_cols;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
+    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+  __vector int g0, g1, g2, g3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+#endif
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = pitch; num_cols > 0;
+         num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
+         inptr0 += 16, inptr1 += 16, inptr2 += 16) {
+
+      y = vec_ld(0, inptr0);
+      /* NOTE: We have to use vec_merge*() (presumably via the VEC_UNPACK*U()
+       * macros) here because vec_unpack*() doesn't support unsigned vectors.
+       */
+      yl = (__vector signed short)VEC_UNPACKHU(y);
+      yh = (__vector signed short)VEC_UNPACKLU(y);
+
+      cb = vec_ld(0, inptr1);
+      cbl = (__vector signed short)VEC_UNPACKHU(cb);
+      cbh = (__vector signed short)VEC_UNPACKLU(cb);
+      cbl = vec_sub(cbl, pw_cj);
+      cbh = vec_sub(cbh, pw_cj);
+
+      cr = vec_ld(0, inptr2);
+      crl = (__vector signed short)VEC_UNPACKHU(cr);
+      crh = (__vector signed short)VEC_UNPACKLU(cr);
+      crl = vec_sub(crl, pw_cj);
+      crh = vec_sub(crh, pw_cj);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+      bl = vec_add(cbl, cbl);
+      bh = vec_add(cbh, cbh);
+      bl = vec_madds(bl, pw_mf0228, pw_one);
+      bh = vec_madds(bh, pw_mf0228, pw_one);
+      bl = vec_sra(bl, (__vector unsigned short)pw_one);
+      bh = vec_sra(bh, (__vector unsigned short)pw_one);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, cbl);
+      bh = vec_add(bh, cbh);
+      bl = vec_add(bl, yl);
+      bh = vec_add(bh, yh);
+
+      rl = vec_add(crl, crl);
+      rh = vec_add(crh, crh);
+      rl = vec_madds(rl, pw_f0402, pw_one);
+      rh = vec_madds(rh, pw_f0402, pw_one);
+      rl = vec_sra(rl, (__vector unsigned short)pw_one);
+      rh = vec_sra(rh, (__vector unsigned short)pw_one);
+      rl = vec_add(rl, crl);
+      rh = vec_add(rh, crh);
+      rl = vec_add(rl, yl);
+      rh = vec_add(rh, yh);
+
+      g0w = vec_mergeh(cbl, crl);
+      g1w = vec_mergel(cbl, crl);
+      g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
+      g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
+      g2w = vec_mergeh(cbh, crh);
+      g3w = vec_mergel(cbh, crh);
+      g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
+      g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
+      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+       * each dword into a new 16-bit vector, which is the equivalent of
+       * descaling the 32-bit results (right-shifting by 16 bits) and then
+       * packing them.
+       */
+      gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
+      gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
+      gl = vec_sub(gl, crl);
+      gh = vec_sub(gh, crh);
+      gl = vec_add(gl, yl);
+      gh = vec_add(gh, yh);
+
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path: stage in tmpbuf, then copy only the remaining bytes */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
diff --git a/simd/jdcolext-mmx.asm b/simd/jdcolext-mmx.asm
new file mode 100644
index 0000000..de1f00f
--- /dev/null
+++ b/simd/jdcolext-mmx.asm
@@ -0,0 +1,405 @@
+;
+; jdcolext-mmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+; YCbCr -> RGB colorspace conversion (MMX), 8 columns per inner-loop pass.
+; Convert some rows of samples to the output colorspace.
+; Returns without doing anything when out_width == 0 or num_rows <= 0.
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
+;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                            JSAMPARRAY output_buf, int num_rows)
+; Arguments are read from the stack relative to the unaligned frame base (b).
+
+%define out_width(b)    (b)+8           ; JDIMENSION out_width (columns per row)
+%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf (Y, Cb, Cr planes)
+%define input_row(b)    (b)+16          ; JDIMENSION input_row (first input row)
+%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf (output row ptrs)
+%define num_rows(b)     (b)+24          ; int num_rows (rows to convert)
+
+%define original_ebp    ebp+0           ; slot holding the unaligned frame base
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2               ; wk(0)=(B-Y)E, wk(1)=(B-Y)O
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+; YCbCr -> RGB conversion (MMX); each .columnloop pass converts 8 columns.
+        align   16
+        global  EXTN(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+        push    ebp
+        mov     eax,esp                         ; eax = unaligned frame base (args at eax+8..)
+        sub     esp, byte 4                     ; room for the saved frame base
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; save unaligned frame base at [aligned ebp]
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve the wk[] work area
+        pushpic eax             ; make room for the GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
+        test    ecx,ecx
+        jz      near .return                    ; out_width == 0: nothing to do
+
+        push    ecx                             ; keep num_cols; ecx is reused for input_row
+
+        mov     edi, JSAMPIMAGE [input_buf(eax)]
+        mov     ecx, JDIMENSION [input_row(eax)]
+        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]       ; input_buf[0]
+        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]       ; input_buf[1]
+        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]       ; input_buf[2]
+        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][input_row]
+        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &input_buf[1][input_row]
+        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &input_buf[2][input_row]
+
+        pop     ecx                             ; ecx = num_cols again
+
+        mov     edi, JSAMPARRAY [output_buf(eax)]
+        mov     eax, INT [num_rows(eax)]
+        test    eax,eax
+        jle     near .return                    ; num_rows <= 0: nothing to do
+        alignx  16,7
+.rowloop:
+        push    eax                     ; rows left
+        push    edi                     ; output row-pointer cursor
+        push    edx                     ; input_buf[2] row-pointer cursor
+        push    ebx                     ; input_buf[1] row-pointer cursor
+        push    esi                     ; input_buf[0] row-pointer cursor
+        push    ecx                     ; col
+
+        mov     esi, JSAMPROW [esi]     ; inptr0
+        mov     ebx, JSAMPROW [ebx]     ; inptr1
+        mov     edx, JSAMPROW [edx]     ; inptr2
+        mov     edi, JSAMPROW [edi]     ; outptr
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+        alignx  16,7
+.columnloop:
+
+        movq    mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
+        movq    mm1, MMWORD [edx]       ; mm1=Cr(01234567)
+
+        pcmpeqw mm4,mm4
+        pcmpeqw mm7,mm7
+        psrlw   mm4,BYTE_BIT
+        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+        movq    mm0,mm4                 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+        pand    mm4,mm5                 ; mm4=Cb(0246)=CbE
+        psrlw   mm5,BYTE_BIT            ; mm5=Cb(1357)=CbO
+        pand    mm0,mm1                 ; mm0=Cr(0246)=CrE
+        psrlw   mm1,BYTE_BIT            ; mm1=Cr(1357)=CrO
+
+        paddw   mm4,mm7                 ; add 0xFF80 (-128) to every chroma word
+        paddw   mm5,mm7
+        paddw   mm0,mm7
+        paddw   mm1,mm7
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movq    mm2,mm4                 ; mm2=CbE
+        movq    mm3,mm5                 ; mm3=CbO
+        paddw   mm4,mm4                 ; mm4=2*CbE
+        paddw   mm5,mm5                 ; mm5=2*CbO
+        movq    mm6,mm0                 ; mm6=CrE
+        movq    mm7,mm1                 ; mm7=CrO
+        paddw   mm0,mm0                 ; mm0=2*CrE
+        paddw   mm1,mm1                 ; mm1=2*CrO
+
+        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbE * -FIX(0.22800))
+        pmulhw  mm5,[GOTOFF(eax,PW_MF0228)]     ; mm5=(2*CbO * -FIX(0.22800))
+        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrE * FIX(0.40200))
+        pmulhw  mm1,[GOTOFF(eax,PW_F0402)]      ; mm1=(2*CrO * FIX(0.40200))
+
+        paddw   mm4,[GOTOFF(eax,PW_ONE)]        ; presumably +1 to round the >>1 below
+        paddw   mm5,[GOTOFF(eax,PW_ONE)]
+        psraw   mm4,1                   ; mm4=(CbE * -FIX(0.22800))
+        psraw   mm5,1                   ; mm5=(CbO * -FIX(0.22800))
+        paddw   mm0,[GOTOFF(eax,PW_ONE)]
+        paddw   mm1,[GOTOFF(eax,PW_ONE)]
+        psraw   mm0,1                   ; mm0=(CrE * FIX(0.40200))
+        psraw   mm1,1                   ; mm1=(CrO * FIX(0.40200))
+
+        paddw   mm4,mm2
+        paddw   mm5,mm3
+        paddw   mm4,mm2                 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+        paddw   mm5,mm3                 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+        paddw   mm0,mm6                 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+        paddw   mm1,mm7                 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
+        movq    MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
+
+        movq      mm4,mm2
+        movq      mm5,mm3
+        punpcklwd mm2,mm6               ; interleave Cb/Cr words for pmaddwd
+        punpckhwd mm4,mm6
+        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+        punpcklwd mm3,mm7
+        punpckhwd mm5,mm7
+        pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     mm2,SCALEBITS
+        psrad     mm4,SCALEBITS
+        paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     mm3,SCALEBITS
+        psrad     mm5,SCALEBITS
+
+        packssdw  mm2,mm4       ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+        packssdw  mm3,mm5       ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+        psubw     mm2,mm6       ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+        psubw     mm3,mm7       ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+        movq      mm5, MMWORD [esi]     ; mm5=Y(01234567)
+
+        pcmpeqw   mm4,mm4
+        psrlw     mm4,BYTE_BIT          ; mm4={0xFF 0x00 0xFF 0x00 ..}
+        pand      mm4,mm5               ; mm4=Y(0246)=YE
+        psrlw     mm5,BYTE_BIT          ; mm5=Y(1357)=YO
+
+        paddw     mm0,mm4               ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+        paddw     mm1,mm5               ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
+        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+        paddw     mm2,mm4               ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+        paddw     mm3,mm5               ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
+        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+        paddw     mm4, MMWORD [wk(0)]   ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+        paddw     mm5, MMWORD [wk(1)]   ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
+        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
+        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
+        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
+
+        movq      mmG,mmA
+        movq      mmH,mmA
+        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
+        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
+
+        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
+        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
+
+        movq      mmC,mmD
+        movq      mmB,mmD
+        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
+        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
+
+        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
+
+        movq      mmF,mmE
+        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
+        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
+
+        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
+        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
+        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
+
+        cmp     ecx, byte SIZEOF_MMWORD         ; a full group of columns left?
+        jb      short .column_st16              ; no: store only the remaining bytes
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
+        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+        sub     ecx, byte SIZEOF_MMWORD
+        jz      short .nextrow                  ; row complete
+
+        add     esi, byte SIZEOF_MMWORD                 ; inptr0
+        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
+        add     edx, byte SIZEOF_MMWORD                 ; inptr2
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st16:
+        lea     ecx, [ecx+ecx*2]        ; ecx *= RGB_PIXELSIZE (output bytes left)
+        cmp     ecx, byte 2*SIZEOF_MMWORD
+        jb      short .column_st8
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
+        movq    mmA,mmC                 ; remaining bytes now come from mmC
+        sub     ecx, byte 2*SIZEOF_MMWORD
+        add     edi, byte 2*SIZEOF_MMWORD
+        jmp     short .column_st4
+.column_st8:
+        cmp     ecx, byte SIZEOF_MMWORD
+        jb      short .column_st4
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    mmA,mmE                 ; remaining bytes now come from mmE
+        sub     ecx, byte SIZEOF_MMWORD
+        add     edi, byte SIZEOF_MMWORD
+.column_st4:
+        movd    eax,mmA                 ; eax (GOT address) is reloaded on the next row
+        cmp     ecx, byte SIZEOF_DWORD
+        jb      short .column_st2
+        mov     DWORD [edi+0*SIZEOF_DWORD], eax
+        psrlq   mmA,DWORD_BIT
+        movd    eax,mmA
+        sub     ecx, byte SIZEOF_DWORD
+        add     edi, byte SIZEOF_DWORD
+.column_st2:
+        cmp     ecx, byte SIZEOF_WORD
+        jb      short .column_st1
+        mov     WORD [edi+0*SIZEOF_WORD], ax
+        shr     eax,WORD_BIT
+        sub     ecx, byte SIZEOF_WORD
+        add     edi, byte SIZEOF_WORD
+.column_st1:
+        cmp     ecx, byte SIZEOF_BYTE
+        jb      short .nextrow
+        mov     BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
+        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
+        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
+        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
+        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
+        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
+
+        movq      mmC,mmA
+        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
+        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
+        movq      mmG,mmB
+        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
+        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
+
+        movq      mmD,mmA
+        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
+        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
+        movq      mmH,mmC
+        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
+        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
+
+        cmp     ecx, byte SIZEOF_MMWORD         ; a full group of columns left?
+        jb      short .column_st16              ; no: store only the remaining pixels
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
+        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
+        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+        sub     ecx, byte SIZEOF_MMWORD
+        jz      short .nextrow                  ; row complete
+
+        add     esi, byte SIZEOF_MMWORD                 ; inptr0
+        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
+        add     edx, byte SIZEOF_MMWORD                 ; inptr2
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st16:
+        cmp     ecx, byte SIZEOF_MMWORD/2       ; ecx still counts pixels in this branch
+        jb      short .column_st8
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
+        movq    mmA,mmC                 ; remaining pixels now come from mmC/mmH
+        movq    mmD,mmH
+        sub     ecx, byte SIZEOF_MMWORD/2
+        add     edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+        cmp     ecx, byte SIZEOF_MMWORD/4
+        jb      short .column_st4
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    mmA,mmD
+        sub     ecx, byte SIZEOF_MMWORD/4
+        add     edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+        cmp     ecx, byte SIZEOF_MMWORD/8
+        jb      short .nextrow
+        movd    DWORD [edi+0*SIZEOF_DWORD], mmA ; last single pixel
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        alignx  16,7
+
+.nextrow:
+        pop     ecx                     ; restore num_cols and the row cursors
+        pop     esi
+        pop     ebx
+        pop     edx
+        pop     edi
+        pop     eax                     ; rows left
+
+        add     esi, byte SIZEOF_JSAMPROW
+        add     ebx, byte SIZEOF_JSAMPROW
+        add     edx, byte SIZEOF_JSAMPROW
+        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
+        dec     eax                             ; num_rows
+        jg      near .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdcolext-sse2-64.asm b/simd/jdcolext-sse2-64.asm
new file mode 100644
index 0000000..d356e65
--- /dev/null
+++ b/simd/jdcolext-sse2-64.asm
@@ -0,0 +1,441 @@
+;
+; jdcolext-sse2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+; YCbCr -> RGB colorspace conversion (64-bit SSE2), 16 columns per pass.
+; Convert some rows of samples to the output colorspace.
+; The input rows are read with movdqa, so they must be 16-byte aligned.
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+; r10 = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14 = int num_rows
+
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2               ; wk(0)=(B-Y)E, wk(1)=(B-Y)O
+; YCbCr -> RGB conversion (64-bit SSE2); each .columnloop pass converts 16 columns.
+        align   16
+        global  EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = unaligned frame base
+        sub     rsp, byte 4                     ; step below it before aligning
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; save unaligned frame base at [aligned rbp]
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; reserve the wk[] work area
+        collect_args                    ; presumably loads the arguments into r10-r14
+        push    rbx
+
+        mov     ecx, r10d        ; num_cols
+        test    rcx,rcx
+        jz      near .return            ; out_width == 0: nothing to do
+
+        push    rcx                     ; keep num_cols; rcx is reused for input_row
+
+        mov     rdi, r11
+        mov     ecx, r12d
+        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]       ; input_buf[0]
+        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]       ; input_buf[1]
+        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]       ; input_buf[2]
+        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; rsi = &input_buf[0][input_row]
+        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &input_buf[1][input_row]
+        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &input_buf[2][input_row]
+
+        pop     rcx                     ; rcx = num_cols again
+
+        mov     rdi, r13
+        mov     eax, r14d               ; zero-extends into rax
+        test    eax,eax                 ; 32-bit test: num_rows is a signed int
+        jle     near .return            ; num_rows <= 0: nothing to do
+.rowloop:
+        push    rax                     ; rows left
+        push    rdi                     ; output row-pointer cursor
+        push    rdx                     ; input_buf[2] row-pointer cursor
+        push    rbx                     ; input_buf[1] row-pointer cursor
+        push    rsi                     ; input_buf[0] row-pointer cursor
+        push    rcx                     ; col
+
+        mov     rsi, JSAMPROW [rsi]     ; inptr0
+        mov     rbx, JSAMPROW [rbx]     ; inptr1
+        mov     rdx, JSAMPROW [rdx]     ; inptr2
+        mov     rdi, JSAMPROW [rdi]     ; outptr
+.columnloop:
+
+        movdqa  xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
+        movdqa  xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
+
+        pcmpeqw xmm4,xmm4
+        pcmpeqw xmm7,xmm7
+        psrlw   xmm4,BYTE_BIT
+        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+        movdqa  xmm0,xmm4               ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+        pand    xmm4,xmm5               ; xmm4=Cb(02468ACE)=CbE
+        psrlw   xmm5,BYTE_BIT           ; xmm5=Cb(13579BDF)=CbO
+        pand    xmm0,xmm1               ; xmm0=Cr(02468ACE)=CrE
+        psrlw   xmm1,BYTE_BIT           ; xmm1=Cr(13579BDF)=CrO
+
+        paddw   xmm4,xmm7               ; add 0xFF80 (-128) to every chroma word
+        paddw   xmm5,xmm7
+        paddw   xmm0,xmm7
+        paddw   xmm1,xmm7
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movdqa  xmm2,xmm4               ; xmm2=CbE
+        movdqa  xmm3,xmm5               ; xmm3=CbO
+        paddw   xmm4,xmm4               ; xmm4=2*CbE
+        paddw   xmm5,xmm5               ; xmm5=2*CbO
+        movdqa  xmm6,xmm0               ; xmm6=CrE
+        movdqa  xmm7,xmm1               ; xmm7=CrO
+        paddw   xmm0,xmm0               ; xmm0=2*CrE
+        paddw   xmm1,xmm1               ; xmm1=2*CrO
+
+        pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbE * -FIX(0.22800))
+        pmulhw  xmm5,[rel PW_MF0228]    ; xmm5=(2*CbO * -FIX(0.22800))
+        pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrE * FIX(0.40200))
+        pmulhw  xmm1,[rel PW_F0402]     ; xmm1=(2*CrO * FIX(0.40200))
+
+        paddw   xmm4,[rel PW_ONE]       ; presumably +1 to round the >>1 below
+        paddw   xmm5,[rel PW_ONE]
+        psraw   xmm4,1                  ; xmm4=(CbE * -FIX(0.22800))
+        psraw   xmm5,1                  ; xmm5=(CbO * -FIX(0.22800))
+        paddw   xmm0,[rel PW_ONE]
+        paddw   xmm1,[rel PW_ONE]
+        psraw   xmm0,1                  ; xmm0=(CrE * FIX(0.40200))
+        psraw   xmm1,1                  ; xmm1=(CrO * FIX(0.40200))
+
+        paddw   xmm4,xmm2
+        paddw   xmm5,xmm3
+        paddw   xmm4,xmm2               ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+        paddw   xmm5,xmm3               ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+        paddw   xmm0,xmm6               ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+        paddw   xmm1,xmm7               ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+        movdqa    xmm4,xmm2
+        movdqa    xmm5,xmm3
+        punpcklwd xmm2,xmm6             ; interleave Cb/Cr words for pmaddwd
+        punpckhwd xmm4,xmm6
+        pmaddwd   xmm2,[rel PW_MF0344_F0285]
+        pmaddwd   xmm4,[rel PW_MF0344_F0285]
+        punpcklwd xmm3,xmm7
+        punpckhwd xmm5,xmm7
+        pmaddwd   xmm3,[rel PW_MF0344_F0285]
+        pmaddwd   xmm5,[rel PW_MF0344_F0285]
+
+        paddd     xmm2,[rel PD_ONEHALF]
+        paddd     xmm4,[rel PD_ONEHALF]
+        psrad     xmm2,SCALEBITS
+        psrad     xmm4,SCALEBITS
+        paddd     xmm3,[rel PD_ONEHALF]
+        paddd     xmm5,[rel PD_ONEHALF]
+        psrad     xmm3,SCALEBITS
+        psrad     xmm5,SCALEBITS
+
+        packssdw  xmm2,xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+        packssdw  xmm3,xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+        psubw     xmm2,xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+        psubw     xmm3,xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+        movdqa    xmm5, XMMWORD [rsi]   ; xmm5=Y(0123456789ABCDEF)
+
+        pcmpeqw   xmm4,xmm4
+        psrlw     xmm4,BYTE_BIT         ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+        pand      xmm4,xmm5             ; xmm4=Y(02468ACE)=YE
+        psrlw     xmm5,BYTE_BIT         ; xmm5=Y(13579BDF)=YO
+
+        paddw     xmm0,xmm4             ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+        paddw     xmm1,xmm5             ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
+        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
+
+        paddw     xmm2,xmm4             ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+        paddw     xmm3,xmm5             ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
+        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
+
+        paddw     xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+        paddw     xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
+        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+        movdqa    xmmG,xmmA
+        movdqa    xmmH,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+        movdqa    xmmC,xmmD
+        movdqa    xmmB,xmmD
+        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+        movdqa    xmmF,xmmE
+        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+        movdqa    xmmB,xmmE
+        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+        movdqa    xmmB,xmmF
+        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        cmp     rcx, byte SIZEOF_XMMWORD        ; a full group of columns left?
+        jb      short .column_st32              ; no: store only the remaining bytes
+
+        test    rdi, SIZEOF_XMMWORD-1           ; is outptr 16-byte aligned?
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     rcx, byte SIZEOF_XMMWORD
+        jz      near .nextrow                   ; row complete
+
+        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+
+.column_st32:
+        lea     rcx, [rcx+rcx*2]                ; rcx *= RGB_PIXELSIZE (output bytes left)
+        cmp     rcx, byte 2*SIZEOF_XMMWORD
+        jb      short .column_st16
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmF                       ; remaining bytes now come from xmmF
+        sub     rcx, byte 2*SIZEOF_XMMWORD
+        jmp     short .column_st15
+.column_st16:
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jb      short .column_st15
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        add     rdi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD                       ; remaining bytes now come from xmmD
+        sub     rcx, byte SIZEOF_XMMWORD
+.column_st15:
+        ; Store the lower 8 bytes of xmmA to the output when at least 8
+        ; bytes remain.
+        cmp     rcx, byte SIZEOF_MMWORD
+        jb      short .column_st7
+        movq    XMM_MMWORD [rdi], xmmA
+        add     rdi, byte SIZEOF_MMWORD
+        sub     rcx, byte SIZEOF_MMWORD
+        psrldq  xmmA, SIZEOF_MMWORD
+.column_st7:
+        ; Store the lower 4 bytes of xmmA to the output when at least 4
+        ; bytes remain.
+        cmp     rcx, byte SIZEOF_DWORD
+        jb      short .column_st3
+        movd    XMM_DWORD [rdi], xmmA
+        add     rdi, byte SIZEOF_DWORD
+        sub     rcx, byte SIZEOF_DWORD
+        psrldq  xmmA, SIZEOF_DWORD
+.column_st3:
+        ; Store the lower 2 bytes of rax to the output when at least 2
+        ; bytes remain.
+        movd    eax, xmmA
+        cmp     rcx, byte SIZEOF_WORD
+        jb      short .column_st1
+        mov     WORD [rdi], ax
+        add     rdi, byte SIZEOF_WORD
+        sub     rcx, byte SIZEOF_WORD
+        shr     rax, 16
+.column_st1:
+        ; Store the lower 1 byte of rax to the output when one byte
+        ; remains.
+        test    rcx, rcx
+        jz      short .nextrow
+        mov     BYTE [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%else
+        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%endif
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+        movdqa    xmmC,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+        movdqa    xmmG,xmmB
+        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        movdqa    xmmH,xmmC
+        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        cmp     rcx, byte SIZEOF_XMMWORD        ; a full group of columns left?
+        jb      short .column_st32              ; no: store only the remaining pixels
+
+        test    rdi, SIZEOF_XMMWORD-1           ; is outptr 16-byte aligned?
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+        movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     rcx, byte SIZEOF_XMMWORD
+        jz      near .nextrow                   ; row complete
+
+        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+
+.column_st32:
+        cmp     rcx, byte SIZEOF_XMMWORD/2      ; rcx still counts pixels in this branch
+        jb      short .column_st16
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmC                       ; remaining pixels now come from xmmC/xmmH
+        movdqa  xmmD,xmmH
+        sub     rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+        cmp     rcx, byte SIZEOF_XMMWORD/4
+        jb      short .column_st15
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        add     rdi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD
+        sub     rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+        ; Store two pixels (8 bytes) of xmmA to the output when at least two
+        ; pixels remain.
+        cmp     rcx, byte SIZEOF_XMMWORD/8
+        jb      short .column_st7
+        movq    MMWORD [rdi], xmmA
+        add     rdi, byte SIZEOF_XMMWORD/8*4
+        sub     rcx, byte SIZEOF_XMMWORD/8
+        psrldq  xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+        ; Store one pixel (4 bytes) of xmmA to the output when one pixel
+        ; remains.
+        test    rcx, rcx
+        jz      short .nextrow
+        movd    XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+        pop     rcx                     ; restore num_cols and the row cursors
+        pop     rsi
+        pop     rbx
+        pop     rdx
+        pop     rdi
+        pop     rax                     ; rows left
+
+        add     rsi, byte SIZEOF_JSAMPROW
+        add     rbx, byte SIZEOF_JSAMPROW
+        add     rdx, byte SIZEOF_JSAMPROW
+        add     rdi, byte SIZEOF_JSAMPROW       ; output_buf
+        dec     rax                             ; num_rows
+        jg      near .rowloop
+
+        sfence          ; flush the write buffer
+
+.return:
+        pop     rbx
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- original rbp
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdcolext-sse2.asm b/simd/jdcolext-sse2.asm
new file mode 100644
index 0000000..54ae4db
--- /dev/null
+++ b/simd/jdcolext-sse2.asm
@@ -0,0 +1,460 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b)    (b)+8           ; JDIMENSION out_width
+%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
+%define input_row(b)    (b)+16          ; JDIMENSION input_row
+%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
+%define num_rows(b)     (b)+24          ; int num_rows
+
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+        align   16
+        global  EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+        push    ebp                             ; save caller's ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4                     ; leave room to store eax after aligning
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; stored at what becomes [original_ebp]
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below aligned ebp
+        pushpic eax             ; make room for the GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
+        test    ecx,ecx
+        jz      near .return                    ; out_width == 0: nothing to do
+
+        push    ecx                             ; keep num_cols; ecx is reused below
+
+        mov     edi, JSAMPIMAGE [input_buf(eax)]
+        mov     ecx, JDIMENSION [input_row(eax)]
+        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]       ; input_buf[0] (Y)
+        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]       ; input_buf[1] (Cb)
+        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]       ; input_buf[2] (Cr)
+        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; &input_buf[0][input_row]
+        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; &input_buf[1][input_row]
+        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]  ; &input_buf[2][input_row]
+
+        pop     ecx                             ; ecx = num_cols again
+
+        mov     edi, JSAMPARRAY [output_buf(eax)]
+        mov     eax, INT [num_rows(eax)]
+        test    eax,eax
+        jle     near .return                    ; num_rows <= 0: nothing to do
+        alignx  16,7
+.rowloop:
+        push    eax                     ; remaining num_rows
+        push    edi                     ; output_buf position
+        push    edx                     ; Cr row-pointer position
+        push    ebx                     ; Cb row-pointer position
+        push    esi                     ; Y row-pointer position
+        push    ecx                     ; col
+
+        mov     esi, JSAMPROW [esi]     ; inptr0
+        mov     ebx, JSAMPROW [ebx]     ; inptr1
+        mov     edx, JSAMPROW [edx]     ; inptr2
+        mov     edi, JSAMPROW [edi]     ; outptr
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+        alignx  16,7
+.columnloop:
+
+        movdqa  xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
+        movdqa  xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)
+
+        pcmpeqw xmm4,xmm4
+        pcmpeqw xmm7,xmm7
+        psrlw   xmm4,BYTE_BIT
+        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+        movdqa  xmm0,xmm4               ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+        pand    xmm4,xmm5               ; xmm4=Cb(02468ACE)=CbE
+        psrlw   xmm5,BYTE_BIT           ; xmm5=Cb(13579BDF)=CbO
+        pand    xmm0,xmm1               ; xmm0=Cr(02468ACE)=CrE
+        psrlw   xmm1,BYTE_BIT           ; xmm1=Cr(13579BDF)=CrO
+
+        paddw   xmm4,xmm7               ; adding 0xFF80 subtracts 128 from each word
+        paddw   xmm5,xmm7
+        paddw   xmm0,xmm7
+        paddw   xmm1,xmm7
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movdqa  xmm2,xmm4               ; xmm2=CbE
+        movdqa  xmm3,xmm5               ; xmm3=CbO
+        paddw   xmm4,xmm4               ; xmm4=2*CbE
+        paddw   xmm5,xmm5               ; xmm5=2*CbO
+        movdqa  xmm6,xmm0               ; xmm6=CrE
+        movdqa  xmm7,xmm1               ; xmm7=CrO
+        paddw   xmm0,xmm0               ; xmm0=2*CrE
+        paddw   xmm1,xmm1               ; xmm1=2*CrO
+
+        pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbE * -FIX(0.22800))
+        pmulhw  xmm5,[GOTOFF(eax,PW_MF0228)]    ; xmm5=(2*CbO * -FIX(0.22800))
+        pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrE * FIX(0.40200))
+        pmulhw  xmm1,[GOTOFF(eax,PW_F0402)]     ; xmm1=(2*CrO * FIX(0.40200))
+
+        paddw   xmm4,[GOTOFF(eax,PW_ONE)]
+        paddw   xmm5,[GOTOFF(eax,PW_ONE)]
+        psraw   xmm4,1                  ; xmm4=(CbE * -FIX(0.22800))
+        psraw   xmm5,1                  ; xmm5=(CbO * -FIX(0.22800))
+        paddw   xmm0,[GOTOFF(eax,PW_ONE)]
+        paddw   xmm1,[GOTOFF(eax,PW_ONE)]
+        psraw   xmm0,1                  ; xmm0=(CrE * FIX(0.40200))
+        psraw   xmm1,1                  ; xmm1=(CrO * FIX(0.40200))
+
+        paddw   xmm4,xmm2
+        paddw   xmm5,xmm3
+        paddw   xmm4,xmm2               ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+        paddw   xmm5,xmm3               ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+        paddw   xmm0,xmm6               ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+        paddw   xmm1,xmm7               ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+        movdqa    xmm4,xmm2
+        movdqa    xmm5,xmm3
+        punpcklwd xmm2,xmm6
+        punpckhwd xmm4,xmm6
+        pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+        punpcklwd xmm3,xmm7
+        punpckhwd xmm5,xmm7
+        pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+        paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     xmm2,SCALEBITS
+        psrad     xmm4,SCALEBITS
+        paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     xmm3,SCALEBITS
+        psrad     xmm5,SCALEBITS
+
+        packssdw  xmm2,xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+        packssdw  xmm3,xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+        psubw     xmm2,xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+        psubw     xmm3,xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+        movdqa    xmm5, XMMWORD [esi]   ; xmm5=Y(0123456789ABCDEF)
+
+        pcmpeqw   xmm4,xmm4
+        psrlw     xmm4,BYTE_BIT         ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+        pand      xmm4,xmm5             ; xmm4=Y(02468ACE)=YE
+        psrlw     xmm5,BYTE_BIT         ; xmm5=Y(13579BDF)=YO
+
+        paddw     xmm0,xmm4             ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+        paddw     xmm1,xmm5             ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
+        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
+
+        paddw     xmm2,xmm4             ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+        paddw     xmm3,xmm5             ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
+        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
+
+        paddw     xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+        paddw     xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
+        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+        movdqa    xmmG,xmmA
+        movdqa    xmmH,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+        movdqa    xmmC,xmmD
+        movdqa    xmmB,xmmD
+        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+        movdqa    xmmF,xmmE
+        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+        movdqa    xmmB,xmmE
+        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+        movdqa    xmmB,xmmF
+        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jb      short .column_st32              ; < SIZEOF_XMMWORD columns left: partial store
+
+        test    edi, SIZEOF_XMMWORD-1
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     ecx, byte SIZEOF_XMMWORD
+        jz      near .nextrow                   ; no columns left in this row
+
+        add     esi, byte SIZEOF_XMMWORD        ; inptr0
+        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
+        add     edx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st32:
+        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
+        cmp     ecx, byte 2*SIZEOF_XMMWORD
+        jb      short .column_st16
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmF                       ; xmmA = third (not yet stored) vector
+        sub     ecx, byte 2*SIZEOF_XMMWORD
+        jmp     short .column_st15
+.column_st16:
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jb      short .column_st15
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        add     edi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD                       ; xmmA = second (not yet stored) vector
+        sub     ecx, byte SIZEOF_XMMWORD
+.column_st15:
+        ; Store the lower 8 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp     ecx, byte SIZEOF_MMWORD
+        jb      short .column_st7
+        movq    XMM_MMWORD [edi], xmmA
+        add     edi, byte SIZEOF_MMWORD
+        sub     ecx, byte SIZEOF_MMWORD
+        psrldq  xmmA, SIZEOF_MMWORD
+.column_st7:
+        ; Store the lower 4 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp     ecx, byte SIZEOF_DWORD
+        jb      short .column_st3
+        movd    XMM_DWORD [edi], xmmA
+        add     edi, byte SIZEOF_DWORD
+        sub     ecx, byte SIZEOF_DWORD
+        psrldq  xmmA, SIZEOF_DWORD
+.column_st3:
+        ; Store the lower 2 bytes of eax to the output when it has enough
+        ; space.
+        movd    eax, xmmA                       ; overwrites the GOT address held in eax
+        cmp     ecx, byte SIZEOF_WORD
+        jb      short .column_st1
+        mov     WORD [edi], ax
+        add     edi, byte SIZEOF_WORD
+        sub     ecx, byte SIZEOF_WORD
+        shr     eax, 16
+.column_st1:
+        ; Store the lower 1 byte of eax to the output when it has enough
+        ; space.
+        test    ecx, ecx
+        jz      short .nextrow
+        mov     BYTE [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%else
+        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%endif
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+        movdqa    xmmC,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+        movdqa    xmmG,xmmB
+        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        movdqa    xmmH,xmmC
+        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jb      short .column_st32              ; < SIZEOF_XMMWORD columns left: partial store
+
+        test    edi, SIZEOF_XMMWORD-1
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+        movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     ecx, byte SIZEOF_XMMWORD
+        jz      near .nextrow                   ; no columns left in this row
+
+        add     esi, byte SIZEOF_XMMWORD        ; inptr0
+        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
+        add     edx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st32:
+        cmp     ecx, byte SIZEOF_XMMWORD/2      ; ecx counts pixels here, not bytes
+        jb      short .column_st16
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmC
+        movdqa  xmmD,xmmH
+        sub     ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+        cmp     ecx, byte SIZEOF_XMMWORD/4
+        jb      short .column_st15
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        add     edi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD
+        sub     ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+        ; space.
+        cmp     ecx, byte SIZEOF_XMMWORD/8
+        jb      short .column_st7
+        movq    XMM_MMWORD [edi], xmmA
+        add     edi, byte SIZEOF_XMMWORD/8*4
+        sub     ecx, byte SIZEOF_XMMWORD/8
+        psrldq  xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+        ; space.
+        test    ecx, ecx
+        jz      short .nextrow
+        movd    XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+        alignx  16,7
+
+.nextrow:
+        pop     ecx
+        pop     esi
+        pop     ebx
+        pop     edx
+        pop     edi
+        pop     eax
+
+        add     esi, byte SIZEOF_JSAMPROW
+        add     ebx, byte SIZEOF_JSAMPROW
+        add     edx, byte SIZEOF_JSAMPROW
+        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
+        dec     eax                             ; num_rows
+        jg      near .rowloop
+
+        sfence          ; flush the write buffer
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdcolor-altivec.c b/simd/jdcolor-altivec.c
new file mode 100644
index 0000000..e0892d8
--- /dev/null
+++ b/simd/jdcolor-altivec.c
@@ -0,0 +1,96 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554              /* FIX(0.34414) */
+#define F_0_714 46802              /* FIX(0.71414) */
+#define F_1_402 91881              /* FIX(1.40200) */
+#define F_1_772 116130             /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16               /* FIX(x) = x * 2^SCALEBITS, so FIX(1) = 65536 */
+#define ONE_HALF (1 << (SCALEBITS - 1))  /* 0.5 at SCALEBITS scale */
+
+#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
+#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
+#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#include "jdcolext-altivec.c"      /* RGB_PIXELSIZE is not set in this file before this point; presumably a header default -- TODO confirm */
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"      /* RGB_INDEX0..2 from above are still defined here */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
+#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
+#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/simd/jdcolmmx.asm b/simd/jdcolor-mmx.asm
similarity index 70%
rename from simd/jdcolmmx.asm
rename to simd/jdcolor-mmx.asm
index 21ca32a..6730e48 100644
--- a/simd/jdcolmmx.asm
+++ b/simd/jdcolor-mmx.asm
@@ -1,5 +1,5 @@
 ;
-; jdcolmmx.asm - colorspace conversion (MMX)
+; jdcolor.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2009 D. R. Commander
@@ -21,37 +21,37 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+F_0_344 equ      22554                  ; FIX(0.34414)
+F_0_714 equ      46802                  ; FIX(0.71414)
+F_1_402 equ      91881                  ; FIX(1.40200)
+F_1_772 equ     116130                  ; FIX(1.77200)
+F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE
+        alignz  16
+        global  EXTN(jconst_ycc_rgb_convert_mmx)
 
 EXTN(jconst_ycc_rgb_convert_mmx):
 
-PW_F0402	times 4 dw  F_0_402
-PW_MF0228	times 4 dw -F_0_228
-PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
-PW_ONE		times 4 dw  1
-PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS-1)
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jdclrmmx.asm"
+%include "jdcolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -62,7 +62,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
-%include "jdclrmmx.asm"
+%include "jdcolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -73,7 +73,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
-%include "jdclrmmx.asm"
+%include "jdcolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -84,7 +84,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
-%include "jdclrmmx.asm"
+%include "jdcolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -95,7 +95,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
-%include "jdclrmmx.asm"
+%include "jdcolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -106,7 +106,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
-%include "jdclrmmx.asm"
+%include "jdcolext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -117,4 +117,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
-%include "jdclrmmx.asm"
+%include "jdcolext-mmx.asm"
diff --git a/simd/jdcolss2-64.asm b/simd/jdcolor-sse2-64.asm
similarity index 70%
rename from simd/jdcolss2-64.asm
rename to simd/jdcolor-sse2-64.asm
index 443734f..e9277f1 100644
--- a/simd/jdcolss2-64.asm
+++ b/simd/jdcolor-sse2-64.asm
@@ -1,5 +1,5 @@
 ;
-; jdcolss2-64.asm - colorspace conversion (64-bit SSE2)
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2009 D. R. Commander
@@ -21,37 +21,37 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+F_0_344 equ      22554                  ; FIX(0.34414)
+F_0_714 equ      46802                  ; FIX(0.71414)
+F_1_402 equ      91881                  ; FIX(1.40200)
+F_1_772 equ     116130                  ; FIX(1.77200)
+F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_ycc_rgb_convert_sse2)
 
 EXTN(jconst_ycc_rgb_convert_sse2):
 
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
+        SECTION SEG_TEXT
+        BITS    64
 
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -62,7 +62,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -73,7 +73,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -84,7 +84,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -95,7 +95,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -106,7 +106,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -117,4 +117,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2-64.asm"
diff --git a/simd/jdcolss2-64.asm b/simd/jdcolor-sse2.asm
similarity index 70%
copy from simd/jdcolss2-64.asm
copy to simd/jdcolor-sse2.asm
index 443734f..c122cc7 100644
--- a/simd/jdcolss2-64.asm
+++ b/simd/jdcolor-sse2.asm
@@ -1,5 +1,5 @@
 ;
-; jdcolss2-64.asm - colorspace conversion (64-bit SSE2)
+; jdcolor.asm - colorspace conversion (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2009 D. R. Commander
@@ -21,37 +21,37 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+F_0_344 equ      22554                  ; FIX(0.34414)
+F_0_714 equ      46802                  ; FIX(0.71414)
+F_1_402 equ      91881                  ; FIX(1.40200)
+F_1_772 equ     116130                  ; FIX(1.77200)
+F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_ycc_rgb_convert_sse2)
 
 EXTN(jconst_ycc_rgb_convert_sse2):
 
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -62,7 +62,7 @@
 %define RGB_BLUE EXT_RGB_BLUE
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -73,7 +73,7 @@
 %define RGB_BLUE EXT_RGBX_BLUE
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -84,7 +84,7 @@
 %define RGB_BLUE EXT_BGR_BLUE
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -95,7 +95,7 @@
 %define RGB_BLUE EXT_BGRX_BLUE
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -106,7 +106,7 @@
 %define RGB_BLUE EXT_XBGR_BLUE
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -117,4 +117,4 @@
 %define RGB_BLUE EXT_XRGB_BLUE
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdclrss2-64.asm"
+%include "jdcolext-sse2.asm"
diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm
deleted file mode 100644
index f968cf8..0000000
--- a/simd/jdcolss2.asm
+++ /dev/null
@@ -1,120 +0,0 @@
-;
-; jdcolss2.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdclrss2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdclrss2.asm"
diff --git a/simd/jdct.inc b/simd/jdct.inc
index cc62704..ad5890c 100644
--- a/simd/jdct.inc
+++ b/simd/jdct.inc
@@ -18,11 +18,11 @@
 ;
 %define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
 
-%define ROW(n,b,s)		((b)+(n)*(s))
-%define COL(n,b,s)		((b)+(n)*(s)*DCTSIZE)
+%define ROW(n,b,s)              ((b)+(n)*(s))
+%define COL(n,b,s)              ((b)+(n)*(s)*DCTSIZE)
 
-%define DWBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
-%define MMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
-%define XMMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+%define DWBLOCK(m,n,b,s)        ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
+%define MMBLOCK(m,n,b,s)        ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
+%define XMMBLOCK(m,n,b,s)       ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
 
 ; --------------------------------------------------------------------------
diff --git a/simd/jdmerge-altivec.c b/simd/jdmerge-altivec.c
new file mode 100644
index 0000000..cc8d3d9
--- /dev/null
+++ b/simd/jdmerge-altivec.c
@@ -0,0 +1,108 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554              /* FIX(0.34414) */
+#define F_0_714 46802              /* FIX(0.71414) */
+#define F_1_402 91881              /* FIX(1.40200) */
+#define F_1_772 116130             /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16  /* fraction bits of the FIX() values: FIX(1) == 65536 */
+#define ONE_HALF (1 << (SCALEBITS - 1))  /* 0.5 at the SCALEBITS scale */
+
+#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
+#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
+#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#include "jdmrgext-altivec.c"  /* 1st pass: function names as written in the template */
+#undef RGB_PIXELSIZE  /* presumably first set via jsimd_altivec.h -- TODO confirm */
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extrgb pass; reuses RGB_INDEX0-2 defined above */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extrgbx pass; single RGB_INDEX table */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
+#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
+#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extbgr pass; own RGB_INDEX0-2 tables */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extbgrx pass; single RGB_INDEX table */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extxbgr pass; single RGB_INDEX table */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"  /* extxrgb pass; single RGB_INDEX table */
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/simd/jdmermmx.asm b/simd/jdmerge-mmx.asm
similarity index 74%
rename from simd/jdmermmx.asm
rename to simd/jdmerge-mmx.asm
index 76f2f5b..2daa7fa 100644
--- a/simd/jdmermmx.asm
+++ b/simd/jdmerge-mmx.asm
@@ -1,5 +1,5 @@
 ;
-; jdmermmx.asm - merged upsampling/color conversion (MMX)
+; jdmerge.asm - merged upsampling/color conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2009 D. R. Commander
@@ -21,37 +21,37 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+F_0_344 equ      22554                  ; FIX(0.34414)
+F_0_714 equ      46802                  ; FIX(0.71414)
+F_1_402 equ      91881                  ; FIX(1.40200)
+F_1_772 equ     116130                  ; FIX(1.77200)
+F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_merged_upsample_mmx) PRIVATE
+        alignz  16
+        global  EXTN(jconst_merged_upsample_mmx)
 
 EXTN(jconst_merged_upsample_mmx):
 
-PW_F0402	times 4 dw  F_0_402
-PW_MF0228	times 4 dw -F_0_228
-PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
-PW_ONE		times 4 dw  1
-PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS-1)
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jdmrgmmx.asm"
+%include "jdmrgext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -63,7 +63,7 @@
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
 %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
-%include "jdmrgmmx.asm"
+%include "jdmrgext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -75,7 +75,7 @@
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
 %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
-%include "jdmrgmmx.asm"
+%include "jdmrgext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -87,7 +87,7 @@
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
 %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
-%include "jdmrgmmx.asm"
+%include "jdmrgext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -99,7 +99,7 @@
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
 %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
-%include "jdmrgmmx.asm"
+%include "jdmrgext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -111,7 +111,7 @@
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
 %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
-%include "jdmrgmmx.asm"
+%include "jdmrgext-mmx.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -123,4 +123,4 @@
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
 %define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
-%include "jdmrgmmx.asm"
+%include "jdmrgext-mmx.asm"
diff --git a/simd/jdmerss2.asm b/simd/jdmerge-sse2-64.asm
similarity index 73%
copy from simd/jdmerss2.asm
copy to simd/jdmerge-sse2-64.asm
index 4fa6f7f..8f953c7 100644
--- a/simd/jdmerss2.asm
+++ b/simd/jdmerge-sse2-64.asm
@@ -1,5 +1,5 @@
 ;
-; jdmerss2.asm - merged upsampling/color conversion (SSE2)
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2009 D. R. Commander
@@ -21,37 +21,37 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+F_0_344 equ      22554                  ; FIX(0.34414)
+F_0_714 equ      46802                  ; FIX(0.71414)
+F_1_402 equ      91881                  ; FIX(1.40200)
+F_1_772 equ     116130                  ; FIX(1.77200)
+F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_merged_upsample_sse2)
 
 EXTN(jconst_merged_upsample_sse2):
 
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    64
 
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -63,7 +63,7 @@
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -75,7 +75,7 @@
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -87,7 +87,7 @@
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -99,7 +99,7 @@
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -111,7 +111,7 @@
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2-64.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -123,4 +123,4 @@
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2-64.asm"
diff --git a/simd/jdmerss2.asm b/simd/jdmerge-sse2.asm
similarity index 74%
rename from simd/jdmerss2.asm
rename to simd/jdmerge-sse2.asm
index 4fa6f7f..d22e828 100644
--- a/simd/jdmerss2.asm
+++ b/simd/jdmerge-sse2.asm
@@ -1,5 +1,5 @@
 ;
-; jdmerss2.asm - merged upsampling/color conversion (SSE2)
+; jdmerge.asm - merged upsampling/color conversion (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright 2009 D. R. Commander
@@ -21,37 +21,37 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS	16
+%define SCALEBITS       16
 
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+F_0_344 equ      22554                  ; FIX(0.34414)
+F_0_714 equ      46802                  ; FIX(0.71414)
+F_1_402 equ      91881                  ; FIX(1.40200)
+F_1_772 equ     116130                  ; FIX(1.77200)
+F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 
-	alignz	16
-	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
+        alignz  16
+        global  EXTN(jconst_merged_upsample_sse2)
 
 EXTN(jconst_merged_upsample_sse2):
 
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
 
-	alignz	16
+        alignz  16
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    32
 
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -63,7 +63,7 @@
 %define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -75,7 +75,7 @@
 %define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -87,7 +87,7 @@
 %define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -99,7 +99,7 @@
 %define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -111,7 +111,7 @@
 %define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2.asm"
 
 %undef RGB_RED
 %undef RGB_GREEN
@@ -123,4 +123,4 @@
 %define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 %define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
 %define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgss2.asm"
+%include "jdmrgext-sse2.asm"
diff --git a/simd/jdmerss2-64.asm b/simd/jdmerss2-64.asm
deleted file mode 100644
index 02dd6da..0000000
--- a/simd/jdmerss2-64.asm
+++ /dev/null
@@ -1,126 +0,0 @@
-;
-; jdmerss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS	16
-
-F_0_344	equ	 22554			; FIX(0.34414)
-F_0_714	equ	 46802			; FIX(0.71414)
-F_1_402	equ	 91881			; FIX(1.40200)
-F_1_772	equ	116130			; FIX(1.77200)
-F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
-F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
-F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402	times 8 dw  F_0_402
-PW_MF0228	times 8 dw -F_0_228
-PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
-PW_ONE		times 8 dw  1
-PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgss2-64.asm"
diff --git a/simd/jdmrgext-altivec.c b/simd/jdmrgext-altivec.c
new file mode 100644
index 0000000..3b6950d
--- /dev/null
+++ b/simd/jdmrgext-altivec.c
@@ -0,0 +1,323 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-altivec.c */
+
+/* Merged h2v1 upsampling + YCbCr-to-RGB conversion of one row (AltiVec) */
+void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
+                                         JSAMPIMAGE input_buf,
+                                         JDIMENSION in_row_group_ctr,
+                                         JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
+#if __BIG_ENDIAN__
+  int offset;
+#endif
+  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
+  /* pitch = output bytes per row; tmpbuf stages one chunk for partial stores */
+  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
+    y, cb, cr;
+#if __BIG_ENDIAN__
+  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char out4;
+#endif
+#endif
+#if RGB_PIXELSIZE == 4
+  __vector unsigned char rgb3;
+#endif
+  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
+    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
+  __vector int g_y0, g_y1, g_y2, g_y3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
+    pw_mf0228 = { __8X(-F_0_228 >> 1) },
+    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
+    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
+    pw_cj = { __8X(CENTERJSAMPLE) };
+  __vector int pd_onehalf = { __4X(ONE_HALF) };
+  __vector unsigned char pb_zero = { __16X(0) },
+#if __BIG_ENDIAN__
+    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
+    even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
+    odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+#else
+    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
+    even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
+    odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+#endif
+  /* Y, Cb and Cr source rows for this row group, and the output row. */
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+  /* NOTE: num_cols counts remaining output *bytes* (it starts at pitch). */
+  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {
+    /* Each pass consumes 16 Cb/Cr samples and up to 2 x 16 Y samples. */
+    cb = vec_ld(0, inptr1);
+    /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
+     * support unsigned vectors.
+     */
+    cbl = (__vector signed short)VEC_UNPACKHU(cb);
+    cbh = (__vector signed short)VEC_UNPACKLU(cb);
+    cbl = vec_sub(cbl, pw_cj);  /* Cb - CENTERJSAMPLE */
+    cbh = vec_sub(cbh, pw_cj);
+
+    cr = vec_ld(0, inptr2);
+    crl = (__vector signed short)VEC_UNPACKHU(cr);
+    crh = (__vector signed short)VEC_UNPACKLU(cr);
+    crl = vec_sub(crl, pw_cj);  /* Cr - CENTERJSAMPLE */
+    crh = vec_sub(crh, pw_cj);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+    b_yl = vec_add(cbl, cbl);  /* 2 * Cb, undone by the vec_sra() below */
+    b_yh = vec_add(cbh, cbh);
+    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
+    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
+    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
+    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
+    b_yl = vec_add(b_yl, cbl);
+    b_yh = vec_add(b_yh, cbh);
+    b_yl = vec_add(b_yl, cbl);  /* b_y* = (B - Y) = -0.228 * Cb + Cb + Cb */
+    b_yh = vec_add(b_yh, cbh);
+
+    r_yl = vec_add(crl, crl);  /* 2 * Cr, undone by the vec_sra() below */
+    r_yh = vec_add(crh, crh);
+    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
+    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
+    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
+    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
+    r_yl = vec_add(r_yl, crl);  /* r_y* = (R - Y) = 0.402 * Cr + Cr */
+    r_yh = vec_add(r_yh, crh);
+
+    g_y0w = vec_mergeh(cbl, crl);  /* interleave Cb/Cr for vec_msums() */
+    g_y1w = vec_mergel(cbl, crl);
+    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
+    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
+    g_y2w = vec_mergeh(cbh, crh);
+    g_y3w = vec_mergel(cbh, crh);
+    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
+    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
+    /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
+     * each dword into a new 16-bit vector, which is the equivalent of
+     * descaling the 32-bit results (right-shifting by 16 bits) and then
+     * packing them.
+     */
+    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
+                    shift_pack_index);
+    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
+                    shift_pack_index);
+    g_yl = vec_sub(g_yl, crl);  /* g_y* = (G - Y) */
+    g_yh = vec_sub(g_yh, crh);
+
+    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
+         num_cols -= RGB_PIXELSIZE * 16,
+         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {
+      /* Split 16 Y samples into even/odd lanes, zero-extended to 16 bits. */
+      y = vec_ld(0, inptr0);
+      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
+      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);
+      /* Pass 0 uses the *l chroma terms; pass 1 uses the *h terms. */
+      if (yloop == 0) {
+        be = vec_add(b_yl, ye);
+        bo = vec_add(b_yl, yo);
+        re = vec_add(r_yl, ye);
+        ro = vec_add(r_yl, yo);
+        ge = vec_add(g_yl, ye);
+        go = vec_add(g_yl, yo);
+      } else {
+        be = vec_add(b_yh, ye);
+        bo = vec_add(b_yh, yo);
+        re = vec_add(r_yh, ye);
+        ro = vec_add(r_yh, yo);
+        ge = vec_add(g_yh, ye);
+        go = vec_add(g_yh, yo);
+      }
+      /* Re-interleave even/odd results so pixels are back in column order. */
+      rl = vec_mergeh(re, ro);
+      rh = vec_mergel(re, ro);
+      gl = vec_mergeh(ge, go);
+      gh = vec_mergel(ge, go);
+      bl = vec_mergeh(be, bo);
+      bh = vec_mergel(be, bo);
+      /* Pair R with G and B with the 255 filler, then saturate to bytes. */
+      rg0 = vec_mergeh(rl, gl);
+      bx0 = vec_mergeh(bl, pw_255);
+      rg1 = vec_mergel(rl, gl);
+      bx1 = vec_mergel(bl, pw_255);
+      rg2 = vec_mergeh(rh, gh);
+      bx2 = vec_mergeh(bh, pw_255);
+      rg3 = vec_mergel(rh, gh);
+      bx3 = vec_mergel(bh, pw_255);
+
+      rgbx0 = vec_packsu(rg0, bx0);
+      rgbx1 = vec_packsu(rg1, bx1);
+      rgbx2 = vec_packsu(rg2, bx2);
+      rgbx3 = vec_packsu(rg3, bx3);
+
+#if RGB_PIXELSIZE == 3
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
+       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
+       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
+      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
+      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
+#else
+      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
+       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
+       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
+       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
+       *
+       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
+       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
+       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
+       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
+       */
+      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
+      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
+      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
+      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
+#endif
+
+#if __BIG_ENDIAN__
+      offset = (size_t)outptr & 15;  /* misalignment of outptr, in bytes */
+      if (offset) {
+        __vector unsigned char unaligned_shift_index;
+        int bytes = num_cols + offset;
+
+        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
+          /* Slow path to prevent buffer overwrite.  Since there is no way to
+           * write a partial AltiVec register, overwrite would occur on the
+           * last chunk of the last image row if the right edge is not on a
+           * 16-byte boundary.  It could also occur on other rows if the bytes
+           * per row is low enough.  Since we can't determine whether we're on
+           * the last image row, we have to assume every row is the last.
+           */
+          vec_st(rgb0, 0, tmpbuf);
+          vec_st(rgb1, 16, tmpbuf);
+          vec_st(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          vec_st(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path: shift the pixels and merge in the loaded edge bytes */
+          unaligned_shift_index = vec_lvsl(0, outptr);
+          edgel = vec_ld(0, outptr);
+          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
+          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
+          unaligned_shift_index = vec_lvsr(0, outptr);
+          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
+          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
+          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
+#if RGB_PIXELSIZE == 4
+          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
+          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
+#else
+          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
+#endif
+          vec_st(out0, 0, outptr);
+          if (bytes > 16)
+            vec_st(out1, 16, outptr);
+          if (bytes > 32)
+            vec_st(out2, 32, outptr);
+          if (bytes > 48)
+            vec_st(out3, 48, outptr);
+#if RGB_PIXELSIZE == 4
+          if (bytes > 64)
+            vec_st(out4, 64, outptr);
+#endif
+        }
+      } else {
+#endif /* __BIG_ENDIAN__ */
+        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
+          /* Slow path: stage in tmpbuf, then copy only the valid bytes */
+          VEC_ST(rgb0, 0, tmpbuf);
+          VEC_ST(rgb1, 16, tmpbuf);
+          VEC_ST(rgb2, 32, tmpbuf);
+#if RGB_PIXELSIZE == 4
+          VEC_ST(rgb3, 48, tmpbuf);
+#endif
+          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
+        } else {
+          /* Fast path: store directly, skipping vectors past the row end */
+          VEC_ST(rgb0, 0, outptr);
+          if (num_cols > 16)
+            VEC_ST(rgb1, 16, outptr);
+          if (num_cols > 32)
+            VEC_ST(rgb2, 32, outptr);
+#if RGB_PIXELSIZE == 4
+          if (num_cols > 48)
+            VEC_ST(rgb3, 48, outptr);
+#endif
+        }
+#if __BIG_ENDIAN__
+      }
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
+                                         JSAMPIMAGE input_buf,
+                                         JDIMENSION in_row_group_ctr,
+                                         JSAMPARRAY output_buf)
+{
+  /* Runs the h2v1 routine twice, once per luma row of the row group. */
+  JSAMPARRAY luma = input_buf[0];
+  JSAMPROW saved_luma_row = luma[in_row_group_ctr];
+  JSAMPROW saved_out_row = output_buf[0];
+  int pass;
+
+  for (pass = 0; pass < 2; pass++) {
+    /* Redirect the slots the h2v1 routine reads/writes to this pass's rows. */
+    luma[in_row_group_ctr] = luma[in_row_group_ctr * 2 + pass];
+    output_buf[0] = output_buf[pass];
+    jsimd_h2v1_merged_upsample_altivec(output_width, input_buf,
+                                       in_row_group_ctr, output_buf);
+  }
+
+  luma[in_row_group_ctr] = saved_luma_row;   /* undo the redirection */
+  output_buf[0] = saved_out_row;
+}
diff --git a/simd/jdmrgext-mmx.asm b/simd/jdmrgext-mmx.asm
new file mode 100644
index 0000000..a92e934
--- /dev/null
+++ b/simd/jdmrgext-mmx.asm
@@ -0,0 +1,464 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8                   ; JDIMENSION output_width
+%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
+
+%define original_ebp    ebp+0                   ; slot holding the unaligned frame base
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          3                       ; wk(0..2)=(B-Y)H,(R-Y)H,(G-Y)H
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+        push    ebp
+        mov     eax,esp                         ; eax = frame base (points at saved ebp)
+        sub     esp, byte 4                     ; reserve a slot for the frame base
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; [original_ebp] = frame base
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; allocate wk[WK_NUM] below ebp
+        pushpic eax             ; make room for the GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [output_width(eax)]     ; col
+        test    ecx,ecx
+        jz      near .return            ; nothing to do for a zero width
+
+        push    ecx                     ; save col (ecx is reused just below)
+
+        mov     edi, JSAMPIMAGE [input_buf(eax)]
+        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
+        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+        mov     edi, JSAMPARRAY [output_buf(eax)]
+        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
+        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
+        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
+        mov     edi, JSAMPROW [edi]                             ; outptr
+
+        pop     ecx                     ; col
+
+        alignx  16,7
+.columnloop:
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+        movq      mm6, MMWORD [ebx]     ; mm6=Cb(01234567)
+        movq      mm7, MMWORD [edx]     ; mm7=Cr(01234567)
+
+        pxor      mm1,mm1               ; mm1=(all 0's)
+        pcmpeqw   mm3,mm3
+        psllw     mm3,7                 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+        movq      mm4,mm6
+        punpckhbw mm6,mm1               ; mm6=Cb(4567)=CbH
+        punpcklbw mm4,mm1               ; mm4=Cb(0123)=CbL
+        movq      mm0,mm7
+        punpckhbw mm7,mm1               ; mm7=Cr(4567)=CrH
+        punpcklbw mm0,mm1               ; mm0=Cr(0123)=CrL
+
+        paddw     mm6,mm3               ; mm6=CbH-128 (0xFF80 == -128)
+        paddw     mm4,mm3               ; mm4=CbL-128
+        paddw     mm7,mm3               ; mm7=CrH-128
+        paddw     mm0,mm3               ; mm0=CrL-128
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movq    mm5,mm6                 ; mm5=CbH
+        movq    mm2,mm4                 ; mm2=CbL
+        paddw   mm6,mm6                 ; mm6=2*CbH
+        paddw   mm4,mm4                 ; mm4=2*CbL
+        movq    mm1,mm7                 ; mm1=CrH
+        movq    mm3,mm0                 ; mm3=CrL
+        paddw   mm7,mm7                 ; mm7=2*CrH
+        paddw   mm0,mm0                 ; mm0=2*CrL
+
+        pmulhw  mm6,[GOTOFF(eax,PW_MF0228)]     ; mm6=(2*CbH * -FIX(0.22800))
+        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbL * -FIX(0.22800))
+        pmulhw  mm7,[GOTOFF(eax,PW_F0402)]      ; mm7=(2*CrH * FIX(0.40200))
+        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrL * FIX(0.40200))
+
+        paddw   mm6,[GOTOFF(eax,PW_ONE)]        ; +1 so the psraw below rounds
+        paddw   mm4,[GOTOFF(eax,PW_ONE)]
+        psraw   mm6,1                   ; mm6=(CbH * -FIX(0.22800))
+        psraw   mm4,1                   ; mm4=(CbL * -FIX(0.22800))
+        paddw   mm7,[GOTOFF(eax,PW_ONE)]
+        paddw   mm0,[GOTOFF(eax,PW_ONE)]
+        psraw   mm7,1                   ; mm7=(CrH * FIX(0.40200))
+        psraw   mm0,1                   ; mm0=(CrL * FIX(0.40200))
+
+        paddw   mm6,mm5
+        paddw   mm4,mm2
+        paddw   mm6,mm5                 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+        paddw   mm4,mm2                 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+        paddw   mm7,mm1                 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+        paddw   mm0,mm3                 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+        movq    MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
+        movq    MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
+
+        movq      mm6,mm5               ; mm6=CbH
+        movq      mm7,mm2               ; mm7=CbL
+        punpcklwd mm5,mm1               ; mm5=(Cb,Cr) word pairs, lower CbH/CrH
+        punpckhwd mm6,mm1               ; mm6=(Cb,Cr) word pairs, upper CbH/CrH
+        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+        punpcklwd mm2,mm3               ; mm2=(Cb,Cr) word pairs, lower CbL/CrL
+        punpckhwd mm7,mm3               ; mm7=(Cb,Cr) word pairs, upper CbL/CrL
+        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     mm5,SCALEBITS
+        psrad     mm6,SCALEBITS
+        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     mm2,SCALEBITS
+        psrad     mm7,SCALEBITS
+
+        packssdw  mm5,mm6       ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+        packssdw  mm2,mm7       ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+        psubw     mm5,mm1       ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+        psubw     mm2,mm3       ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+        movq    MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
+
+        mov     al,2                    ; Yctr
+        jmp     short .Yloop_1st        ; 1st pass: mm0/mm2/mm4 hold the L halves
+        alignx  16,7
+
+.Yloop_2nd:
+        movq    mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
+        movq    mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
+        movq    mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
+        alignx  16,7
+
+.Yloop_1st:
+        movq    mm7, MMWORD [esi]       ; mm7=Y(01234567)
+
+        pcmpeqw mm6,mm6
+        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}
+        pand    mm6,mm7                 ; mm6=Y(0246)=YE
+        psrlw   mm7,BYTE_BIT            ; mm7=Y(1357)=YO
+
+        movq    mm1,mm0                 ; mm1=mm0=(R-Y)(L/H)
+        movq    mm3,mm2                 ; mm3=mm2=(G-Y)(L/H)
+        movq    mm5,mm4                 ; mm5=mm4=(B-Y)(L/H)
+
+        paddw     mm0,mm6               ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+        paddw     mm1,mm7               ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
+        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+        paddw     mm2,mm6               ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+        paddw     mm3,mm7               ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
+        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+        paddw     mm4,mm6               ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+        paddw     mm5,mm7               ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
+        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
+        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
+        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
+
+        movq      mmG,mmA
+        movq      mmH,mmA
+        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
+        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
+
+        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
+        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
+
+        movq      mmC,mmD
+        movq      mmB,mmD
+        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
+        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
+
+        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
+
+        movq      mmF,mmE
+        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
+        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
+
+        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
+        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
+        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
+
+        cmp     ecx, byte SIZEOF_MMWORD
+        jb      short .column_st16      ; < SIZEOF_MMWORD columns left: partial store
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
+        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+        sub     ecx, byte SIZEOF_MMWORD
+        jz      near .endcolumn
+
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
+        add     esi, byte SIZEOF_MMWORD                 ; inptr0
+        dec     al                      ; Yctr
+        jnz     near .Yloop_2nd
+
+        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
+        add     edx, byte SIZEOF_MMWORD                 ; inptr2
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st16:
+        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+        cmp     ecx, byte 2*SIZEOF_MMWORD
+        jb      short .column_st8
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
+        movq    mmA,mmC                 ; remaining bytes come from mmC
+        sub     ecx, byte 2*SIZEOF_MMWORD
+        add     edi, byte 2*SIZEOF_MMWORD
+        jmp     short .column_st4
+.column_st8:
+        cmp     ecx, byte SIZEOF_MMWORD
+        jb      short .column_st4
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    mmA,mmE                 ; remaining bytes come from mmE
+        sub     ecx, byte SIZEOF_MMWORD
+        add     edi, byte SIZEOF_MMWORD
+.column_st4:
+        movd    eax,mmA                 ; eax=low dword of the pending bytes
+        cmp     ecx, byte SIZEOF_DWORD
+        jb      short .column_st2
+        mov     DWORD [edi+0*SIZEOF_DWORD], eax
+        psrlq   mmA,DWORD_BIT
+        movd    eax,mmA                 ; eax=next dword of the pending bytes
+        sub     ecx, byte SIZEOF_DWORD
+        add     edi, byte SIZEOF_DWORD
+.column_st2:
+        cmp     ecx, byte SIZEOF_WORD
+        jb      short .column_st1
+        mov     WORD [edi+0*SIZEOF_WORD], ax
+        shr     eax,WORD_BIT
+        sub     ecx, byte SIZEOF_WORD
+        add     edi, byte SIZEOF_WORD
+.column_st1:
+        cmp     ecx, byte SIZEOF_BYTE
+        jb      short .endcolumn
+        mov     BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
+        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
+        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
+        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
+        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
+        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
+
+        movq      mmC,mmA
+        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
+        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
+        movq      mmG,mmB
+        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
+        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
+
+        movq      mmD,mmA
+        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
+        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
+        movq      mmH,mmC
+        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
+        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
+
+        cmp     ecx, byte SIZEOF_MMWORD
+        jb      short .column_st16      ; < SIZEOF_MMWORD columns left: partial store
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
+        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
+        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+        sub     ecx, byte SIZEOF_MMWORD
+        jz      short .endcolumn
+
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
+        add     esi, byte SIZEOF_MMWORD                 ; inptr0
+        dec     al                      ; Yctr
+        jnz     near .Yloop_2nd
+
+        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
+        add     edx, byte SIZEOF_MMWORD                 ; inptr2
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st16:
+        cmp     ecx, byte SIZEOF_MMWORD/2
+        jb      short .column_st8
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
+        movq    mmA,mmC                 ; remaining pixels come from mmC/mmH
+        movq    mmD,mmH
+        sub     ecx, byte SIZEOF_MMWORD/2
+        add     edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+        cmp     ecx, byte SIZEOF_MMWORD/4
+        jb      short .column_st4
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+        movq    mmA,mmD
+        sub     ecx, byte SIZEOF_MMWORD/4
+        add     edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+        cmp     ecx, byte SIZEOF_MMWORD/8
+        jb      short .endcolumn
+        movd    DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8                   ; JDIMENSION output_width
+%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
+
+        align   16
+        global  EXTN(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     eax, JDIMENSION [output_width(ebp)]
+
+        mov     edi, JSAMPIMAGE [input_buf(ebp)]
+        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
+        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+        mov     edi, JSAMPARRAY [output_buf(ebp)]
+        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi[ctr] == input_buf[0][2*ctr]
+
+        push    edx                     ; inptr2
+        push    ebx                     ; inptr1
+        push    esi                     ; inptr00
+        mov     ebx,esp                 ; ebx = temporary input_buf[3] on the stack
+
+        push    edi                     ; output_buf (outptr0)
+        push    ecx                     ; in_row_group_ctr
+        push    ebx                     ; input_buf
+        push    eax                     ; output_width
+
+        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
+        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
+        mov     POINTER [ebx+0*SIZEOF_POINTER], esi     ; patch temporary input_buf[0]
+        mov     POINTER [ebx-1*SIZEOF_POINTER], edi     ; patch stacked output_buf arg
+
+        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+        add     esp, byte 7*SIZEOF_DWORD        ; drop 4 args + 3 temporary pointers
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdmrgext-sse2-64.asm b/simd/jdmrgext-sse2-64.asm
new file mode 100644
index 0000000..989d7f1
--- /dev/null
+++ b/simd/jdmrgext-sse2-64.asm
@@ -0,0 +1,538 @@
+;
+; jdmrgext-sse2-64.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          3       ; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+
+        align   16
+        global  EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = rsp, which now points at the saved (original) rbp
+        sub     rsp, byte 4                     ; step below the saved rbp before rounding down
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; stash the unaligned stack pointer at the aligned frame base
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; reserve WK_NUM xmmword scratch slots below rbp
+        collect_args
+        push    rbx                             ; rbx is used as inptr1 below
+
+        mov     ecx, r10d        ; col
+        test    rcx,rcx
+        jz      near .return                    ; nothing to do when output_width == 0
+
+        push    rcx                             ; park col while rcx serves as the row index
+
+        mov     rdi, r11                        ; rdi = input_buf
+        mov     ecx, r12d                       ; rcx = in_row_group_ctr
+        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]       ; rsi = input_buf[0] (Y rows)
+        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]       ; rbx = input_buf[1] (Cb rows)
+        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]       ; rdx = input_buf[2] (Cr rows)
+        mov     rdi, r13                        ; rdi = output_buf
+        mov     rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]         ; inptr0
+        mov     rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]         ; inptr1
+        mov     rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]         ; inptr2
+        mov     rdi, JSAMPROW [rdi]                             ; outptr
+
+        pop     rcx                     ; col
+
+.columnloop:                            ; one pass = one xmmword of Cb and of Cr, feeding up to two Y passes
+
+        movdqa    xmm6, XMMWORD [rbx]   ; xmm6=Cb(0123456789ABCDEF)
+        movdqa    xmm7, XMMWORD [rdx]   ; xmm7=Cr(0123456789ABCDEF)
+
+        pxor      xmm1,xmm1             ; xmm1=(all 0's)
+        pcmpeqw   xmm3,xmm3             ; xmm3=(all 1's)
+        psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+        movdqa    xmm4,xmm6
+        punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
+        punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
+        movdqa    xmm0,xmm7
+        punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
+        punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
+
+        paddw     xmm6,xmm3             ; CbH -= 128 (0xFF80 == -128 as a signed word)
+        paddw     xmm4,xmm3             ; CbL -= 128
+        paddw     xmm7,xmm3             ; CrH -= 128
+        paddw     xmm0,xmm3             ; CrL -= 128
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movdqa  xmm5,xmm6               ; xmm5=CbH
+        movdqa  xmm2,xmm4               ; xmm2=CbL
+        paddw   xmm6,xmm6               ; xmm6=2*CbH
+        paddw   xmm4,xmm4               ; xmm4=2*CbL
+        movdqa  xmm1,xmm7               ; xmm1=CrH
+        movdqa  xmm3,xmm0               ; xmm3=CrL
+        paddw   xmm7,xmm7               ; xmm7=2*CrH
+        paddw   xmm0,xmm0               ; xmm0=2*CrL
+
+        pmulhw  xmm6,[rel PW_MF0228]    ; xmm6=(2*CbH * -FIX(0.22800))
+        pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbL * -FIX(0.22800))
+        pmulhw  xmm7,[rel PW_F0402]     ; xmm7=(2*CrH * FIX(0.40200))
+        pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrL * FIX(0.40200))
+
+        paddw   xmm6,[rel PW_ONE]       ; bias before the halving shift (rounding, assuming PW_ONE is all 1's -- confirm)
+        paddw   xmm4,[rel PW_ONE]
+        psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
+        psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
+        paddw   xmm7,[rel PW_ONE]
+        paddw   xmm0,[rel PW_ONE]
+        psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
+        psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
+
+        paddw   xmm6,xmm5               ; + CbH (first of two additions)
+        paddw   xmm4,xmm2               ; + CbL (first of two additions)
+        paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+        paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+        paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+        paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+        movdqa    xmm6,xmm5             ; xmm6=CbH
+        movdqa    xmm7,xmm2             ; xmm7=CbL
+        punpcklwd xmm5,xmm1             ; xmm5=(CbH,CrH) word pairs, low half
+        punpckhwd xmm6,xmm1             ; xmm6=(CbH,CrH) word pairs, high half
+        pmaddwd   xmm5,[rel PW_MF0344_F0285]    ; per pair: Cb*-FIX(0.344)+Cr*FIX(0.285) as dwords
+        pmaddwd   xmm6,[rel PW_MF0344_F0285]
+        punpcklwd xmm2,xmm3             ; xmm2=(CbL,CrL) word pairs, low half
+        punpckhwd xmm7,xmm3             ; xmm7=(CbL,CrL) word pairs, high half
+        pmaddwd   xmm2,[rel PW_MF0344_F0285]
+        pmaddwd   xmm7,[rel PW_MF0344_F0285]
+
+        paddd     xmm5,[rel PD_ONEHALF] ; rounding bias before descaling (value of PD_ONEHALF defined elsewhere)
+        paddd     xmm6,[rel PD_ONEHALF]
+        psrad     xmm5,SCALEBITS        ; descale the fixed-point products
+        psrad     xmm6,SCALEBITS
+        paddd     xmm2,[rel PD_ONEHALF]
+        paddd     xmm7,[rel PD_ONEHALF]
+        psrad     xmm2,SCALEBITS
+        psrad     xmm7,SCALEBITS
+
+        packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+        packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+        psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+        psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+        movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+        mov     al,2                    ; Yctr: up to two Y xmmwords are consumed per chroma load
+        jmp     short .Yloop_1st        ; 1st pass uses the L halves already in xmm0/xmm2/xmm4
+
+.Yloop_2nd:                             ; 2nd pass: reload the H halves saved in wk[]
+        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+
+.Yloop_1st:
+        movdqa  xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
+
+        pcmpeqw xmm6,xmm6               ; xmm6=(all 1's)
+        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+        pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
+        psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
+
+        movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
+        movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
+        movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
+
+        paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+        paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
+        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
+
+        paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+        paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
+        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
+
+        paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+        paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
+        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+        movdqa    xmmG,xmmA
+        movdqa    xmmH,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+        movdqa    xmmC,xmmD
+        movdqa    xmmB,xmmD
+        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+        movdqa    xmmF,xmmE
+        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+        movdqa    xmmB,xmmE
+        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+        movdqa    xmmB,xmmF
+        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        cmp     rcx, byte SIZEOF_XMMWORD        ; fewer than SIZEOF_XMMWORD columns left?
+        jb      short .column_st32              ; yes: store only the valid tail bytes
+
+        test    rdi, SIZEOF_XMMWORD-1           ; is outptr xmmword-aligned?
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     rcx, byte SIZEOF_XMMWORD        ; col -= SIZEOF_XMMWORD
+        jz      near .endcolumn                 ; row finished
+
+        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+        dec     al                      ; Yctr
+        jnz     near .Yloop_2nd                 ; second Y pass for the same chroma load
+
+        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+
+.column_st32:                                   ; tail: col < SIZEOF_XMMWORD, i.e. fewer than 3 xmmwords of RGB
+        lea     rcx, [rcx+rcx*2]                ; rcx = col * RGB_PIXELSIZE (bytes left to store)
+        cmp     rcx, byte 2*SIZEOF_XMMWORD
+        jb      short .column_st16
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmF                       ; remaining bytes come from the 3rd xmmword
+        sub     rcx, byte 2*SIZEOF_XMMWORD
+        jmp     short .column_st15              ; less than one xmmword remains
+.column_st16:
+        cmp     rcx, byte SIZEOF_XMMWORD
+        jb      short .column_st15
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        add     rdi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD                       ; remaining bytes come from the 2nd xmmword
+        sub     rcx, byte SIZEOF_XMMWORD
+.column_st15:
+        ; Store the lower 8 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp     rcx, byte SIZEOF_MMWORD
+        jb      short .column_st7
+        movq    XMM_MMWORD [rdi], xmmA
+        add     rdi, byte SIZEOF_MMWORD
+        sub     rcx, byte SIZEOF_MMWORD
+        psrldq  xmmA, SIZEOF_MMWORD             ; shift the stored bytes out of xmmA
+.column_st7:
+        ; Store the lower 4 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp     rcx, byte SIZEOF_DWORD
+        jb      short .column_st3
+        movd    XMM_DWORD [rdi], xmmA
+        add     rdi, byte SIZEOF_DWORD
+        sub     rcx, byte SIZEOF_DWORD
+        psrldq  xmmA, SIZEOF_DWORD              ; shift the stored bytes out of xmmA
+.column_st3:
+        ; Store the lower 2 bytes of rax to the output when it has enough
+        ; space.
+        movd    eax, xmmA                       ; eax = low dword of xmmA (pending tail bytes)
+        cmp     rcx, byte SIZEOF_WORD
+        jb      short .column_st1
+        mov     WORD [rdi], ax
+        add     rdi, byte SIZEOF_WORD
+        sub     rcx, byte SIZEOF_WORD
+        shr     rax, 16                         ; move the next pending byte into al
+.column_st1:
+        ; Store the lower 1 byte of rax to the output when it has enough
+        ; space.
+        test    rcx, rcx
+        jz      short .endcolumn
+        mov     BYTE [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%else
+        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%endif
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+        movdqa    xmmC,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+        movdqa    xmmG,xmmB
+        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        movdqa    xmmH,xmmC
+        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        cmp     rcx, byte SIZEOF_XMMWORD        ; fewer than SIZEOF_XMMWORD columns left?
+        jb      short .column_st32              ; yes: store only the valid tail pixels
+
+        test    rdi, SIZEOF_XMMWORD-1           ; is outptr xmmword-aligned?
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+        movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     rcx, byte SIZEOF_XMMWORD        ; col -= SIZEOF_XMMWORD
+        jz      near .endcolumn                 ; row finished
+
+        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+        dec     al                      ; Yctr
+        jnz     near .Yloop_2nd                 ; second Y pass for the same chroma load
+
+        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+
+.column_st32:                                   ; tail: rcx stays in pixels here (4 bytes each)
+        cmp     rcx, byte SIZEOF_XMMWORD/2      ; at least two xmmwords' worth of pixels?
+        jb      short .column_st16
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmC                       ; continue with the 3rd xmmword
+        movdqa  xmmD,xmmH                       ; ... and the 4th
+        sub     rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+        cmp     rcx, byte SIZEOF_XMMWORD/4      ; at least one xmmword's worth of pixels?
+        jb      short .column_st15
+        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+        add     rdi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD                       ; continue with the following xmmword
+        sub     rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+        ; space.
+        cmp     rcx, byte SIZEOF_XMMWORD/8
+        jb      short .column_st7
+        movq    XMM_MMWORD [rdi], xmmA
+        add     rdi, byte SIZEOF_XMMWORD/8*4    ; outptr += the two pixels just stored
+        sub     rcx, byte SIZEOF_XMMWORD/8
+        psrldq  xmmA, SIZEOF_XMMWORD/8*4        ; shift the stored pixels out of xmmA
+.column_st7:
+        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+        ; space.
+        test    rcx, rcx
+        jz      short .endcolumn
+        movd    XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+        sfence          ; flush the write buffer
+
+.return:
+        pop     rbx
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- unaligned stack pointer stashed in the prologue (points at saved rbp)
+        pop     rbp             ; restore the caller's rbp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+        align   16
+        global  EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):         ; h2v2 = two back-to-back calls to the h2v1 routine
+        push    rbp
+        mov     rax,rsp                 ; NOTE(review): rax is overwritten below; presumably read by collect_args -- confirm in its definition
+        mov     rbp,rsp
+        collect_args
+        push    rbx
+
+        mov     eax, r10d               ; eax = output_width
+
+        mov     rdi, r11                ; rdi = input_buf
+        mov     ecx, r12d               ; rcx = in_row_group_ctr
+        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]       ; rsi = input_buf[0]
+        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]       ; rbx = input_buf[1]
+        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]       ; rdx = input_buf[2]
+        mov     rdi, r13                ; rdi = output_buf
+        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; rsi = &input_buf[0][in_row_group_ctr]; the callee indexes by in_row_group_ctr again
+
+        push    rdx                     ; temporary input_buf[2] (leads to inptr2)
+        push    rbx                     ; temporary input_buf[1] (leads to inptr1)
+        push    rsi                     ; temporary input_buf[0] (leads to inptr00)
+        mov     rbx,rsp                 ; rbx = address of the temporary 3-entry input_buf
+
+        push    rdi                     ; save output_buf across the call
+        push    rcx                     ; save in_row_group_ctr across the call
+        push    rax                     ; save output_width across the call
+
+        %ifdef WIN64
+        mov r8, rcx                     ; arg 3 = in_row_group_ctr (copied before rcx is reused)
+        mov r9, rdi                     ; arg 4 = output_buf
+        mov rcx, rax                    ; arg 1 = output_width
+        mov rdx, rbx                    ; arg 2 = temporary input_buf
+        %else
+        mov rdx, rcx                    ; arg 3 = in_row_group_ctr (copied before rcx is reused)
+        mov rcx, rdi                    ; arg 4 = output_buf (copied before rdi is reused)
+        mov     rdi, rax                ; arg 1 = output_width
+        mov rsi, rbx                    ; arg 2 = temporary input_buf
+        %endif
+
+        call    EXTN(jsimd_h2v1_merged_upsample_sse2)   ; first output row
+
+        pop rax                         ; output_width
+        pop rcx                         ; in_row_group_ctr
+        pop rdi                         ; output_buf
+        pop rsi                         ; temporary input_buf[0]
+        pop rbx                         ; input_buf[1]
+        pop rdx                         ; input_buf[2]
+
+        add     rdi, byte SIZEOF_JSAMPROW       ; outptr1
+        add     rsi, byte SIZEOF_JSAMPROW       ; inptr01
+
+        push    rdx                     ; temporary input_buf[2] (leads to inptr2)
+        push    rbx                     ; temporary input_buf[1] (leads to inptr1)
+        push    rsi                     ; temporary input_buf[0] (leads to inptr01)
+        mov     rbx,rsp                 ; rbx = address of the rebuilt temporary input_buf
+
+        push    rdi                     ; save output_buf+1 across the call
+        push    rcx                     ; save in_row_group_ctr across the call
+        push    rax                     ; save output_width across the call
+
+        %ifdef WIN64
+        mov r8, rcx                     ; arg 3 = in_row_group_ctr (copied before rcx is reused)
+        mov r9, rdi                     ; arg 4 = output_buf+1
+        mov rcx, rax                    ; arg 1 = output_width
+        mov rdx, rbx                    ; arg 2 = temporary input_buf
+        %else
+        mov rdx, rcx                    ; arg 3 = in_row_group_ctr (copied before rcx is reused)
+        mov rcx, rdi                    ; arg 4 = output_buf+1 (copied before rdi is reused)
+        mov     rdi, rax                ; arg 1 = output_width
+        mov rsi, rbx                    ; arg 2 = temporary input_buf
+        %endif
+
+        call    EXTN(jsimd_h2v1_merged_upsample_sse2)   ; second output row
+
+        pop rax                         ; unwind the three saved values ...
+        pop rcx
+        pop rdi
+        pop rsi                         ; ... and the three temporary input_buf entries
+        pop rbx
+        pop rdx
+
+        pop     rbx                     ; restore the rbx pushed after collect_args
+        uncollect_args
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdmrgext-sse2.asm b/simd/jdmrgext-sse2.asm
new file mode 100644
index 0000000..c47916f
--- /dev/null
+++ b/simd/jdmrgext-sse2.asm
@@ -0,0 +1,519 @@
+;
+; jdmrgext-sse2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8                   ; JDIMENSION output_width
+%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
+
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          3
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]
+        pushpic eax             ; make a room for GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     ecx, JDIMENSION [output_width(eax)]     ; col
+        test    ecx,ecx
+        jz      near .return
+
+        push    ecx
+
+        mov     edi, JSAMPIMAGE [input_buf(eax)]
+        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
+        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+        mov     edi, JSAMPARRAY [output_buf(eax)]
+        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
+        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
+        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
+        mov     edi, JSAMPROW [edi]                             ; outptr
+
+        pop     ecx                     ; col
+
+        alignx  16,7
+.columnloop:
+        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+        movdqa    xmm6, XMMWORD [ebx]   ; xmm6=Cb(0123456789ABCDEF)
+        movdqa    xmm7, XMMWORD [edx]   ; xmm7=Cr(0123456789ABCDEF)
+
+        pxor      xmm1,xmm1             ; xmm1=(all 0's)
+        pcmpeqw   xmm3,xmm3
+        psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+        movdqa    xmm4,xmm6
+        punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
+        punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
+        movdqa    xmm0,xmm7
+        punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
+        punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
+
+        paddw     xmm6,xmm3
+        paddw     xmm4,xmm3
+        paddw     xmm7,xmm3
+        paddw     xmm0,xmm3
+
+        ; (Original)
+        ; R = Y                + 1.40200 * Cr
+        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+        ; B = Y + 1.77200 * Cb
+        ;
+        ; (This implementation)
+        ; R = Y                + 0.40200 * Cr + Cr
+        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+        ; B = Y - 0.22800 * Cb + Cb + Cb
+
+        movdqa  xmm5,xmm6               ; xmm5=CbH
+        movdqa  xmm2,xmm4               ; xmm2=CbL
+        paddw   xmm6,xmm6               ; xmm6=2*CbH
+        paddw   xmm4,xmm4               ; xmm4=2*CbL
+        movdqa  xmm1,xmm7               ; xmm1=CrH
+        movdqa  xmm3,xmm0               ; xmm3=CrL
+        paddw   xmm7,xmm7               ; xmm7=2*CrH
+        paddw   xmm0,xmm0               ; xmm0=2*CrL
+
+        pmulhw  xmm6,[GOTOFF(eax,PW_MF0228)]    ; xmm6=(2*CbH * -FIX(0.22800))
+        pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbL * -FIX(0.22800))
+        pmulhw  xmm7,[GOTOFF(eax,PW_F0402)]     ; xmm7=(2*CrH * FIX(0.40200))
+        pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrL * FIX(0.40200))
+
+        paddw   xmm6,[GOTOFF(eax,PW_ONE)]
+        paddw   xmm4,[GOTOFF(eax,PW_ONE)]
+        psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
+        psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
+        paddw   xmm7,[GOTOFF(eax,PW_ONE)]
+        paddw   xmm0,[GOTOFF(eax,PW_ONE)]
+        psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
+        psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
+
+        paddw   xmm6,xmm5
+        paddw   xmm4,xmm2
+        paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+        paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+        paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+        paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+        movdqa    xmm6,xmm5
+        movdqa    xmm7,xmm2
+        punpcklwd xmm5,xmm1
+        punpckhwd xmm6,xmm1
+        pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
+        punpcklwd xmm2,xmm3
+        punpckhwd xmm7,xmm3
+        pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+        pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+        paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     xmm5,SCALEBITS
+        psrad     xmm6,SCALEBITS
+        paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+        paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
+        psrad     xmm2,SCALEBITS
+        psrad     xmm7,SCALEBITS
+
+        packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+        packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+        psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+        psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+        movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+        mov     al,2                    ; Yctr
+        jmp     short .Yloop_1st
+        alignx  16,7
+
+.Yloop_2nd:
+        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+        alignx  16,7
+
+.Yloop_1st:
+        movdqa  xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
+
+        pcmpeqw xmm6,xmm6
+        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+        pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
+        psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
+
+        movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
+        movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
+        movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
+
+        paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+        paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
+        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
+
+        paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+        paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
+        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
+
+        paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+        paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
+        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+        movdqa    xmmG,xmmA
+        movdqa    xmmH,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+        movdqa    xmmC,xmmD
+        movdqa    xmmB,xmmD
+        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+        movdqa    xmmF,xmmE
+        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+        movdqa    xmmB,xmmE
+        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+        movdqa    xmmB,xmmF
+        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jb      short .column_st32
+
+        test    edi, SIZEOF_XMMWORD-1
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     ecx, byte SIZEOF_XMMWORD
+        jz      near .endcolumn
+
+        add     esi, byte SIZEOF_XMMWORD        ; inptr0
+        dec     al                      ; Yctr
+        jnz     near .Yloop_2nd
+
+        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
+        add     edx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st32:
+        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
+        cmp     ecx, byte 2*SIZEOF_XMMWORD
+        jb      short .column_st16
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmF
+        sub     ecx, byte 2*SIZEOF_XMMWORD
+        jmp     short .column_st15
+.column_st16:
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jb      short .column_st15
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        add     edi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD
+        sub     ecx, byte SIZEOF_XMMWORD
+.column_st15:
+        ; Store the lower 8 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp     ecx, byte SIZEOF_MMWORD
+        jb      short .column_st7
+        movq    XMM_MMWORD [edi], xmmA
+        add     edi, byte SIZEOF_MMWORD
+        sub     ecx, byte SIZEOF_MMWORD
+        psrldq  xmmA, SIZEOF_MMWORD
+.column_st7:
+        ; Store the lower 4 bytes of xmmA to the output when it has enough
+        ; space.
+        cmp     ecx, byte SIZEOF_DWORD
+        jb      short .column_st3
+        movd    XMM_DWORD [edi], xmmA
+        add     edi, byte SIZEOF_DWORD
+        sub     ecx, byte SIZEOF_DWORD
+        psrldq  xmmA, SIZEOF_DWORD
+.column_st3:
+        ; Store the lower 2 bytes of eax to the output when it has enough
+        ; space.
+        movd    eax, xmmA
+        cmp     ecx, byte SIZEOF_WORD
+        jb      short .column_st1
+        mov     WORD [edi], ax
+        add     edi, byte SIZEOF_WORD
+        sub     ecx, byte SIZEOF_WORD
+        shr     eax, 16
+.column_st1:
+        ; Store the lower 1 byte of eax to the output when it has enough
+        ; space.
+        test    ecx, ecx
+        jz      short .endcolumn
+        mov     BYTE [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%else
+        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%endif
+        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+        movdqa    xmmC,xmmA
+        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+        movdqa    xmmG,xmmB
+        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+        movdqa    xmmD,xmmA
+        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+        movdqa    xmmH,xmmC
+        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+        cmp     ecx, byte SIZEOF_XMMWORD
+        jb      short .column_st32
+
+        test    edi, SIZEOF_XMMWORD-1
+        jnz     short .out1
+        ; --(aligned)-------------------
+        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+        jmp     short .out0
+.out1:  ; --(unaligned)-----------------
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+        movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+        sub     ecx, byte SIZEOF_XMMWORD
+        jz      near .endcolumn
+
+        add     esi, byte SIZEOF_XMMWORD        ; inptr0
+        dec     al                      ; Yctr
+        jnz     near .Yloop_2nd
+
+        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
+        add     edx, byte SIZEOF_XMMWORD        ; inptr2
+        jmp     near .columnloop
+        alignx  16,7
+
+.column_st32:
+        cmp     ecx, byte SIZEOF_XMMWORD/2
+        jb      short .column_st16
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
+        movdqa  xmmA,xmmC
+        movdqa  xmmD,xmmH
+        sub     ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+        cmp     ecx, byte SIZEOF_XMMWORD/4
+        jb      short .column_st15
+        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+        add     edi, byte SIZEOF_XMMWORD        ; outptr
+        movdqa  xmmA,xmmD
+        sub     ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+        ; space.
+        cmp     ecx, byte SIZEOF_XMMWORD/8
+        jb      short .column_st7
+        movq    XMM_MMWORD [edi], xmmA
+        add     edi, byte SIZEOF_XMMWORD/8*4
+        sub     ecx, byte SIZEOF_XMMWORD/8
+        psrldq  xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+        ; space.
+        test    ecx, ecx
+        jz      short .endcolumn
+        movd    XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+        sfence          ; flush the write buffer
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Done as two calls to jsimd_h2v1_merged_upsample_sse2, one per output row.
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+; ebx, esi, edi and ebp are saved/restored here; eax, ecx and edx are not.
+
+%define output_width(b) (b)+8                   ; JDIMENSION output_width
+%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
+
+        align   16
+        global  EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     eax, JDIMENSION [output_width(ebp)]     ; eax = output_width
+
+        mov     edi, JSAMPIMAGE [input_buf(ebp)]
+        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
+        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]       ; input_buf[0]
+        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]       ; input_buf[1]
+        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]       ; input_buf[2]
+        mov     edi, JSAMPARRAY [output_buf(ebp)]
+        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = input_buf[0] + in_row_group_ctr entries
+        ; Build a temporary 3-entry input_buf array on the stack; ebx = its address.
+        push    edx                     ; inptr2
+        push    ebx                     ; inptr1
+        push    esi                     ; inptr00
+        mov     ebx,esp
+        ; Arguments for the h2v1 routine, pushed last-to-first (output_width ends up lowest).
+        push    edi                     ; output_buf (outptr0)
+        push    ecx                     ; in_row_group_ctr
+        push    ebx                     ; input_buf
+        push    eax                     ; output_width
+
+        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
+        ; 2nd row: step esi/edi by one entry and patch the stacked copies in place (assumes the callee kept ebx/esi/edi).
+        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
+        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
+        mov     POINTER [ebx+0*SIZEOF_POINTER], esi     ; entry 0 of the temporary array
+        mov     POINTER [ebx-1*SIZEOF_POINTER], edi     ; the pushed output_buf argument
+
+        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+        add     esp, byte 7*SIZEOF_DWORD        ; drop 4 arguments + 3 array entries
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm
deleted file mode 100644
index bfa4c86..0000000
--- a/simd/jdmrgmmx.asm
+++ /dev/null
@@ -1,464 +0,0 @@
-;
-; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		3
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_merged_upsample_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [output_width(eax)]	; col
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
-	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
-	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
-	mov	edi, JSAMPROW [edi]				; outptr
-
-	pop	ecx			; col
-
-	alignx	16,7
-.columnloop:
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
-	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
-
-	pxor      mm1,mm1		; mm1=(all 0's)
-	pcmpeqw   mm3,mm3
-	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
-
-	movq      mm4,mm6
-	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
-	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
-	movq      mm0,mm7
-	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
-	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
-
-	paddw     mm6,mm3
-	paddw     mm4,mm3
-	paddw     mm7,mm3
-	paddw     mm0,mm3
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movq	mm5,mm6			; mm5=CbH
-	movq	mm2,mm4			; mm2=CbL
-	paddw	mm6,mm6			; mm6=2*CbH
-	paddw	mm4,mm4			; mm4=2*CbL
-	movq	mm1,mm7			; mm1=CrH
-	movq	mm3,mm0			; mm3=CrL
-	paddw	mm7,mm7			; mm7=2*CrH
-	paddw	mm0,mm0			; mm0=2*CrL
-
-	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
-	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
-	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
-	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
-
-	paddw	mm6,[GOTOFF(eax,PW_ONE)]
-	paddw	mm4,[GOTOFF(eax,PW_ONE)]
-	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
-	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
-	paddw	mm7,[GOTOFF(eax,PW_ONE)]
-	paddw	mm0,[GOTOFF(eax,PW_ONE)]
-	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
-	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
-
-	paddw	mm6,mm5
-	paddw	mm4,mm2
-	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
-	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
-	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
-	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
-
-	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
-	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
-
-	movq      mm6,mm5
-	movq      mm7,mm2
-	punpcklwd mm5,mm1
-	punpckhwd mm6,mm1
-	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd mm2,mm3
-	punpckhwd mm7,mm3
-	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm5,SCALEBITS
-	psrad     mm6,SCALEBITS
-	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     mm2,SCALEBITS
-	psrad     mm7,SCALEBITS
-
-	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
-
-	mov	al,2			; Yctr
-	jmp	short .Yloop_1st
-	alignx	16,7
-
-.Yloop_2nd:
-	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
-	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
-	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
-	alignx	16,7
-
-.Yloop_1st:
-	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
-
-	pcmpeqw	mm6,mm6
-	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
-	pand	mm6,mm7			; mm6=Y(0246)=YE
-	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
-
-	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
-	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
-	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
-
-	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
-	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
-	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
-	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
-	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
-	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
-	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
-	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
-	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
-	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
-	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
-
-	movq      mmG,mmA
-	movq      mmH,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
-	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
-
-	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
-	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
-
-	movq      mmC,mmD
-	movq      mmB,mmD
-	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
-	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
-
-	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
-
-	movq      mmF,mmE
-	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
-	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
-
-	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
-	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
-	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	near .endcolumn
-
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_MMWORD
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
-	movq	mmA,mmC
-	sub	ecx, byte 2*SIZEOF_MMWORD
-	add	edi, byte 2*SIZEOF_MMWORD
-	jmp	short .column_st4
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmE
-	sub	ecx, byte SIZEOF_MMWORD
-	add	edi, byte SIZEOF_MMWORD
-.column_st4:
-	movd	eax,mmA
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st2
-	mov	DWORD [edi+0*SIZEOF_DWORD], eax
-	psrlq	mmA,DWORD_BIT
-	movd	eax,mmA
-	sub	ecx, byte SIZEOF_DWORD
-	add	edi, byte SIZEOF_DWORD
-.column_st2:
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi+0*SIZEOF_WORD], ax
-	shr	eax,WORD_BIT
-	sub	ecx, byte SIZEOF_WORD
-	add	edi, byte SIZEOF_WORD
-.column_st1:
-	cmp	ecx, byte SIZEOF_BYTE
-	jb	short .endcolumn
-	mov	BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
-	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
-	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
-	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
-	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
-
-	movq      mmC,mmA
-	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
-	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
-	movq      mmG,mmB
-	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
-	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
-
-	movq      mmD,mmA
-	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
-	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
-	movq      mmH,mmC
-	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
-	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
-
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st16
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-	sub	ecx, byte SIZEOF_MMWORD
-	jz	short .endcolumn
-
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
-	add	esi, byte SIZEOF_MMWORD			; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_MMWORD			; inptr1
-	add	edx, byte SIZEOF_MMWORD			; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st16:
-	cmp	ecx, byte SIZEOF_MMWORD/2
-	jb	short .column_st8
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
-	movq	mmA,mmC
-	movq	mmD,mmH
-	sub	ecx, byte SIZEOF_MMWORD/2
-	add	edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-	cmp	ecx, byte SIZEOF_MMWORD/4
-	jb	short .column_st4
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
-	movq	mmA,mmD
-	sub	ecx, byte SIZEOF_MMWORD/4
-	add	edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-	cmp	ecx, byte SIZEOF_MMWORD/8
-	jb	short .endcolumn
-	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-	align	16
-	global	EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_merged_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	eax, JDIMENSION [output_width(ebp)]
-
-	mov	edi, JSAMPIMAGE [input_buf(ebp)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(ebp)]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-	push	edx			; inptr2
-	push	ebx			; inptr1
-	push	esi			; inptr00
-	mov	ebx,esp
-
-	push	edi			; output_buf (outptr0)
-	push	ecx			; in_row_group_ctr
-	push	ebx			; input_buf
-	push	eax			; output_width
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-	add	esi, byte SIZEOF_JSAMPROW	; inptr01
-	add	edi, byte SIZEOF_JSAMPROW	; outptr1
-	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
-	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-	add	esp, byte 7*SIZEOF_DWORD
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
deleted file mode 100644
index 0936188..0000000
--- a/simd/jdmrgss2-64.asm
+++ /dev/null
@@ -1,538 +0,0 @@
-;
-; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009, 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		3
-
-	align	16
-	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	ecx, r10d	; col
-	test	rcx,rcx
-	jz	near .return
-
-	push	rcx
-
-	mov	rdi, r11
-	mov	ecx, r12d
-	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-	mov	rdi, r13
-	mov	rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]		; inptr0
-	mov	rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]		; inptr1
-	mov	rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]		; inptr2
-	mov	rdi, JSAMPROW [rdi]				; outptr
-
-	pop	rcx			; col
-
-.columnloop:
-
-	movdqa    xmm6, XMMWORD [rbx]	; xmm6=Cb(0123456789ABCDEF)
-	movdqa    xmm7, XMMWORD [rdx]	; xmm7=Cr(0123456789ABCDEF)
-
-	pxor      xmm1,xmm1		; xmm1=(all 0's)
-	pcmpeqw   xmm3,xmm3
-	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	movdqa    xmm4,xmm6
-	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
-	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
-	movdqa    xmm0,xmm7
-	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
-	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
-
-	paddw     xmm6,xmm3
-	paddw     xmm4,xmm3
-	paddw     xmm7,xmm3
-	paddw     xmm0,xmm3
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm5,xmm6		; xmm5=CbH
-	movdqa	xmm2,xmm4		; xmm2=CbL
-	paddw	xmm6,xmm6		; xmm6=2*CbH
-	paddw	xmm4,xmm4		; xmm4=2*CbL
-	movdqa	xmm1,xmm7		; xmm1=CrH
-	movdqa	xmm3,xmm0		; xmm3=CrL
-	paddw	xmm7,xmm7		; xmm7=2*CrH
-	paddw	xmm0,xmm0		; xmm0=2*CrL
-
-	pmulhw	xmm6,[rel PW_MF0228]	; xmm6=(2*CbH * -FIX(0.22800))
-	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbL * -FIX(0.22800))
-	pmulhw	xmm7,[rel PW_F0402]	; xmm7=(2*CrH * FIX(0.40200))
-	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrL * FIX(0.40200))
-
-	paddw	xmm6,[rel PW_ONE]
-	paddw	xmm4,[rel PW_ONE]
-	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
-	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
-	paddw	xmm7,[rel PW_ONE]
-	paddw	xmm0,[rel PW_ONE]
-	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
-	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
-
-	paddw	xmm6,xmm5
-	paddw	xmm4,xmm2
-	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
-
-	movdqa    xmm6,xmm5
-	movdqa    xmm7,xmm2
-	punpcklwd xmm5,xmm1
-	punpckhwd xmm6,xmm1
-	pmaddwd   xmm5,[rel PW_MF0344_F0285]
-	pmaddwd   xmm6,[rel PW_MF0344_F0285]
-	punpcklwd xmm2,xmm3
-	punpckhwd xmm7,xmm3
-	pmaddwd   xmm2,[rel PW_MF0344_F0285]
-	pmaddwd   xmm7,[rel PW_MF0344_F0285]
-
-	paddd     xmm5,[rel PD_ONEHALF]
-	paddd     xmm6,[rel PD_ONEHALF]
-	psrad     xmm5,SCALEBITS
-	psrad     xmm6,SCALEBITS
-	paddd     xmm2,[rel PD_ONEHALF]
-	paddd     xmm7,[rel PD_ONEHALF]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm7,SCALEBITS
-
-	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
-
-	mov	al,2			; Yctr
-	jmp	short .Yloop_1st
-
-.Yloop_2nd:
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
-
-.Yloop_1st:
-	movdqa	xmm7, XMMWORD [rsi]	; xmm7=Y(0123456789ABCDEF)
-
-	pcmpeqw	xmm6,xmm6
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
-	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
-
-	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
-	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
-	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
-
-	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
-	cmp	rcx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	rcx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_MMWORD
-	sub	rcx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_DWORD
-	sub	rcx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of rax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	rcx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [rdi], ax
-	add	rdi, byte SIZEOF_WORD
-	sub	rcx, byte SIZEOF_WORD
-	shr	rax, 16
-.column_st1:
-	; Store the lower 1 byte of rax to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .endcolumn
-	mov	BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	rcx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	rdi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	rcx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	rsi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	rbx, byte SIZEOF_XMMWORD	; inptr1
-	add	rdx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-
-.column_st32:
-	cmp	rcx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	rcx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	rcx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	XMM_MMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD/8*4
-	sub	rcx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	rcx, rcx
-	jz	short .endcolumn
-	movd	XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-	sfence		; flush the write buffer
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-	align	16
-	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	mov	eax, r10d
-
-	mov	rdi, r11
-	mov	ecx, r12d
-	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-	mov	rdi, r13
-	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-
-	push	rdx			; inptr2
-	push	rbx			; inptr1
-	push	rsi			; inptr00
-	mov	rbx,rsp
-
-	push	rdi
-	push	rcx
-	push	rax
-
-	%ifdef WIN64
-	mov r8, rcx
-	mov r9, rdi
-	mov rcx, rax
-	mov rdx, rbx
-	%else
-	mov rdx, rcx
-	mov rcx, rdi
-	mov	rdi, rax
-	mov rsi, rbx
-	%endif
-
-	call	EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	pop rax
-	pop rcx
-	pop rdi
-	pop rsi
-	pop rbx
-	pop rdx
-
-	add	rdi, byte SIZEOF_JSAMPROW	; outptr1
-	add	rsi, byte SIZEOF_JSAMPROW	; inptr01
-
-	push	rdx			; inptr2
-	push	rbx			; inptr1
-	push	rsi			; inptr00
-	mov	rbx,rsp
-
-	push	rdi
-	push	rcx
-	push	rax
-
-	%ifdef WIN64
-	mov r8, rcx
-	mov r9, rdi
-	mov rcx, rax
-	mov rdx, rbx
-	%else
-	mov rdx, rcx
-	mov rcx, rdi
-	mov	rdi, rax
-	mov rsi, rbx
-	%endif
-
-	call	EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	pop rax
-	pop rcx
-	pop rdi
-	pop rsi
-	pop rbx
-	pop rdx
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
deleted file mode 100644
index 1fd15ba..0000000
--- a/simd/jdmrgss2.asm
+++ /dev/null
@@ -1,519 +0,0 @@
-;
-; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2012 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-				
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		3
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	ecx, JDIMENSION [output_width(eax)]	; col
-	test	ecx,ecx
-	jz	near .return
-
-	push	ecx
-
-	mov	edi, JSAMPIMAGE [input_buf(eax)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(eax)]
-	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
-	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
-	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
-	mov	edi, JSAMPROW [edi]				; outptr
-
-	pop	ecx			; col
-
-	alignx	16,7
-.columnloop:
-	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
-
-	movdqa    xmm6, XMMWORD [ebx]	; xmm6=Cb(0123456789ABCDEF)
-	movdqa    xmm7, XMMWORD [edx]	; xmm7=Cr(0123456789ABCDEF)
-
-	pxor      xmm1,xmm1		; xmm1=(all 0's)
-	pcmpeqw   xmm3,xmm3
-	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-	movdqa    xmm4,xmm6
-	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
-	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
-	movdqa    xmm0,xmm7
-	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
-	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
-
-	paddw     xmm6,xmm3
-	paddw     xmm4,xmm3
-	paddw     xmm7,xmm3
-	paddw     xmm0,xmm3
-
-	; (Original)
-	; R = Y                + 1.40200 * Cr
-	; G = Y - 0.34414 * Cb - 0.71414 * Cr
-	; B = Y + 1.77200 * Cb
-	;
-	; (This implementation)
-	; R = Y                + 0.40200 * Cr + Cr
-	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-	; B = Y - 0.22800 * Cb + Cb + Cb
-
-	movdqa	xmm5,xmm6		; xmm5=CbH
-	movdqa	xmm2,xmm4		; xmm2=CbL
-	paddw	xmm6,xmm6		; xmm6=2*CbH
-	paddw	xmm4,xmm4		; xmm4=2*CbL
-	movdqa	xmm1,xmm7		; xmm1=CrH
-	movdqa	xmm3,xmm0		; xmm3=CrL
-	paddw	xmm7,xmm7		; xmm7=2*CrH
-	paddw	xmm0,xmm0		; xmm0=2*CrL
-
-	pmulhw	xmm6,[GOTOFF(eax,PW_MF0228)]	; xmm6=(2*CbH * -FIX(0.22800))
-	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbL * -FIX(0.22800))
-	pmulhw	xmm7,[GOTOFF(eax,PW_F0402)]	; xmm7=(2*CrH * FIX(0.40200))
-	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrL * FIX(0.40200))
-
-	paddw	xmm6,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
-	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
-	paddw	xmm7,[GOTOFF(eax,PW_ONE)]
-	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
-	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
-	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
-
-	paddw	xmm6,xmm5
-	paddw	xmm4,xmm2
-	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
-
-	movdqa    xmm6,xmm5
-	movdqa    xmm7,xmm2
-	punpcklwd xmm5,xmm1
-	punpckhwd xmm6,xmm1
-	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
-	punpcklwd xmm2,xmm3
-	punpckhwd xmm7,xmm3
-	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-	pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm5,SCALEBITS
-	psrad     xmm6,SCALEBITS
-	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-	paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
-	psrad     xmm2,SCALEBITS
-	psrad     xmm7,SCALEBITS
-
-	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
-
-	mov	al,2			; Yctr
-	jmp	short .Yloop_1st
-	alignx	16,7
-
-.Yloop_2nd:
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
-	alignx	16,7
-
-.Yloop_1st:
-	movdqa	xmm7, XMMWORD [esi]	; xmm7=Y(0123456789ABCDEF)
-
-	pcmpeqw	xmm6,xmm6
-	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
-	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
-	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
-
-	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
-	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
-	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
-
-	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
-	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
-
-	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
-	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
-
-	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
-	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-	movdqa    xmmG,xmmA
-	movdqa    xmmH,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-	movdqa    xmmC,xmmD
-	movdqa    xmmB,xmmD
-	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-	movdqa    xmmF,xmmE
-	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-	movdqa    xmmB,xmmE
-	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-	movdqa    xmmB,xmmF
-	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
-	cmp	ecx, byte 2*SIZEOF_XMMWORD
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmF
-	sub	ecx, byte 2*SIZEOF_XMMWORD
-	jmp	short .column_st15
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD
-.column_st15:
-	; Store the lower 8 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_MMWORD
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_MMWORD
-	sub	ecx, byte SIZEOF_MMWORD
-	psrldq	xmmA, SIZEOF_MMWORD
-.column_st7:
-	; Store the lower 4 bytes of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_DWORD
-	jb	short .column_st3
-	movd	XMM_DWORD [edi], xmmA
-	add	edi, byte SIZEOF_DWORD
-	sub	ecx, byte SIZEOF_DWORD
-	psrldq	xmmA, SIZEOF_DWORD
-.column_st3:
-	; Store the lower 2 bytes of eax to the output when it has enough
-	; space.
-	movd	eax, xmmA
-	cmp	ecx, byte SIZEOF_WORD
-	jb	short .column_st1
-	mov	WORD [edi], ax
-	add	edi, byte SIZEOF_WORD
-	sub	ecx, byte SIZEOF_WORD
-	shr	eax, 16
-.column_st1:
-	; Store the lower 1 byte of eax to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .endcolumn
-	mov	BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%else
-	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
-	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
-%endif
-	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-	movdqa    xmmC,xmmA
-	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-	movdqa    xmmG,xmmB
-	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-	movdqa    xmmD,xmmA
-	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-	movdqa    xmmH,xmmC
-	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-	cmp	ecx, byte SIZEOF_XMMWORD
-	jb	short .column_st32
-
-	test	edi, SIZEOF_XMMWORD-1
-	jnz	short .out1
-	; --(aligned)-------------------
-	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	jmp	short .out0
-.out1:	; --(unaligned)-----------------
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
-	sub	ecx, byte SIZEOF_XMMWORD
-	jz	near .endcolumn
-
-	add	esi, byte SIZEOF_XMMWORD	; inptr0
-	dec	al			; Yctr
-	jnz	near .Yloop_2nd
-
-	add	ebx, byte SIZEOF_XMMWORD	; inptr1
-	add	edx, byte SIZEOF_XMMWORD	; inptr2
-	jmp	near .columnloop
-	alignx	16,7
-
-.column_st32:
-	cmp	ecx, byte SIZEOF_XMMWORD/2
-	jb	short .column_st16
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmC
-	movdqa	xmmD,xmmH
-	sub	ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-	cmp	ecx, byte SIZEOF_XMMWORD/4
-	jb	short .column_st15
-	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	movdqa	xmmA,xmmD
-	sub	ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-	; Store two pixels (8 bytes) of xmmA to the output when it has enough
-	; space.
-	cmp	ecx, byte SIZEOF_XMMWORD/8
-	jb	short .column_st7
-	movq	XMM_MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD/8*4
-	sub	ecx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-	; Store one pixel (4 bytes) of xmmA to the output when it has enough
-	; space.
-	test	ecx, ecx
-	jz	short .endcolumn
-	movd	XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-	sfence		; flush the write buffer
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b)	(b)+8			; JDIMENSION output_width
-%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
-%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
-
-	align	16
-	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	eax, POINTER [output_width(ebp)]
-
-	mov	edi, JSAMPIMAGE [input_buf(ebp)]
-	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
-	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-	mov	edi, JSAMPARRAY [output_buf(ebp)]
-	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-	push	edx			; inptr2
-	push	ebx			; inptr1
-	push	esi			; inptr00
-	mov	ebx,esp
-
-	push	edi			; output_buf (outptr0)
-	push	ecx			; in_row_group_ctr
-	push	ebx			; input_buf
-	push	eax			; output_width
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	add	esi, byte SIZEOF_JSAMPROW	; inptr01
-	add	edi, byte SIZEOF_JSAMPROW	; outptr1
-	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
-	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
-
-	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-	add	esp, byte 7*SIZEOF_DWORD
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdsammmx.asm b/simd/jdsammmx.asm
deleted file mode 100644
index d92a8c9..0000000
--- a/simd/jdsammmx.asm
+++ /dev/null
@@ -1,737 +0,0 @@
-;
-; jdsammmx.asm - upsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fancy_upsample_mmx) PRIVATE
-
-EXTN(jconst_fancy_upsample_mmx):
-
-PW_ONE		times 4 dw  1
-PW_TWO		times 4 dw  2
-PW_THREE	times 4 dw  3
-PW_SEVEN	times 4 dw  7
-PW_EIGHT	times 4 dw  8
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_fancy_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-	test	eax,eax
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	near .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax			; colctr
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr
-
-	test	eax, SIZEOF_MMWORD-1
-	jz	short .skip
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-.skip:
-	pxor	mm0,mm0			; mm0=(all 0's)
-	pcmpeqb	mm7,mm7
-	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
-	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]
-
-	add	eax, byte SIZEOF_MMWORD-1
-	and	eax, byte -SIZEOF_MMWORD
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	short .columnloop
-	alignx	16,7
-
-.columnloop_last:
-	pcmpeqb	mm6,mm6
-	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]
-	jmp	short .upsample
-	alignx	16,7
-
-.columnloop:
-	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]
-	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-
-.upsample:
-	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
-	movq	mm2,mm1
-	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
-	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
-	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
-
-	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
-	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
-
-	movq	mm7,mm1
-	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
-
-	movq      mm4,mm1
-	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
-	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
-	movq      mm5,mm2
-	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
-	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
-	movq      mm6,mm3
-	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
-	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
-
-	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
-	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
-	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
-	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
-
-	paddw	mm2,mm1
-	paddw	mm5,mm4
-	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
-	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
-	paddw	mm3,mm1
-	paddw	mm6,mm4
-	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
-	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
-
-	psllw	mm3,BYTE_BIT
-	psllw	mm6,BYTE_BIT
-	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
-	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
-
-	sub	eax, byte SIZEOF_MMWORD
-	add	esi, byte 1*SIZEOF_MMWORD	; inptr
-	add	edi, byte 2*SIZEOF_MMWORD	; outptr
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	esi
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		4
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_fancy_upsample_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	edx,eax				; edx = original ebp
-	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-	test	eax,eax
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(edx)]	; rowctr
-	test	ecx,ecx
-	jz	near .return
-
-	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
-	mov	edi, POINTER [output_data_ptr(edx)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax					; colctr
-	push	ecx
-	push	edi
-	push	esi
-
-	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-
-	test	eax, SIZEOF_MMWORD-1
-	jz	short .skip
-	push	edx
-	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-	pop	edx
-.skip:
-	; -- process the first column block
-
-	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
-	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
-	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      mm3,mm3		; mm3=(all 0's)
-	movq      mm4,mm0
-	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
-	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
-	movq      mm5,mm1
-	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
-	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
-	movq      mm6,mm2
-	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
-	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
-
-	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-
-	pcmpeqb	mm7,mm7
-	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
-
-	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
-	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
-	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
-	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
-
-	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
-	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
-
-	pand	mm1,mm7			; mm1=( 0 - - -)
-	pand	mm2,mm7			; mm2=( 0 - - -)
-
-	movq	MMWORD [wk(0)], mm1
-	movq	MMWORD [wk(1)], mm2
-
-	poppic	ebx
-
-	add	eax, byte SIZEOF_MMWORD-1
-	and	eax, byte -SIZEOF_MMWORD
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	short .columnloop
-	alignx	16,7
-
-.columnloop_last:
-	; -- process the last column block
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pcmpeqb	mm1,mm1
-	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
-	movq	mm2,mm1
-
-	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
-	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
-
-	movq	MMWORD [wk(2)], mm1
-	movq	MMWORD [wk(3)], mm2
-
-	jmp	short .upsample
-	alignx	16,7
-
-.columnloop:
-	; -- process the next column block
-
-	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
-	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
-	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
-
-	pushpic	ebx
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      mm3,mm3		; mm3=(all 0's)
-	movq      mm4,mm0
-	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
-	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
-	movq      mm5,mm1
-	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
-	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
-	movq      mm6,mm2
-	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
-	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
-
-	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-
-	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
-	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
-	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
-	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
-
-	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
-	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
-
-	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
-	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
-
-	movq	MMWORD [wk(2)], mm1
-	movq	MMWORD [wk(3)], mm2
-
-.upsample:
-	; -- process the upper row
-
-	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
-	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
-
-	movq	mm0,mm7
-	movq	mm4,mm3
-	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
-	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
-	movq	mm5,mm7
-	movq	mm6,mm3
-	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
-	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
-
-	por	mm0,mm4				; mm0=( 1 2 3 4)
-	por	mm5,mm6				; mm5=( 3 4 5 6)
-
-	movq	mm1,mm7
-	movq	mm2,mm3
-	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
-	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
-	movq	mm4,mm3
-	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
-
-	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
-	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
-
-	movq	MMWORD [wk(0)], mm4
-
-	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
-	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
-	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	mm1,mm7
-	paddw	mm5,mm3
-	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
-	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
-	paddw	mm0,mm7
-	paddw	mm2,mm3
-	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
-	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
-
-	psllw	mm0,BYTE_BIT
-	psllw	mm2,BYTE_BIT
-	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
-	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
-
-	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
-	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
-
-	; -- process the lower row
-
-	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
-	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
-
-	movq	mm7,mm6
-	movq	mm3,mm4
-	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
-	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
-	movq	mm0,mm6
-	movq	mm2,mm4
-	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
-	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
-
-	por	mm7,mm3				; mm7=( 1 2 3 4)
-	por	mm0,mm2				; mm0=( 3 4 5 6)
-
-	movq	mm1,mm6
-	movq	mm5,mm4
-	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
-	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
-	movq	mm3,mm4
-	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
-
-	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
-	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
-
-	movq	MMWORD [wk(1)], mm3
-
-	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
-	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
-	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
-	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	mm1,mm6
-	paddw	mm0,mm4
-	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
-	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
-	paddw	mm7,mm6
-	paddw	mm5,mm4
-	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
-	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
-
-	psllw	mm7,BYTE_BIT
-	psllw	mm5,BYTE_BIT
-	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
-	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
-
-	poppic	ebx
-
-	sub	eax, byte SIZEOF_MMWORD
-	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
-	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
-	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
-	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
-	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
-	cmp	eax, byte SIZEOF_MMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	esi
-	pop	edi
-	pop	ecx
-	pop	eax
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	near .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v1_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_MMWORD)-1
-	and	edx, byte -(2*SIZEOF_MMWORD)
-	jz	short .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	short .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-	mov	eax,edx				; colctr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-	movq      mm1,mm0
-	punpcklbw mm0,mm0
-	punpckhbw mm1,mm1
-
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-	movq      mm3,mm2
-	punpcklbw mm2,mm2
-	punpckhbw mm3,mm3
-
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr
-	add	edi, byte 4*SIZEOF_MMWORD	; outptr
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi
-	pop	edi
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	short .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_upsample_mmx) PRIVATE
-
-EXTN(jsimd_h2v2_upsample_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_MMWORD)-1
-	and	edx, byte -(2*SIZEOF_MMWORD)
-	jz	near .return
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	short .return
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]			; inptr
-	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-	mov	eax,edx					; colctr
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-	movq      mm1,mm0
-	punpcklbw mm0,mm0
-	punpckhbw mm1,mm1
-
-	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0
-	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
-	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
-	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-	movq      mm3,mm2
-	punpcklbw mm2,mm2
-	punpckhbw mm3,mm3
-
-	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
-	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
-	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-	sub	eax, byte 2*SIZEOF_MMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_MMWORD	; inptr
-	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
-	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi
-	pop	edi
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	short .rowloop
-
-	emms		; empty MMX state
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdsample-altivec.c b/simd/jdsample-altivec.c
new file mode 100644
index 0000000..63d6d8c
--- /dev/null
+++ b/simd/jdsample-altivec.c
@@ -0,0 +1,392 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+/* 2:1 horizontal upsampling, per row: out = (3*cur + neighbor + bias) >> 2 */
+void
+jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+    out;
+  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
+    next0l, next0h, outle, outhe, outlo, outho;
+
+  /* Constants (the *_index vectors are vec_perm byte selectors) */
+  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
+    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
+    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
+    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
+    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+#if __BIG_ENDIAN__
+    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* Width not a multiple of 16: repeat the last sample one past the end */
+    if (downsampled_width & 15)
+      inptr[downsampled_width] = inptr[downsampled_width - 1];
+    /* Prime block 0; its "previous" vector repeats sample 0 at the left */
+    this0 = vec_ld(0, inptr);
+    p_last0 = vec_perm(this0, this0, last_index_col0);
+    last0 = this0;
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr += 16, outptr += 32) {
+      /* Later blocks: "previous" = last byte of prior vec + 15 of this one */
+      if (downsampled_width - incol > 0) {
+        p_last0 = vec_perm(last0, this0, last_index);
+        last0 = this0;
+      }
+      /* "Next" neighbors; the final block repeats element 15 at the right */
+      if (incol <= 16)
+        p_next0 = vec_perm(this0, this0, next_index_lastcol);
+      else {
+        next0 = vec_ld(16, inptr);
+        p_next0 = vec_perm(this0, next0, next_index);
+      }
+      /* 3 * current samples as 16-bit even/odd products, re-interleaved */
+      this0e = (__vector short)vec_mule(this0, pb_three);
+      this0o = (__vector short)vec_mulo(this0, pb_three);
+      this0l = vec_mergeh(this0e, this0o);
+      this0h = vec_mergel(this0e, this0o);
+      /* Neighbor words; VEC_UNPACK?U presumably zero-extend (see header) */
+      last0l = (__vector short)VEC_UNPACKHU(p_last0);
+      last0h = (__vector short)VEC_UNPACKLU(p_last0);
+      last0l = vec_add(last0l, pw_one);
+
+      next0l = (__vector short)VEC_UNPACKHU(p_next0);
+      next0h = (__vector short)VEC_UNPACKLU(p_next0);
+      next0l = vec_add(next0l, pw_two);
+      /* even out = (3*cur + last + 1) >> 2, odd out = (3*cur + next + 2) >> 2 */
+      outle = vec_add(this0l, last0l);
+      outlo = vec_add(this0l, next0l);
+      outle = vec_sr(outle, (__vector unsigned short)pw_two);
+      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
+      /* Interleave one byte per word of outle/outlo: 16 output samples */
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr);
+      /* Upper eight samples only when more than 8 remain in this row */
+      if (incol > 8) {
+        last0h = vec_add(last0h, pw_one);
+        next0h = vec_add(next0h, pw_two);
+
+        outhe = vec_add(this0h, last0h);
+        outho = vec_add(this0h, next0h);
+        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
+        outho = vec_sr(outho, (__vector unsigned short)pw_two);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr);
+      }
+      /* The vector loaded as "next" becomes the current one */
+      this0 = next0;
+    }
+  }
+}
+
+/* 2:1 horizontal + 2:1 vertical upsampling; 2 output rows per input row */
+void
+jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char this_1, this0, this1, out;
+  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
+    lastcolsum_1h, lastcolsum1h,
+    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+    nextcolsum_1l = {0}, nextcolsum_1h = {0},
+    nextcolsum1l = {0}, nextcolsum1h = {0},
+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+    tmpl, tmph, outle, outhe, outlo, outho;
+
+  /* Constants (*_index select bytes for vec_perm; steps of 2 = one word) */
+  __vector unsigned char pb_zero = { __16X(0) },
+    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
+    last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
+    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
+    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+#if __BIG_ENDIAN__
+    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+#else
+    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+#endif
+  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
+    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
+  __vector unsigned short pw_four = { __8X(4) };
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    /* Uses rows inrow-1, inrow, inrow+1; input_data[-1] is read for inrow 0 */
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    /* Width not a multiple of 16: repeat each row's last sample once */
+    if (downsampled_width & 15) {
+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+    }
+    /* this0l/this0h = 3 * current-row samples, held as 16-bit words */
+    this0 = vec_ld(0, inptr0);
+    this0l = (__vector short)VEC_UNPACKHU(this0);
+    this0h = (__vector short)VEC_UNPACKLU(this0);
+    this0l = vec_mladd(this0l, pw_three, pw_zero);
+    this0h = vec_mladd(this0h, pw_three, pw_zero);
+    /* thiscolsum_1* = 3*current + row above; p_last* = previous column */
+    this_1 = vec_ld(0, inptr_1);
+    this_1l = (__vector short)VEC_UNPACKHU(this_1);
+    this_1h = (__vector short)VEC_UNPACKLU(this_1);
+    thiscolsum_1l = vec_add(this0l, this_1l);
+    thiscolsum_1h = vec_add(this0h, this_1h);
+    lastcolsum_1h = thiscolsum_1h;
+    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
+    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+    /* thiscolsum1* = 3*current + row below, prepared the same way */
+    this1 = vec_ld(0, inptr1);
+    this1l = (__vector short)VEC_UNPACKHU(this1);
+    this1h = (__vector short)VEC_UNPACKLU(this1);
+    thiscolsum1l = vec_add(this0l, this1l);
+    thiscolsum1h = vec_add(this0h, this1h);
+    lastcolsum1h = thiscolsum1h;
+    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
+    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
+         outptr0 += 32, outptr1 += 32) {
+      /* After block 0, previous-column sums reach into the prior vector */
+      if (downsampled_width - incol > 0) {
+        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
+        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
+        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
+        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
+        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
+      }
+      /* Next-column sums: the last block repeats its final word at the edge */
+      if (incol <= 16) {
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
+                                   next_index_lastcol);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
+                                  next_index_lastcol);
+      } else {
+        this0 = vec_ld(16, inptr0);
+        this0l = (__vector short)VEC_UNPACKHU(this0);
+        this0h = (__vector short)VEC_UNPACKLU(this0);
+        this0l = vec_mladd(this0l, pw_three, pw_zero);
+        this0h = vec_mladd(this0h, pw_three, pw_zero);
+
+        this_1 = vec_ld(16, inptr_1);
+        this_1l = (__vector short)VEC_UNPACKHU(this_1);
+        this_1h = (__vector short)VEC_UNPACKLU(this_1);
+        nextcolsum_1l = vec_add(this0l, this_1l);
+        nextcolsum_1h = vec_add(this0h, this_1h);
+        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
+        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
+
+        this1 = vec_ld(16, inptr1);
+        this1l = (__vector short)VEC_UNPACKHU(this1);
+        this1h = (__vector short)VEC_UNPACKLU(this1);
+        nextcolsum1l = vec_add(this0l, this1l);
+        nextcolsum1h = vec_add(this0h, this1h);
+        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
+        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
+      }
+
+      /* Process the upper row */
+      /* even = (3*sum + prev + 8) >> 4, odd = (3*sum + next + 7) >> 4 */
+      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum_1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum_1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr0);
+      /* High half only when more than 8 samples remain in this row */
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum_1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum_1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr0);
+      }
+
+      /* Process the lower row */
+      /* Same filter, using the column sums taken with the row below */
+      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
+      outle = vec_add(tmpl, p_lastcolsum1l);
+      outle = vec_add(outle, pw_eight);
+      outle = vec_sr(outle, pw_four);
+
+      outlo = vec_add(tmpl, p_nextcolsum1l);
+      outlo = vec_add(outlo, pw_seven);
+      outlo = vec_sr(outlo, pw_four);
+
+      out = vec_perm((__vector unsigned char)outle,
+                     (__vector unsigned char)outlo, merge_pack_index);
+      vec_st(out, 0, outptr1);
+
+      if (incol > 8) {
+        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
+        outhe = vec_add(tmph, p_lastcolsum1h);
+        outhe = vec_add(outhe, pw_eight);
+        outhe = vec_sr(outhe, pw_four);
+
+        outho = vec_add(tmph, p_nextcolsum1h);
+        outho = vec_add(outho, pw_seven);
+        outho = vec_sr(outho, pw_four);
+
+        out = vec_perm((__vector unsigned char)outhe,
+                       (__vector unsigned char)outho, merge_pack_index);
+        vec_st(out, 16, outptr1);
+      }
+      /* Slide the window: next-column sums become the current ones */
+      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
+      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
+    }
+  }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+/* Plain 2:1 horizontal upsampling: every input sample is written twice */
+void
+jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
+                             JDIMENSION output_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* Output width rounded up to a multiple of 32; up to 64 bytes per pass */
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr += 64) {
+      /* Merging a vector with itself doubles each of its bytes */
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr);
+      vec_st(inh, 16, outptr);
+      /* Second 16 input bytes only if more than 32 output bytes remain */
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr);
+        vec_st(inh, 48, outptr);
+      }
+    }
+  }
+}
+
+/* Plain 2:1 horizontal and vertical upsampling: samples and rows doubled */
+void
+jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
+                             JDIMENSION output_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __vector unsigned char in, inl, inh;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    /* One input row feeds two consecutive output rows */
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    /* Output width rounded up to a multiple of 32; up to 64 bytes per pass */
+    for (incol = (output_width + 31) & (~31); incol > 0;
+         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
+      /* Merging a vector with itself doubles each of its bytes */
+      in = vec_ld(0, inptr);
+      inl = vec_mergeh(in, in);
+      inh = vec_mergel(in, in);
+
+      vec_st(inl, 0, outptr0);
+      vec_st(inl, 0, outptr1);
+
+      vec_st(inh, 16, outptr0);
+      vec_st(inh, 16, outptr1);
+      /* Second 16 input bytes only if more than 32 output bytes remain */
+      if (incol > 32) {
+        in = vec_ld(16, inptr);
+        inl = vec_mergeh(in, in);
+        inh = vec_mergel(in, in);
+
+        vec_st(inl, 32, outptr0);
+        vec_st(inl, 32, outptr1);
+
+        vec_st(inh, 48, outptr0);
+        vec_st(inh, 48, outptr1);
+      }
+    }
+  }
+}
diff --git a/simd/jdsample-mmx.asm b/simd/jdsample-mmx.asm
new file mode 100644
index 0000000..c9e2b8b
--- /dev/null
+++ b/simd/jdsample-mmx.asm
@@ -0,0 +1,737 @@
+;
+; jdsample.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE          times 4 dw  1          ; h2v1 fancy: rounding term, even outputs
+PW_TWO          times 4 dw  2          ; h2v1 fancy: rounding term, odd outputs
+PW_THREE        times 4 dw  3          ; multiplier applied to the current sample
+PW_SEVEN        times 4 dw  7          ; h2v2 fancy: rounding term, odd outputs
+PW_EIGHT        times 4 dw  8          ; h2v2 fancy: rounding term, even outputs
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+; Rows are handled in whole MMWORD blocks (the width is rounded up internally).
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+        push    ebp
+        mov     ebp,esp
+        pushpic ebx             ; (jsimdext.inc macro) save ebx, used for the GOT
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+        test    eax,eax
+        jz      near .return    ; zero width: nothing to do
+
+        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
+        test    ecx,ecx
+        jz      near .return    ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(ebp)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data
+        alignx  16,7
+.rowloop:
+        push    eax                     ; colctr
+        push    edi                     ; save output_data row cursor
+        push    esi                     ; save input_data row cursor
+
+        mov     esi, JSAMPROW [esi]     ; inptr
+        mov     edi, JSAMPROW [edi]     ; outptr
+
+        test    eax, SIZEOF_MMWORD-1
+        jz      short .skip             ; width already a whole number of blocks
+        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]        ; last real sample
+        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+        pxor    mm0,mm0                 ; mm0=(all 0's)
+        pcmpeqb mm7,mm7                 ; mm7=(all 1's)
+        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mask for the lowest byte
+        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]       ; sample 0 stands in for -1
+
+        add     eax, byte SIZEOF_MMWORD-1
+        and     eax, byte -SIZEOF_MMWORD        ; round colctr up to whole blocks
+        cmp     eax, byte SIZEOF_MMWORD
+        ja      short .columnloop       ; more than one block in this row
+        alignx  16,7
+
+.columnloop_last:
+        pcmpeqb mm6,mm6                 ; mm6=(all 1's)
+        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mask for the highest byte
+        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]       ; sample 7 stands in for 8
+        jmp     short .upsample
+        alignx  16,7
+
+.columnloop:
+        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]       ; peek at the next block
+        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm6=( - - - - - - - 8)
+
+.upsample:
+        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+        movq    mm2,mm1
+        movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
+        psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
+        psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)
+
+        por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
+        por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)
+
+        movq    mm7,mm1
+        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
+
+        movq      mm4,mm1
+        punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
+        punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
+        movq      mm5,mm2
+        punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
+        punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
+        movq      mm6,mm3
+        punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
+        punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)
+
+        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]      ; 3 * current sample (low half)
+        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]      ; 3 * current sample (high half)
+        paddw   mm2,[GOTOFF(ebx,PW_ONE)]        ; left neighbour + 1
+        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
+        paddw   mm3,[GOTOFF(ebx,PW_TWO)]        ; right neighbour + 2
+        paddw   mm6,[GOTOFF(ebx,PW_TWO)]
+
+        paddw   mm2,mm1                 ; 3*cur + left + 1
+        paddw   mm5,mm4
+        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
+        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
+        paddw   mm3,mm1                 ; 3*cur + right + 2
+        paddw   mm6,mm4
+        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
+        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)
+
+        psllw   mm3,BYTE_BIT            ; odd outputs -> high byte of each word
+        psllw   mm6,BYTE_BIT
+        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
+        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2       ; store two output blocks
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+        sub     eax, byte SIZEOF_MMWORD         ; one input block consumed
+        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
+        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
+        cmp     eax, byte SIZEOF_MMWORD
+        ja      near .columnloop        ; more than one block left
+        test    eax,eax
+        jnz     near .columnloop_last   ; exactly one block left
+
+        pop     esi                     ; restore the row cursors and colctr
+        pop     edi
+        pop     eax
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_data
+        add     edi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     ecx                             ; rowctr
+        jg      near .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        poppic  ebx             ; (jsimdext.inc macro) counterpart of pushpic above
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+; Reads input rows -1, 0, +1 per pass; output rows double as scratch space.
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp    ebp+0                   ; slot holding the unaligned frame ptr
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          4                       ; wk(0..1)=left, wk(2..3)=right edges
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
+
+        align   16
+        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4                     ; room for the slot written below
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; remembered for "pop esp" at .return
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area
+        pushpic eax             ; make room for the GOT address
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     edx,eax                         ; edx = original ebp
+        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+        test    eax,eax
+        jz      near .return                    ; zero width: nothing to do
+
+        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
+        test    ecx,ecx
+        jz      near .return                    ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(edx)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data
+        alignx  16,7
+.rowloop:
+        push    eax                                     ; colctr
+        push    ecx                                     ; rowctr
+        push    edi                                     ; output_data row cursor
+        push    esi                                     ; input_data row cursor
+
+        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
+        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
+        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
+
+        test    eax, SIZEOF_MMWORD-1
+        jz      short .skip             ; width already a whole number of blocks
+        push    edx                     ; keep outptr0 while dl is used as a temp
+        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+        pop     edx
+.skip:
+        ; -- process the first column block
+
+        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
+        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
+        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]
+
+        pushpic ebx                     ; (macro) park inptr0 while ebx holds the GOT
+        movpic  ebx, POINTER [gotptr]   ; load GOT address
+
+        pxor      mm3,mm3               ; mm3=(all 0's)
+        movq      mm4,mm0
+        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
+        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
+        movq      mm5,mm1
+        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
+        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
+        movq      mm6,mm2
+        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
+        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)
+
+        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]      ; 3 * row[0] (low half)
+        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]      ; 3 * row[0] (high half)
+
+        pcmpeqb mm7,mm7                         ; mm7=(all 1's)
+        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mask for the lowest word
+
+        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
+        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
+        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
+        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
+
+        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
+        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+        pand    mm1,mm7                 ; mm1=( 0 - - -)
+        pand    mm2,mm7                 ; mm2=( 0 - - -)
+
+        movq    MMWORD [wk(0)], mm1     ; upper row: column 0 stands in for -1
+        movq    MMWORD [wk(1)], mm2     ; lower row: column 0 stands in for -1
+
+        poppic  ebx                     ; (macro) counterpart of pushpic above
+
+        add     eax, byte SIZEOF_MMWORD-1
+        and     eax, byte -SIZEOF_MMWORD        ; round colctr up to whole blocks
+        cmp     eax, byte SIZEOF_MMWORD
+        ja      short .columnloop               ; more than one block in this row
+        alignx  16,7
+
+.columnloop_last:
+        ; -- process the last column block
+
+        pushpic ebx                     ; (macro) park inptr0 while ebx holds the GOT
+        movpic  ebx, POINTER [gotptr]   ; load GOT address
+
+        pcmpeqb mm1,mm1                         ; mm1=(all 1's)
+        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mask for the highest word
+        movq    mm2,mm1
+
+        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
+        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)
+
+        movq    MMWORD [wk(2)], mm1     ; upper row: column 7 stands in for 8
+        movq    MMWORD [wk(3)], mm2     ; lower row: column 7 stands in for 8
+
+        jmp     short .upsample
+        alignx  16,7
+
+.columnloop:
+        ; -- process the next column block
+
+        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
+        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
+        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]
+
+        pushpic ebx                     ; (macro) park inptr0 while ebx holds the GOT
+        movpic  ebx, POINTER [gotptr]   ; load GOT address
+
+        pxor      mm3,mm3               ; mm3=(all 0's)
+        movq      mm4,mm0
+        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
+        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
+        movq      mm5,mm1
+        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
+        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
+        movq      mm6,mm2
+        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
+        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)
+
+        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]      ; 3 * row[0] (low half)
+        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]      ; 3 * row[0] (high half)
+
+        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
+        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
+        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
+        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
+
+        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
+        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
+        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
+        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
+        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
+
+        movq    MMWORD [wk(2)], mm1     ; upper row: next block's column 0 as 8
+        movq    MMWORD [wk(3)], mm2     ; lower row: next block's column 0 as 8
+
+.upsample:
+        ; -- process the upper row
+
+        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
+        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)
+
+        movq    mm0,mm7
+        movq    mm4,mm3
+        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
+        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
+        movq    mm5,mm7
+        movq    mm6,mm3
+        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
+        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)
+
+        por     mm0,mm4                         ; mm0=( 1 2 3 4)
+        por     mm5,mm6                         ; mm5=( 3 4 5 6)
+
+        movq    mm1,mm7
+        movq    mm2,mm3
+        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
+        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
+        movq    mm4,mm3
+        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
+
+        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
+        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)
+
+        movq    MMWORD [wk(0)], mm4             ; column 7 is the next block's -1
+
+        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]      ; 3 * Int0L
+        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]      ; 3 * Int0H
+        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]      ; left neighbour + 8
+        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
+        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]      ; right neighbour + 7
+        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]
+
+        paddw   mm1,mm7
+        paddw   mm5,mm3
+        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
+        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
+        paddw   mm0,mm7
+        paddw   mm2,mm3
+        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
+        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)
+
+        psllw   mm0,BYTE_BIT            ; odd outputs -> high byte of each word
+        psllw   mm2,BYTE_BIT
+        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; final samples replace
+        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the saved intermediates
+
+        ; -- process the lower row
+
+        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
+        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)
+
+        movq    mm7,mm6
+        movq    mm3,mm4
+        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
+        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
+        movq    mm0,mm6
+        movq    mm2,mm4
+        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
+        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)
+
+        por     mm7,mm3                         ; mm7=( 1 2 3 4)
+        por     mm0,mm2                         ; mm0=( 3 4 5 6)
+
+        movq    mm1,mm6
+        movq    mm5,mm4
+        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
+        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
+        movq    mm3,mm4
+        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
+
+        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
+        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)
+
+        movq    MMWORD [wk(1)], mm3             ; column 7 is the next block's -1
+
+        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]      ; 3 * Int1L
+        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]      ; 3 * Int1H
+        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]      ; left neighbour + 8
+        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
+        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]      ; right neighbour + 7
+        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]
+
+        paddw   mm1,mm6
+        paddw   mm0,mm4
+        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
+        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
+        paddw   mm7,mm6
+        paddw   mm5,mm4
+        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
+        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)
+
+        psllw   mm7,BYTE_BIT            ; odd outputs -> high byte of each word
+        psllw   mm5,BYTE_BIT
+        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1       ; final samples replace
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0       ; the saved intermediates
+
+        poppic  ebx                     ; (macro) counterpart of the pushpic above
+
+        sub     eax, byte SIZEOF_MMWORD         ; one input block consumed
+        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
+        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
+        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
+        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
+        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
+        cmp     eax, byte SIZEOF_MMWORD
+        ja      near .columnloop                ; more than one block left
+        test    eax,eax
+        jnz     near .columnloop_last           ; exactly one block left
+
+        pop     esi                             ; restore row cursors and counters
+        pop     edi
+        pop     ecx
+        pop     eax
+
+        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
+        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
+        sub     ecx, byte 2                     ; rowctr
+        jg      near .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY *output_data_ptr);
+; Each input byte is written twice; the width is rounded up to 2 MMWORDs.
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define output_width(b)         (b)+12          ; JDIMENSION output_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     edx, JDIMENSION [output_width(ebp)]
+        add     edx, byte (2*SIZEOF_MMWORD)-1
+        and     edx, byte -(2*SIZEOF_MMWORD)    ; edx = width rounded up
+        jz      short .return                   ; zero width: nothing to do
+
+        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
+        test    ecx,ecx
+        jz      short .return                   ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(ebp)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data
+        alignx  16,7
+.rowloop:
+        push    edi                             ; save output_data row cursor
+        push    esi                             ; save input_data row cursor
+
+        mov     esi, JSAMPROW [esi]             ; inptr
+        mov     edi, JSAMPROW [edi]             ; outptr
+        mov     eax,edx                         ; colctr
+        alignx  16,7
+.columnloop:
+
+        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]       ; one block of input
+
+        movq      mm1,mm0
+        punpcklbw mm0,mm0               ; low half, every byte doubled
+        punpckhbw mm1,mm1               ; high half, every byte doubled
+
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+        sub     eax, byte 2*SIZEOF_MMWORD       ; two output blocks written
+        jz      short .nextrow
+
+        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; second block of input
+
+        movq      mm3,mm2
+        punpcklbw mm2,mm2               ; low half, every byte doubled
+        punpckhbw mm3,mm3               ; high half, every byte doubled
+
+        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
+        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+        sub     eax, byte 2*SIZEOF_MMWORD       ; two more output blocks written
+        jz      short .nextrow
+
+        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
+        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
+        jmp     short .columnloop
+        alignx  16,7
+
+.nextrow:
+        pop     esi                             ; restore the row cursors
+        pop     edi
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_data
+        add     edi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     ecx                             ; rowctr
+        jg      short .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY *output_data_ptr);
+; Each input row is byte-doubled and stored to two consecutive output rows.
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define output_width(b)         (b)+12          ; JDIMENSION output_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+        push    ebp
+        mov     ebp,esp
+        push    ebx             ; ebx is used below as outptr0
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     edx, JDIMENSION [output_width(ebp)]
+        add     edx, byte (2*SIZEOF_MMWORD)-1
+        and     edx, byte -(2*SIZEOF_MMWORD)    ; edx = width rounded up
+        jz      near .return                    ; zero width: nothing to do
+
+        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
+        test    ecx,ecx
+        jz      short .return                   ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(ebp)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data
+        alignx  16,7
+.rowloop:
+        push    edi                                     ; save output_data row cursor
+        push    esi                                     ; save input_data row cursor
+
+        mov     esi, JSAMPROW [esi]                     ; inptr
+        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
+        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
+        mov     eax,edx                                 ; colctr
+        alignx  16,7
+.columnloop:
+
+        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]       ; one block of input
+
+        movq      mm1,mm0
+        punpcklbw mm0,mm0               ; low half, every byte doubled
+        punpckhbw mm1,mm1               ; high half, every byte doubled
+
+        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0       ; same data to both rows
+        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
+        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+        sub     eax, byte 2*SIZEOF_MMWORD       ; two output blocks per row written
+        jz      short .nextrow
+
+        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; second block of input
+
+        movq      mm3,mm2
+        punpcklbw mm2,mm2               ; low half, every byte doubled
+        punpckhbw mm3,mm3               ; high half, every byte doubled
+
+        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2       ; same data to both rows
+        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
+        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+        sub     eax, byte 2*SIZEOF_MMWORD       ; two more output blocks written
+        jz      short .nextrow
+
+        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
+        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
+        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
+        jmp     short .columnloop
+        alignx  16,7
+
+.nextrow:
+        pop     esi                             ; restore the row cursors
+        pop     edi
+
+        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
+        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
+        sub     ecx, byte 2                     ; rowctr
+        jg      short .rowloop
+
+        emms            ; empty MMX state
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdsample-sse2-64.asm b/simd/jdsample-sse2-64.asm
new file mode 100644
index 0000000..3aec69f
--- /dev/null
+++ b/simd/jdsample-sse2-64.asm
@@ -0,0 +1,671 @@
+;
+; jdsample.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE          times 8 dw  1       ; h2v1 fancy: rounding term added to the left-neighbour sum (>>2)
+PW_TWO          times 8 dw  2       ; h2v1 fancy: rounding term added to the right-neighbour sum (>>2)
+PW_THREE        times 8 dw  3       ; weight applied to the nearer sample/row in both fancy upsamplers
+PW_SEVEN        times 8 dw  7       ; h2v2 fancy: rounding term added to the right-neighbour sum (>>4)
+PW_EIGHT        times 8 dw  8       ; h2v2 fancy: rounding term added to the left-neighbour sum (>>4)
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+        push    rbp                     ; prologue of the 2:1 horizontal triangle-filter upsampler
+        mov     rax,rsp                 ; NOTE(review): presumably consumed by collect_args (jsimdext.inc) - confirm
+        mov     rbp,rsp
+        collect_args                    ; macro from jsimdext.inc; arguments are then read from r10-r13
+
+        mov     eax, r11d  ; colctr = downsampled_width (32-bit mov zero-extends into rax)
+        test    rax,rax
+        jz      near .return            ; zero width: nothing to do
+
+        mov     rcx, r10        ; rowctr = max_v_samp_factor
+        test    rcx,rcx
+        jz      near .return            ; zero rows: nothing to do
+
+        mov     rsi, r12        ; input_data
+        mov     rdi, r13
+        mov     rdi, JSAMPARRAY [rdi]                   ; output_data = *output_data_ptr
+.rowloop:
+        push    rax                     ; colctr (restored by the pop at the end of the row)
+        push    rdi                     ; save the output_data row cursor
+        push    rsi                     ; save the input_data row cursor
+
+        mov     rsi, JSAMPROW [rsi]     ; inptr
+        mov     rdi, JSAMPROW [rdi]     ; outptr
+
+        test    rax, SIZEOF_XMMWORD-1
+        jz      short .skip             ; width is a whole number of XMM blocks: no padding needed
+        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]        ; dl = last real sample of the row
+        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample (copy of the last one) past the row end
+.skip:
+        pxor    xmm0,xmm0               ; xmm0=(all 0's)
+        pcmpeqb xmm7,xmm7               ; xmm7=(all 1's)
+        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=mask selecting the lowest byte only
+        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- ... --): sample 0 stands in for sample -1
+
+        add     rax, byte SIZEOF_XMMWORD-1
+        and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a multiple of SIZEOF_XMMWORD
+        cmp     rax, byte SIZEOF_XMMWORD
+        ja      short .columnloop               ; more than one block left: use the look-ahead path
+
+.columnloop_last:
+        pcmpeqb xmm6,xmm6               ; xmm6=(all 1's)
+        pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=mask selecting the highest byte only
+        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm6=(-- ... -- 15): sample 15 stands in for sample 16
+        jmp     short .upsample
+
+.columnloop:
+        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; peek at the next input block
+        pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=(-- ... -- 16): its first sample moved to the top byte
+
+.upsample:
+        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; current input block (movdqa needs an aligned address)
+        movdqa  xmm2,xmm1
+        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
+        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
+        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
+
+        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
+        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
+
+        movdqa  xmm7,xmm1               ; keep sample 15 as the next block's left neighbour
+        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+        movdqa    xmm4,xmm1
+        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
+        movdqa    xmm5,xmm2
+        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
+        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
+        movdqa    xmm6,xmm3
+        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
+        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
+
+        pmullw  xmm1,[rel PW_THREE]     ; xmm1=3*cur( 0 ..  7)
+        pmullw  xmm4,[rel PW_THREE]     ; xmm4=3*cur( 8 .. 15)
+        paddw   xmm2,[rel PW_ONE]       ; left neighbours + 1
+        paddw   xmm5,[rel PW_ONE]
+        paddw   xmm3,[rel PW_TWO]       ; right neighbours + 2
+        paddw   xmm6,[rel PW_TWO]
+
+        paddw   xmm2,xmm1               ; 3*cur + left + 1
+        paddw   xmm5,xmm4
+        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+        paddw   xmm3,xmm1               ; 3*cur + right + 2
+        paddw   xmm6,xmm4
+        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+        psllw   xmm3,BYTE_BIT           ; odd outputs -> upper byte of each word
+        psllw   xmm6,BYTE_BIT
+        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+        sub     rax, byte SIZEOF_XMMWORD        ; one input block consumed
+        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
+        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
+        cmp     rax, byte SIZEOF_XMMWORD
+        ja      near .columnloop                ; more than one block left
+        test    eax,eax
+        jnz     near .columnloop_last           ; exactly one block left
+
+        pop     rsi                             ; input_data row cursor
+        pop     rdi                             ; output_data row cursor
+        pop     rax                             ; colctr
+
+        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
+        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     rcx                             ; rowctr
+        jg      near .rowloop
+
+.return:
+        uncollect_args                  ; macro from jsimdext.inc; counterpart of collect_args
+        pop     rbp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          4
+
+        align   16
+        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = rsp after the push (points at the saved rbp)
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; stash the unaligned rsp at [aligned rbp]
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; reserve WK_NUM xmmwords of scratch below rbp
+        collect_args                            ; macro from jsimdext.inc; arguments are then read from r10-r13
+        push    rbx
+
+        mov     eax, r11d  ; colctr = downsampled_width (32-bit mov zero-extends into rax)
+        test    rax,rax
+        jz      near .return                    ; zero width: nothing to do
+
+        mov     rcx, r10        ; rowctr = max_v_samp_factor
+        test    rcx,rcx
+        jz      near .return                    ; zero rows: nothing to do
+
+        mov     rsi, r12        ; input_data
+        mov     rdi, r13
+        mov     rdi, JSAMPARRAY [rdi]                   ; output_data = *output_data_ptr
+.rowloop:
+        push    rax                                     ; colctr
+        push    rcx                                     ; rowctr (rcx is reused as a row pointer below)
+        push    rdi                                     ; output_data row cursor
+        push    rsi                                     ; input_data row cursor
+
+        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
+        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
+        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
+        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
+        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
+
+        test    rax, SIZEOF_XMMWORD-1
+        jz      short .skip                     ; width is a whole number of XMM blocks: no padding needed
+        push    rdx                             ; dl is used as scratch below; preserve outptr0
+        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl    ; pad the row above with a copy of its last sample
+        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl    ; pad the current row the same way
+        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+        pop     rdx
+.skip:
+        ; -- process the first column block
+
+        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
+        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
+        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
+
+        pxor      xmm3,xmm3             ; xmm3=(all 0's)
+        movdqa    xmm4,xmm0
+        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm5,xmm1
+        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm6,xmm2
+        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+        pmullw  xmm0,[rel PW_THREE]     ; 3*row[0], low half
+        pmullw  xmm4,[rel PW_THREE]     ; 3*row[0], high half
+
+        pcmpeqb xmm7,xmm7               ; xmm7=(all 1's)
+        psrldq  xmm7,(SIZEOF_XMMWORD-2) ; xmm7=mask selecting the lowest word only
+
+        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
+        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
+        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2    ; (output rows double as word-sized scratch)
+        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
+        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
+
+        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=left neighbour (-1) for the upper row: column 0 itself
+        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=left neighbour (-1) for the lower row: column 0 itself
+
+        add     rax, byte SIZEOF_XMMWORD-1
+        and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a multiple of SIZEOF_XMMWORD
+        cmp     rax, byte SIZEOF_XMMWORD
+        ja      short .columnloop               ; more than one block left: use the look-ahead path
+
+.columnloop_last:
+        ; -- process the last column block
+
+        pcmpeqb xmm1,xmm1               ; xmm1=(all 1's)
+        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=mask selecting the highest word only
+        movdqa  xmm2,xmm1
+
+        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]    ; take column 15 of Int0H
+        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]    ; take column 15 of Int1H
+
+        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+        jmp     near .upsample
+
+.columnloop:
+        ; -- process the next column block
+
+        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
+        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
+        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
+
+        pxor      xmm3,xmm3             ; xmm3=(all 0's)
+        movdqa    xmm4,xmm0
+        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm5,xmm1
+        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm6,xmm2
+        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+        pmullw  xmm0,[rel PW_THREE]     ; 3*row[0], low half
+        pmullw  xmm4,[rel PW_THREE]     ; 3*row[0], high half
+
+        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
+        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
+        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2    ; (slots of the NEXT output block)
+        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
+        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
+
+        movdqa  XMMWORD [wk(2)], xmm1   ; wk(2)=right neighbour (16) for the upper row
+        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=right neighbour (16) for the lower row
+
+.upsample:
+        ; -- process the upper row
+
+        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]    ; reload the saved intermediates
+        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
+        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
+        movdqa  xmm5,xmm7
+        movdqa  xmm6,xmm3
+        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
+
+        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
+        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
+
+        movdqa  xmm1,xmm7
+        movdqa  xmm2,xmm3
+        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
+        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
+        movdqa  xmm4,xmm3
+        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
+        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; column 15 becomes the next block's left neighbour
+
+        pmullw  xmm7,[rel PW_THREE]     ; 3*Int0L
+        pmullw  xmm3,[rel PW_THREE]     ; 3*Int0H
+        paddw   xmm1,[rel PW_EIGHT]     ; left neighbours + 8
+        paddw   xmm5,[rel PW_EIGHT]
+        paddw   xmm0,[rel PW_SEVEN]     ; right neighbours + 7
+        paddw   xmm2,[rel PW_SEVEN]
+
+        paddw   xmm1,xmm7               ; 3*cur + left + 8
+        paddw   xmm5,xmm3
+        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+        paddw   xmm0,xmm7               ; 3*cur + right + 7
+        paddw   xmm2,xmm3
+        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+        psllw   xmm0,BYTE_BIT           ; odd outputs -> upper byte of each word
+        psllw   xmm2,BYTE_BIT
+        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; final samples overwrite the scratch intermediates
+        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+        ; -- process the lower row
+
+        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]    ; reload the saved intermediates
+        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
+        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
+        movdqa  xmm0,xmm6
+        movdqa  xmm2,xmm4
+        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
+
+        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
+        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
+
+        movdqa  xmm1,xmm6
+        movdqa  xmm5,xmm4
+        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
+        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
+        movdqa  xmm3,xmm4
+        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
+        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
+
+        movdqa  XMMWORD [wk(1)], xmm3   ; column 15 becomes the next block's left neighbour
+
+        pmullw  xmm6,[rel PW_THREE]     ; 3*Int1L
+        pmullw  xmm4,[rel PW_THREE]     ; 3*Int1H
+        paddw   xmm1,[rel PW_EIGHT]     ; left neighbours + 8
+        paddw   xmm0,[rel PW_EIGHT]
+        paddw   xmm7,[rel PW_SEVEN]     ; right neighbours + 7
+        paddw   xmm5,[rel PW_SEVEN]
+
+        paddw   xmm1,xmm6               ; 3*cur + left + 8
+        paddw   xmm0,xmm4
+        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+        paddw   xmm7,xmm6               ; 3*cur + right + 7
+        paddw   xmm5,xmm4
+        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+        psllw   xmm7,BYTE_BIT           ; odd outputs -> upper byte of each word
+        psllw   xmm5,BYTE_BIT
+        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1    ; final samples overwrite the scratch intermediates
+        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+        sub     rax, byte SIZEOF_XMMWORD        ; one input block consumed
+        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
+        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
+        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
+        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
+        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
+        cmp     rax, byte SIZEOF_XMMWORD
+        ja      near .columnloop                ; more than one block left
+        test    rax,rax
+        jnz     near .columnloop_last           ; exactly one block left
+
+        pop     rsi                             ; input_data row cursor
+        pop     rdi                             ; output_data row cursor
+        pop     rcx                             ; rowctr
+        pop     rax                             ; colctr
+
+        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
+        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
+        sub     rcx, byte 2                     ; rowctr
+        jg      near .rowloop
+
+.return:
+        pop     rbx
+        uncollect_args                  ; macro from jsimdext.inc; counterpart of collect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- pre-alignment rsp stashed in the prologue (points at the saved rbp)
+        pop     rbp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+        push    rbp                     ; prologue of the 2:1 horizontal sample-replicating upsampler
+        mov     rax,rsp                 ; NOTE(review): presumably consumed by collect_args (jsimdext.inc) - confirm
+        mov     rbp,rsp
+        collect_args                    ; macro from jsimdext.inc; arguments are then read from r10-r13
+
+        mov     edx, r11d               ; output_width (32-bit mov zero-extends into rdx)
+        add     rdx, byte (2*SIZEOF_XMMWORD)-1
+        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to a multiple of 2*SIZEOF_XMMWORD
+        jz      near .return            ; zero width: nothing to do
+
+        mov     rcx, r10        ; rowctr = max_v_samp_factor
+        test    rcx,rcx
+        jz      short .return           ; zero rows: nothing to do
+
+        mov     rsi, r12 ; input_data
+        mov     rdi, r13
+        mov     rdi, JSAMPARRAY [rdi]                   ; output_data = *output_data_ptr
+.rowloop:
+        push    rdi                     ; save the output_data row cursor
+        push    rsi                     ; save the input_data row cursor
+
+        mov     rsi, JSAMPROW [rsi]             ; inptr
+        mov     rdi, JSAMPROW [rdi]             ; outptr
+        mov     rax,rdx                         ; colctr (counts output samples)
+.columnloop:
+
+        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; load one block of input samples
+
+        movdqa    xmm1,xmm0
+        punpcklbw xmm0,xmm0             ; duplicate each byte of the low half
+        punpckhbw xmm1,xmm1             ; duplicate each byte of the high half
+
+        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+        sub     rax, byte 2*SIZEOF_XMMWORD      ; two output blocks written
+        jz      short .nextrow
+
+        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; second input block of this unrolled iteration
+
+        movdqa    xmm3,xmm2
+        punpcklbw xmm2,xmm2             ; duplicate each byte of the low half
+        punpckhbw xmm3,xmm3             ; duplicate each byte of the high half
+
+        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+        sub     rax, byte 2*SIZEOF_XMMWORD      ; two more output blocks written
+        jz      short .nextrow
+
+        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
+        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
+        jmp     short .columnloop
+
+.nextrow:
+        pop     rsi                             ; input_data row cursor
+        pop     rdi                             ; output_data row cursor
+
+        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
+        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     rcx                             ; rowctr
+        jg      short .rowloop
+
+.return:
+        uncollect_args                  ; macro from jsimdext.inc; counterpart of collect_args
+        pop     rbp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+        push    rbp                     ; prologue of the 2:1 horizontal / 2:1 vertical sample-replicating upsampler
+        mov     rax,rsp                 ; NOTE(review): presumably consumed by collect_args (jsimdext.inc) - confirm
+        mov     rbp,rsp
+        collect_args                    ; macro from jsimdext.inc; arguments are then read from r10-r13
+        push    rbx                     ; rbx holds outptr0 below; restored at .return
+
+        mov     edx, r11d               ; output_width (32-bit mov zero-extends into rdx)
+        add     rdx, byte (2*SIZEOF_XMMWORD)-1
+        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to a multiple of 2*SIZEOF_XMMWORD
+        jz      near .return            ; zero width: nothing to do
+
+        mov     rcx, r10        ; rowctr = max_v_samp_factor
+        test    rcx,rcx
+        jz      near .return            ; zero rows: nothing to do
+
+        mov     rsi, r12        ; input_data
+        mov     rdi, r13
+        mov     rdi, JSAMPARRAY [rdi]                   ; output_data = *output_data_ptr
+.rowloop:
+        push    rdi                     ; save the output_data row cursor
+        push    rsi                     ; save the input_data row cursor
+
+        mov     rsi, JSAMPROW [rsi]                     ; inptr
+        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
+        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
+        mov     rax,rdx                                 ; colctr (counts output samples)
+.columnloop:
+
+        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; load one block of input samples
+
+        movdqa    xmm1,xmm0
+        punpcklbw xmm0,xmm0             ; duplicate each byte of the low half
+        punpckhbw xmm1,xmm1             ; duplicate each byte of the high half
+
+        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0    ; same data goes to both output rows
+        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+        sub     rax, byte 2*SIZEOF_XMMWORD      ; two output blocks written per row
+        jz      short .nextrow
+
+        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; second input block of this unrolled iteration
+
+        movdqa    xmm3,xmm2
+        punpcklbw xmm2,xmm2             ; duplicate each byte of the low half
+        punpckhbw xmm3,xmm3             ; duplicate each byte of the high half
+
+        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2    ; same data goes to both output rows
+        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+        sub     rax, byte 2*SIZEOF_XMMWORD      ; two more output blocks written per row
+        jz      short .nextrow
+
+        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
+        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
+        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
+        jmp     short .columnloop
+
+.nextrow:
+        pop     rsi                             ; input_data row cursor
+        pop     rdi                             ; output_data row cursor
+
+        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
+        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
+        sub     rcx, byte 2                     ; rowctr
+        jg      near .rowloop
+
+.return:
+        pop     rbx
+        uncollect_args                  ; macro from jsimdext.inc; counterpart of collect_args
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdsample-sse2.asm b/simd/jdsample-sse2.asm
new file mode 100644
index 0000000..f75e594
--- /dev/null
+++ b/simd/jdsample-sse2.asm
@@ -0,0 +1,729 @@
+;
+; jdsample.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE          times 8 dw  1       ; h2v1 fancy: rounding term added to the left-neighbour sum (>>2)
+PW_TWO          times 8 dw  2       ; h2v1 fancy: rounding term added to the right-neighbour sum (>>2)
+PW_THREE        times 8 dw  3       ; weight applied to the nearer sample in the fancy upsampler
+PW_SEVEN        times 8 dw  7       ; NOTE(review): presumably an h2v2 fancy rounding term - confirm at its use site
+PW_EIGHT        times 8 dw  8       ; NOTE(review): presumably an h2v2 fancy rounding term - confirm at its use site
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+        push    ebp                     ; prologue of the 2:1 horizontal triangle-filter upsampler
+        mov     ebp,esp                 ; arguments are addressed relative to ebp via the %defines above
+        pushpic ebx             ; NOTE(review): PIC helper macro, presumably from jsimdext.inc - confirm
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+        test    eax,eax
+        jz      near .return            ; zero width: nothing to do
+
+        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
+        test    ecx,ecx
+        jz      near .return            ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(ebp)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data = *output_data_ptr
+        alignx  16,7
+.rowloop:
+        push    eax                     ; colctr (restored by the pop at the end of the row)
+        push    edi                     ; save the output_data row cursor
+        push    esi                     ; save the input_data row cursor
+
+        mov     esi, JSAMPROW [esi]     ; inptr
+        mov     edi, JSAMPROW [edi]     ; outptr
+
+        test    eax, SIZEOF_XMMWORD-1
+        jz      short .skip             ; width is a whole number of XMM blocks: no padding needed
+        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]        ; dl = last real sample of the row
+        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample (copy of the last one) past the row end
+.skip:
+        pxor    xmm0,xmm0               ; xmm0=(all 0's)
+        pcmpeqb xmm7,xmm7               ; xmm7=(all 1's)
+        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=mask selecting the lowest byte only
+        pand    xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- ... --): sample 0 stands in for sample -1
+
+        add     eax, byte SIZEOF_XMMWORD-1
+        and     eax, byte -SIZEOF_XMMWORD       ; round colctr up to a multiple of SIZEOF_XMMWORD
+        cmp     eax, byte SIZEOF_XMMWORD
+        ja      short .columnloop               ; more than one block left: use the look-ahead path
+        alignx  16,7
+
+.columnloop_last:
+        pcmpeqb xmm6,xmm6               ; xmm6=(all 1's)
+        pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=mask selecting the highest byte only
+        pand    xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm6=(-- ... -- 15): sample 15 stands in for sample 16
+        jmp     short .upsample
+        alignx  16,7
+
+.columnloop:
+        movdqa  xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; peek at the next input block
+        pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=(-- ... -- 16): its first sample moved to the top byte
+
+.upsample:
+        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; current input block (movdqa needs an aligned address)
+        movdqa  xmm2,xmm1
+        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
+        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
+        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
+
+        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
+        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
+
+        movdqa  xmm7,xmm1               ; keep sample 15 as the next block's left neighbour
+        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+        movdqa    xmm4,xmm1
+        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
+        movdqa    xmm5,xmm2
+        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
+        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
+        movdqa    xmm6,xmm3
+        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
+        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
+
+        pmullw  xmm1,[GOTOFF(ebx,PW_THREE)]     ; xmm1=3*cur( 0 ..  7)
+        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]     ; xmm4=3*cur( 8 .. 15)
+        paddw   xmm2,[GOTOFF(ebx,PW_ONE)]       ; left neighbours + 1
+        paddw   xmm5,[GOTOFF(ebx,PW_ONE)]
+        paddw   xmm3,[GOTOFF(ebx,PW_TWO)]       ; right neighbours + 2
+        paddw   xmm6,[GOTOFF(ebx,PW_TWO)]
+
+        paddw   xmm2,xmm1               ; 3*cur + left + 1
+        paddw   xmm5,xmm4
+        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+        paddw   xmm3,xmm1               ; 3*cur + right + 2
+        paddw   xmm6,xmm4
+        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+        psllw   xmm3,BYTE_BIT           ; odd outputs -> upper byte of each word
+        psllw   xmm6,BYTE_BIT
+        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+        sub     eax, byte SIZEOF_XMMWORD        ; one input block consumed
+        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr
+        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
+        cmp     eax, byte SIZEOF_XMMWORD
+        ja      near .columnloop                ; more than one block left
+        test    eax,eax
+        jnz     near .columnloop_last           ; exactly one block left
+
+        pop     esi                             ; input_data row cursor
+        pop     edi                             ; output_data row cursor
+        pop     eax                             ; colctr
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_data
+        add     edi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     ecx                             ; rowctr
+        jg      near .rowloop
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        poppic  ebx             ; counterpart of pushpic in the prologue
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+; Each .rowloop pass reads input rows -1/0/+1 and writes two output rows.
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY *output_data_ptr);
+; The output rows also serve as scratch for the 16-bit intermediate sums.
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp    ebp+0                   ; slot holding the unaligned frame pointer
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          4                       ; wk(0),wk(1)=left neighbors; wk(2),wk(3)=right neighbors
+%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
+
+        align   16
+        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4                     ; room for the value saved below
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; kept at [original_ebp]
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below ebp
+        pushpic eax             ; make room for GOT address
+        push    ebx                             ; restored at .return
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx                     ; get GOT address
+        movpic  POINTER [gotptr], ebx   ; save GOT address
+
+        mov     edx,eax                         ; edx = original ebp
+        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+        test    eax,eax
+        jz      near .return                    ; zero width: nothing to do
+
+        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
+        test    ecx,ecx
+        jz      near .return                    ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(edx)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data
+        alignx  16,7
+.rowloop:
+        push    eax                                     ; colctr
+        push    ecx                                     ; rowctr
+        push    edi                                     ; output_data
+        push    esi                                     ; input_data
+
+        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
+        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
+        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
+
+        test    eax, SIZEOF_XMMWORD-1           ; is colctr a multiple of SIZEOF_XMMWORD?
+        jz      short .skip                     ; yes: no dummy sample needed
+        push    edx                             ; dl is used as scratch below
+        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]        ; last sample, row above
+        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl            ; copy it one past the end
+        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]        ; last sample, current row
+        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl            ; copy it one past the end
+        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]        ; last sample, row below
+        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+        pop     edx                             ; restore outptr0
+.skip:
+        ; -- process the first column block
+
+        movdqa  xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
+        movdqa  xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
+        movdqa  xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
+
+        pushpic ebx                     ; (PIC) save inptr0; ebx is reused below
+        movpic  ebx, POINTER [gotptr]   ; load GOT address
+
+        pxor      xmm3,xmm3             ; xmm3=(all 0's)
+        movdqa    xmm4,xmm0
+        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm5,xmm1
+        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm6,xmm2
+        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]     ; xmm0=row[ 0]( 0.. 7)*PW_THREE
+        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]     ; xmm4=row[ 0]( 8..15)*PW_THREE
+
+        pcmpeqb xmm7,xmm7                       ; xmm7=(all 1's)
+        psrldq  xmm7,(SIZEOF_XMMWORD-2)         ; xmm7=mask keeping only word 0
+
+        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
+        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
+        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2    ; (same for the lower
+        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6    ; output row)
+
+        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
+        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
+
+        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=upper-row sample 0, used as its own left neighbor (-1)
+        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=lower-row sample 0, used as its own left neighbor (-1)
+
+        poppic  ebx                     ; (PIC) restore inptr0
+
+        add     eax, byte SIZEOF_XMMWORD-1      ; round colctr up to a
+        and     eax, byte -SIZEOF_XMMWORD       ; multiple of SIZEOF_XMMWORD
+        cmp     eax, byte SIZEOF_XMMWORD
+        ja      short .columnloop               ; more than one block to do
+        alignx  16,7
+
+.columnloop_last:
+        ; -- process the last column block
+
+        pushpic ebx                     ; (PIC) popped again at the end of .upsample
+        movpic  ebx, POINTER [gotptr]   ; load GOT address
+
+        pcmpeqb xmm1,xmm1                       ; xmm1=(all 1's)
+        pslldq  xmm1,(SIZEOF_XMMWORD-2)         ; xmm1=mask keeping only word 7
+        movdqa  xmm2,xmm1                       ; same mask for the lower row
+
+        pand    xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]    ; top word of saved Int0H
+        pand    xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]    ; top word of saved Int1H
+
+        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+        jmp     near .upsample          ; no further block to load
+        alignx  16,7
+
+.columnloop:
+        ; -- process the next column block
+
+        movdqa  xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
+        movdqa  xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
+        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
+
+        pushpic ebx                     ; (PIC) popped again at the end of .upsample
+        movpic  ebx, POINTER [gotptr]   ; load GOT address
+
+        pxor      xmm3,xmm3             ; xmm3=(all 0's)
+        movdqa    xmm4,xmm0
+        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm5,xmm1
+        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+        movdqa    xmm6,xmm2
+        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]     ; xmm0=row[ 0]( 0.. 7)*PW_THREE
+        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]     ; xmm4=row[ 0]( 8..15)*PW_THREE
+
+        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+        movdqa  XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
+        movdqa  XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
+        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2    ; (same for the lower
+        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6    ; output row)
+
+        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
+        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
+
+        movdqa  XMMWORD [wk(2)], xmm1   ; wk(2)=upper-row right neighbor (16) of the current block
+        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=lower-row right neighbor (16) of the current block
+
+.upsample:
+        ; -- process the upper row
+
+        movdqa  xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]    ; reload saved Int0L
+        movdqa  xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]    ; reload saved Int0H
+
+        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
+        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
+        movdqa  xmm5,xmm7
+        movdqa  xmm6,xmm3
+        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
+
+        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
+        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
+
+        movdqa  xmm1,xmm7
+        movdqa  xmm2,xmm3
+        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
+        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
+        movdqa  xmm4,xmm3
+        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
+        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; sample 15 becomes (-1) for the next block
+
+        pmullw  xmm7,[GOTOFF(ebx,PW_THREE)]     ; xmm7=Int0L*PW_THREE
+        pmullw  xmm3,[GOTOFF(ebx,PW_THREE)]     ; xmm3=Int0H*PW_THREE
+        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]     ; left neighbors + PW_EIGHT
+        paddw   xmm5,[GOTOFF(ebx,PW_EIGHT)]     ; (feeds the even outputs)
+        paddw   xmm0,[GOTOFF(ebx,PW_SEVEN)]     ; right neighbors + PW_SEVEN
+        paddw   xmm2,[GOTOFF(ebx,PW_SEVEN)]     ; (feeds the odd outputs)
+
+        paddw   xmm1,xmm7
+        paddw   xmm5,xmm3
+        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+        paddw   xmm0,xmm7
+        paddw   xmm2,xmm3
+        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+        psllw   xmm0,BYTE_BIT           ; odd samples go to the high byte of each word
+        psllw   xmm2,BYTE_BIT           ; odd samples go to the high byte of each word
+        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; final samples replace
+        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
+
+        ; -- process the lower row
+
+        movdqa  xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]    ; reload saved Int1L
+        movdqa  xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]    ; reload saved Int1H
+
+        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
+        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
+        movdqa  xmm0,xmm6
+        movdqa  xmm2,xmm4
+        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
+
+        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
+        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
+
+        movdqa  xmm1,xmm6
+        movdqa  xmm5,xmm4
+        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
+        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
+        movdqa  xmm3,xmm4
+        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
+        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
+
+        movdqa  XMMWORD [wk(1)], xmm3   ; sample 15 becomes (-1) for the next block
+
+        pmullw  xmm6,[GOTOFF(ebx,PW_THREE)]     ; xmm6=Int1L*PW_THREE
+        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]     ; xmm4=Int1H*PW_THREE
+        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]     ; left neighbors + PW_EIGHT
+        paddw   xmm0,[GOTOFF(ebx,PW_EIGHT)]     ; (feeds the even outputs)
+        paddw   xmm7,[GOTOFF(ebx,PW_SEVEN)]     ; right neighbors + PW_SEVEN
+        paddw   xmm5,[GOTOFF(ebx,PW_SEVEN)]     ; (feeds the odd outputs)
+
+        paddw   xmm1,xmm6
+        paddw   xmm0,xmm4
+        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+        paddw   xmm7,xmm6
+        paddw   xmm5,xmm4
+        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+        psllw   xmm7,BYTE_BIT           ; odd samples go to the high byte of each word
+        psllw   xmm5,BYTE_BIT           ; odd samples go to the high byte of each word
+        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1    ; final samples replace
+        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0    ; the intermediate data
+
+        poppic  ebx                     ; (PIC) restore inptr0
+
+        sub     eax, byte SIZEOF_XMMWORD        ; one input block consumed
+        add     ecx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
+        add     ebx, byte 1*SIZEOF_XMMWORD      ; inptr0
+        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
+        add     edx, byte 2*SIZEOF_XMMWORD      ; outptr0
+        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr1
+        cmp     eax, byte SIZEOF_XMMWORD
+        ja      near .columnloop                ; more than one block remains
+        test    eax,eax
+        jnz     near .columnloop_last           ; exactly one block remains
+
+        pop     esi                             ; input_data
+        pop     edi                             ; output_data
+        pop     ecx                             ; rowctr
+        pop     eax                             ; colctr
+
+        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
+        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
+        sub     ecx, byte 2                     ; rowctr
+        jg      near .rowloop                   ; loop while rowctr > 0
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+; Each input sample is stored twice in a row; one output row per input row.
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY *output_data_ptr);
+; Stores cover output_width rounded up to a multiple of 2*SIZEOF_XMMWORD.
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define output_width(b)         (b)+12          ; JDIMENSION output_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     edx, JDIMENSION [output_width(ebp)]
+        add     edx, byte (2*SIZEOF_XMMWORD)-1  ; round output_width up to a
+        and     edx, byte -(2*SIZEOF_XMMWORD)   ; multiple of 2*SIZEOF_XMMWORD
+        jz      short .return                   ; rounded width is 0 (flags from AND)
+
+        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
+        test    ecx,ecx
+        jz      short .return                   ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(ebp)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data
+        alignx  16,7
+.rowloop:
+        push    edi                             ; output_data
+        push    esi                             ; input_data
+
+        mov     esi, JSAMPROW [esi]             ; inptr
+        mov     edi, JSAMPROW [edi]             ; outptr
+        mov     eax,edx                         ; colctr
+        alignx  16,7
+.columnloop:
+
+        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; first input block
+
+        movdqa    xmm1,xmm0
+        punpcklbw xmm0,xmm0             ; xmm0=low half, every byte doubled
+        punpckhbw xmm1,xmm1             ; xmm1=high half, every byte doubled
+
+        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+        sub     eax, byte 2*SIZEOF_XMMWORD      ; two output blocks written
+        jz      short .nextrow                  ; row complete
+
+        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; second input block (unrolled)
+
+        movdqa    xmm3,xmm2
+        punpcklbw xmm2,xmm2             ; xmm2=low half, every byte doubled
+        punpckhbw xmm3,xmm3             ; xmm3=high half, every byte doubled
+
+        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+        sub     eax, byte 2*SIZEOF_XMMWORD      ; two more output blocks written
+        jz      short .nextrow                  ; row complete
+
+        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
+        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr
+        jmp     short .columnloop
+        alignx  16,7
+
+.nextrow:
+        pop     esi                             ; input_data
+        pop     edi                             ; output_data
+
+        add     esi, byte SIZEOF_JSAMPROW       ; input_data
+        add     edi, byte SIZEOF_JSAMPROW       ; output_data
+        dec     ecx                             ; rowctr
+        jg      short .rowloop                  ; loop while rowctr > 0
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+; Each input sample is stored twice in a row, in two consecutive output rows.
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY *output_data_ptr);
+; Stores cover output_width rounded up to a multiple of 2*SIZEOF_XMMWORD.
+
+%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
+%define output_width(b)         (b)+12          ; JDIMENSION output_width
+%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
+%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
+
+        align   16
+        global  EXTN(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+        push    ebp
+        mov     ebp,esp
+        push    ebx                     ; ebx is used as outptr0 below
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     edx, JDIMENSION [output_width(ebp)]
+        add     edx, byte (2*SIZEOF_XMMWORD)-1  ; round output_width up to a
+        and     edx, byte -(2*SIZEOF_XMMWORD)   ; multiple of 2*SIZEOF_XMMWORD
+        jz      near .return                    ; rounded width is 0 (flags from AND)
+
+        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
+        test    ecx,ecx
+        jz      near .return                    ; zero rows: nothing to do
+
+        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
+        mov     edi, POINTER [output_data_ptr(ebp)]
+        mov     edi, JSAMPARRAY [edi]                   ; output_data
+        alignx  16,7
+.rowloop:
+        push    edi                                     ; output_data
+        push    esi                                     ; input_data
+
+        mov     esi, JSAMPROW [esi]                     ; inptr
+        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
+        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
+        mov     eax,edx                                 ; colctr
+        alignx  16,7
+.columnloop:
+
+        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; first input block
+
+        movdqa    xmm1,xmm0
+        punpcklbw xmm0,xmm0             ; xmm0=low half, every byte doubled
+        punpckhbw xmm1,xmm1             ; xmm1=high half, every byte doubled
+
+        movdqa  XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0    ; same data goes to
+        movdqa  XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1    ; both output rows
+        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+        sub     eax, byte 2*SIZEOF_XMMWORD      ; two output blocks per row written
+        jz      short .nextrow                  ; rows complete
+
+        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; second input block (unrolled)
+
+        movdqa    xmm3,xmm2
+        punpcklbw xmm2,xmm2             ; xmm2=low half, every byte doubled
+        punpckhbw xmm3,xmm3             ; xmm3=high half, every byte doubled
+
+        movdqa  XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2    ; same data goes to
+        movdqa  XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3    ; both output rows
+        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+        sub     eax, byte 2*SIZEOF_XMMWORD      ; two more output blocks per row written
+        jz      short .nextrow                  ; rows complete
+
+        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
+        add     ebx, byte 4*SIZEOF_XMMWORD      ; outptr0
+        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr1
+        jmp     short .columnloop
+        alignx  16,7
+
+.nextrow:
+        pop     esi                             ; input_data
+        pop     edi                             ; output_data
+
+        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
+        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
+        sub     ecx, byte 2                     ; rowctr
+        jg      short .rowloop                  ; loop while rowctr > 0
+
+.return:
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jdsamss2-64.asm b/simd/jdsamss2-64.asm
deleted file mode 100644
index fb7dbf7..0000000
--- a/simd/jdsamss2-64.asm
+++ /dev/null
@@ -1,671 +0,0 @@
-;
-; jdsamss2-64.asm - upsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE		times 8 dw  1
-PW_TWO		times 8 dw  2
-PW_THREE	times 8 dw  3
-PW_SEVEN	times 8 dw  7
-PW_EIGHT	times 8 dw  8
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-
-	mov	eax, r11d  ; colctr
-	test	rax,rax
-	jz	near .return
-
-	mov	ecx, r10d	; rowctr
-	test	rcx,rcx
-	jz	near .return
-
-	mov	rsi, r12	; input_data
-	mov	rdi, r13
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rax			; colctr
-	push	rdi
-	push	rsi
-
-	mov	rsi, JSAMPROW [rsi]	; inptr
-	mov	rdi, JSAMPROW [rdi]	; outptr
-
-	test	rax, SIZEOF_XMMWORD-1
-	jz	short .skip
-	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-.skip:
-	pxor	xmm0,xmm0		; xmm0=(all 0's)
-	pcmpeqb	xmm7,xmm7
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)
-	pand	xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-	add	rax, byte SIZEOF_XMMWORD-1
-	and	rax, byte -SIZEOF_XMMWORD
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .columnloop
-
-.columnloop_last:
-	pcmpeqb	xmm6,xmm6
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)
-	pand	xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	jmp	short .upsample
-
-.columnloop:
-	movdqa	xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
-	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-	movdqa	xmm2,xmm1
-	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
-	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
-	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
-
-	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
-	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
-
-	movdqa	xmm7,xmm1
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
-
-	movdqa    xmm4,xmm1
-	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm2
-	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
-	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
-	movdqa    xmm6,xmm3
-	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
-	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
-
-	pmullw	xmm1,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-	paddw	xmm2,[rel PW_ONE]
-	paddw	xmm5,[rel PW_ONE]
-	paddw	xmm3,[rel PW_TWO]
-	paddw	xmm6,[rel PW_TWO]
-
-	paddw	xmm2,xmm1
-	paddw	xmm5,xmm4
-	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-	paddw	xmm3,xmm1
-	paddw	xmm6,xmm4
-	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm3,BYTE_BIT
-	psllw	xmm6,BYTE_BIT
-	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
-
-	sub	rax, byte SIZEOF_XMMWORD
-	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	near .columnloop
-	test	eax,eax
-	jnz	near .columnloop_last
-
-	pop	rsi
-	pop	rdi
-	pop	rax
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte SIZEOF_JSAMPROW	; output_data
-	dec	rcx				; rowctr
-	jg	near .rowloop
-
-.return:
-	uncollect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		4
-
-	align	16
-	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-	push	rbx
-
-	mov	rax, r11  ; colctr
-	test	rax,rax
-	jz	near .return
-
-	mov	rcx, r10	; rowctr
-	test	rcx,rcx
-	jz	near .return
-
-	mov	rsi, r12	; input_data
-	mov	rdi, r13
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rax					; colctr
-	push	rcx
-	push	rdi
-	push	rsi
-
-	mov	rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]	; inptr1(above)
-	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1(below)
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
-
-	test	rax, SIZEOF_XMMWORD-1
-	jz	short .skip
-	push	rdx
-	mov	dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-	pop	rdx
-.skip:
-	; -- process the first column block
-
-	movdqa	xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
-	movdqa	xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
-	movdqa	xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-
-	pcmpeqb	xmm7,xmm7
-	psrldq	xmm7,(SIZEOF_XMMWORD-2)
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
-
-	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
-	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
-
-	movdqa	XMMWORD [wk(0)], xmm1
-	movdqa	XMMWORD [wk(1)], xmm2
-
-	add	rax, byte SIZEOF_XMMWORD-1
-	and	rax, byte -SIZEOF_XMMWORD
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .columnloop
-
-.columnloop_last:
-	; -- process the last column block
-
-	pcmpeqb	xmm1,xmm1
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)
-	movdqa	xmm2,xmm1
-
-	pand	xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-	pand	xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
-	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
-
-	jmp	near .upsample
-
-.columnloop:
-	; -- process the next column block
-
-	movdqa	xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
-	movdqa	xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
-	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
-
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
-	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
-
-	movdqa	XMMWORD [wk(2)], xmm1
-	movdqa	XMMWORD [wk(3)], xmm2
-
-.upsample:
-	; -- process the upper row
-
-	movdqa	xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-	movdqa	xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
-	movdqa	xmm5,xmm7
-	movdqa	xmm6,xmm3
-	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
-
-	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
-	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm2,xmm3
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm4,xmm3
-	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(0)], xmm4
-
-	pmullw	xmm7,[rel PW_THREE]
-	pmullw	xmm3,[rel PW_THREE]
-	paddw	xmm1,[rel PW_EIGHT]
-	paddw	xmm5,[rel PW_EIGHT]
-	paddw	xmm0,[rel PW_SEVEN]
-	paddw	xmm2,[rel PW_SEVEN]
-
-	paddw	xmm1,xmm7
-	paddw	xmm5,xmm3
-	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm0,xmm7
-	paddw	xmm2,xmm3
-	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm0,BYTE_BIT
-	psllw	xmm2,BYTE_BIT
-	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
-
-	; -- process the lower row
-
-	movdqa	xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
-	movdqa	xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
-	movdqa	xmm0,xmm6
-	movdqa	xmm2,xmm4
-	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
-
-	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
-	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm4
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm3,xmm4
-	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(1)], xmm3
-
-	pmullw	xmm6,[rel PW_THREE]
-	pmullw	xmm4,[rel PW_THREE]
-	paddw	xmm1,[rel PW_EIGHT]
-	paddw	xmm0,[rel PW_EIGHT]
-	paddw	xmm7,[rel PW_SEVEN]
-	paddw	xmm5,[rel PW_SEVEN]
-
-	paddw	xmm1,xmm6
-	paddw	xmm0,xmm4
-	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm7,xmm6
-	paddw	xmm5,xmm4
-	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm7,BYTE_BIT
-	psllw	xmm5,BYTE_BIT
-	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
-
-	sub	rax, byte SIZEOF_XMMWORD
-	add	rcx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
-	add	rbx, byte 1*SIZEOF_XMMWORD	; inptr0
-	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
-	add	rdx, byte 2*SIZEOF_XMMWORD	; outptr0
-	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr1
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	near .columnloop
-	test	rax,rax
-	jnz	near .columnloop_last
-
-	pop	rsi
-	pop	rdi
-	pop	rcx
-	pop	rax
-
-	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	rcx, byte 2			; rowctr
-	jg	near .rowloop
-
-.return:
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_upsample_sse2):	; x86-64 entry: repeat every input sample twice along each row
-	push	rbp
-	mov	rax,rsp		; NOTE(review): presumably consumed by collect_args -- confirm in jsimdext.inc
-	mov	rbp,rsp
-	collect_args		; macro defined elsewhere; the arguments are read from r10-r13 below
-
-	mov	edx, r11d		; edx = output_width (the 32-bit write clears the upper half of rdx)
-	add	rdx, byte (2*SIZEOF_XMMWORD)-1	; round the width up to a whole number
-	and	rdx, byte -(2*SIZEOF_XMMWORD)	; of 2*SIZEOF_XMMWORD units
-	jz	near .return		; rounded width is 0: nothing to do
-
-	mov	rcx, r10	; rowctr
-	test	rcx,rcx
-	jz	short .return		; no rows: nothing to do
-
-	mov	rsi, r12 ; input_data
-	mov	rdi, r13		; rdi = output_data_ptr
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rdi		; save both row-pointer cursors; rsi/rdi are
-	push	rsi		; reused as sample pointers inside the row
-
-	mov	rsi, JSAMPROW [rsi]		; inptr
-	mov	rdi, JSAMPROW [rdi]		; outptr
-	mov	rax,rdx				; colctr
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; load one XMMWORD of input samples
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0		; xmm0 = low-half bytes, each one doubled
-	punpckhbw xmm1,xmm1		; xmm1 = high-half bytes, each one doubled
-
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0	; store two XMMWORDs of output
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	rax, byte 2*SIZEOF_XMMWORD	; colctr -= amount just stored
-	jz	short .nextrow		; exits only on exactly 0; relies on the rounding of rdx above
-
-	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; second input XMMWORD, same treatment
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	rax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr
-	jmp	short .columnloop
-
-.nextrow:
-	pop	rsi		; restore the row-pointer cursors saved at .rowloop
-	pop	rdi
-
-	add	rsi, byte SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte SIZEOF_JSAMPROW	; output_data
-	dec	rcx				; rowctr
-	jg	short .rowloop		; continue while rowctr > 0 (signed)
-
-.return:
-	uncollect_args		; macro defined elsewhere; counterpart of collect_args
-	pop	rbp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_upsample_sse2):	; x86-64 entry: double every sample and write each row to two output rows
-	push	rbp
-	mov	rax,rsp		; NOTE(review): presumably consumed by collect_args -- confirm in jsimdext.inc
-	mov	rbp,rsp
-	collect_args		; macro defined elsewhere; the arguments are read from r10-r13 below
-	push	rbx		; rbx is used as outptr0 below; popped at .return
-
-	mov	edx, r11d		; edx = output_width (the 32-bit write clears the upper half of rdx)
-	add	rdx, byte (2*SIZEOF_XMMWORD)-1	; round the width up to a whole number
-	and	rdx, byte -(2*SIZEOF_XMMWORD)	; of 2*SIZEOF_XMMWORD units
-	jz	near .return		; rounded width is 0: nothing to do
-
-	mov	rcx, r10	; rowctr
-	test	rcx,rcx
-	jz	near .return		; no rows: nothing to do
-
-	mov	rsi, r12	; input_data
-	mov	rdi, r13		; rdi = output_data_ptr
-	mov	rdi, JSAMPARRAY [rdi]			; output_data
-.rowloop:
-	push	rdi		; save both row-pointer cursors; rsi/rdi are
-	push	rsi		; reused as sample pointers inside the row
-
-	mov	rsi, JSAMPROW [rsi]			; inptr
-	mov	rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
-	mov	rax,rdx					; colctr
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; load one XMMWORD of input samples
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0		; xmm0 = low-half bytes, each one doubled
-	punpckhbw xmm1,xmm1		; xmm1 = high-half bytes, each one doubled
-
-	movdqa	XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0	; identical data goes to
-	movdqa	XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1	; both output rows
-	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	rax, byte 2*SIZEOF_XMMWORD	; colctr -= amount just stored per row
-	jz	short .nextrow		; exits only on exactly 0; relies on the rounding of rdx above
-
-	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; second input XMMWORD, same treatment
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
-	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	rax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	rbx, byte 4*SIZEOF_XMMWORD	; outptr0
-	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr1
-	jmp	short .columnloop
-
-.nextrow:
-	pop	rsi		; restore the row-pointer cursors saved at .rowloop
-	pop	rdi
-
-	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	rcx, byte 2			; rowctr
-	jg	near .rowloop		; continue while rowctr > 0 (signed)
-
-.return:
-	pop	rbx		; matches the push after collect_args (both early exits come after it)
-	uncollect_args		; macro defined elsewhere; counterpart of collect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jdsamss2.asm b/simd/jdsamss2.asm
deleted file mode 100644
index c91a863..0000000
--- a/simd/jdsamss2.asm
+++ /dev/null
@@ -1,729 +0,0 @@
-;
-; jdsamss2.asm - upsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
-
-EXTN(jconst_fancy_upsample_sse2):	; word constants for the fancy-upsample routines in this file
-
-PW_ONE		times 8 dw  1	; h2v1: added before ">> 2" for the even outputs (OutLE/OutHE)
-PW_TWO		times 8 dw  2	; h2v1: added before ">> 2" for the odd outputs (OutLO/OutHO)
-PW_THREE	times 8 dw  3	; multiplier applied to the centre sample / centre row
-PW_SEVEN	times 8 dw  7	; h2v2: added before ">> 4" for the odd outputs
-PW_EIGHT	times 8 dw  8	; h2v2: added before ">> 4" for the even outputs
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):	; x86-32 entry: each output is (3*sample + neighbour + rounding) >> 2
-	push	ebp
-	mov	ebp,esp
-	pushpic	ebx		; macro defined elsewhere; paired with poppic at .return
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-	test	eax,eax
-	jz	near .return		; zero width: nothing to do
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	near .return		; no rows: nothing to do
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax			; colctr
-	push	edi
-	push	esi
-
-	mov	esi, JSAMPROW [esi]	; inptr
-	mov	edi, JSAMPROW [edi]	; outptr
-
-	test	eax, SIZEOF_XMMWORD-1	; is the width a multiple of SIZEOF_XMMWORD?
-	jz	short .skip		; yes: no padding sample needed
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; dl = last real sample of the row
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-.skip:
-	pxor	xmm0,xmm0		; xmm0=(all 0's)
-	pcmpeqb	xmm7,xmm7		; xmm7=(all 1's)
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; keep only byte 0 set
-	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm7=( 0 -- -- ... -- -- --): sample 0 doubles as sample -1
-
-	add	eax, byte SIZEOF_XMMWORD-1	; round colctr up to a whole number
-	and	eax, byte -SIZEOF_XMMWORD	; of XMMWORDs
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .columnloop	; more than one block in the row
-	alignx	16,7
-
-.columnloop_last:
-	pcmpeqb	xmm6,xmm6		; xmm6=(all 1's)
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; keep only the top byte set
-	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm6=(-- -- -- ... -- -- 15): sample 15 doubles as sample 16
-	jmp	short .upsample
-	alignx	16,7
-
-.columnloop:
-	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]	; peek at the next block
-	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; xmm6=(-- -- -- ... -- -- 16)
-
-.upsample:
-	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-	movdqa	xmm2,xmm1
-	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
-	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
-	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
-
-	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
-	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
-
-	movdqa	xmm7,xmm1
-	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
-
-	movdqa    xmm4,xmm1
-	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm2
-	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
-	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
-	movdqa    xmm6,xmm3
-	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
-	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
-
-	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]	; 3 * centre sample (low half)
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; 3 * centre sample (high half)
-	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]	; left neighbour + 1
-	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
-	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]	; right neighbour + 2
-	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
-
-	paddw	xmm2,xmm1
-	paddw	xmm5,xmm4
-	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-	paddw	xmm3,xmm1
-	paddw	xmm6,xmm4
-	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm3,BYTE_BIT		; shift the odd outputs left by BYTE_BIT bits
-	psllw	xmm6,BYTE_BIT		; so the por below interleaves them with the even ones
-	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
-
-	sub	eax, byte SIZEOF_XMMWORD	; one input block consumed
-	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	near .columnloop	; more than one block left
-	test	eax,eax
-	jnz	near .columnloop_last	; exactly one block left
-
-	pop	esi		; restore the values pushed at .rowloop
-	pop	edi
-	pop	eax
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	near .rowloop		; continue while rowctr > 0 (signed)
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		4
-%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):	; x86-32 entry: vertical pass (3*row + neighbour row), then horizontal pass with ">> 4"
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4		; room for the value stored two lines below
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax		; reloaded by "pop esp" at .return
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]		; esp = &wk[0]; the wk area sits just below ebp
-	pushpic	eax		; make a room for GOT address
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx			; get GOT address
-	movpic	POINTER [gotptr], ebx	; save GOT address
-
-	mov	edx,eax				; edx = original ebp
-	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-	test	eax,eax
-	jz	near .return		; zero width: nothing to do
-
-	mov	ecx, INT [max_v_samp(edx)]	; rowctr
-	test	ecx,ecx
-	jz	near .return		; no rows: nothing to do
-
-	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
-	mov	edi, POINTER [output_data_ptr(edx)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	eax					; colctr
-	push	ecx		; rowctr; ecx is reused as inptr1(above) below
-	push	edi
-	push	esi
-
-	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
-	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
-	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-
-	test	eax, SIZEOF_XMMWORD-1	; is the width a multiple of SIZEOF_XMMWORD?
-	jz	short .skip		; yes: no padding samples needed
-	push	edx		; edx (outptr0) is borrowed: dl is the scratch byte
-	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]	; pad the row above
-	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]	; pad the current row
-	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; pad the row below
-	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
-	pop	edx
-.skip:
-	; -- process the first column block
-
-	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
-	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
-	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
-
-	pushpic	ebx		; ebx holds inptr0; set aside while ebx carries the GOT address
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0] (low half)
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0] (high half)
-
-	pcmpeqb	xmm7,xmm7		; xmm7=(all 1's)
-	psrldq	xmm7,(SIZEOF_XMMWORD-2)	; keep only word 0 set
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
-
-	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
-	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
-
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0) = "-1" word for the upper row (OR-ed in at .upsample)
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1) = "-1" word for the lower row
-
-	poppic	ebx
-
-	add	eax, byte SIZEOF_XMMWORD-1	; round colctr up to a whole number
-	and	eax, byte -SIZEOF_XMMWORD	; of XMMWORDs
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .columnloop	; more than one block in the row
-	alignx	16,7
-
-.columnloop_last:
-	; -- process the last column block
-
-	pushpic	ebx		; balanced by the poppic after .upsample
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pcmpeqb	xmm1,xmm1		; xmm1=(all 1's)
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; keep only the top word set
-	movdqa	xmm2,xmm1
-
-	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]	; top word of the saved Int0H
-	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]	; top word of the saved Int1H
-
-	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
-	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
-
-	jmp	near .upsample
-	alignx	16,7
-
-.columnloop:
-	; -- process the next column block
-
-	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
-	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
-	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
-
-	pushpic	ebx		; balanced by the poppic after .upsample
-	movpic	ebx, POINTER [gotptr]	; load GOT address
-
-	pxor      xmm3,xmm3		; xmm3=(all 0's)
-	movdqa    xmm4,xmm0
-	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm5,xmm1
-	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-	movdqa    xmm6,xmm2
-	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0] (low half)
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; 3 * row[0] (high half)
-
-	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
-	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
-	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
-
-	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
-	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
-
-	movdqa	XMMWORD [wk(2)], xmm1	; wk(2) = "16" word for the upper row (OR-ed in at .upsample)
-	movdqa	XMMWORD [wk(3)], xmm2	; wk(3) = "16" word for the lower row
-
-.upsample:
-	; -- process the upper row
-
-	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]	; reload the intermediate data saved in the output row
-	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
-	movdqa	xmm5,xmm7
-	movdqa	xmm6,xmm3
-	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
-
-	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
-	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm2,xmm3
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm4,xmm3
-	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(0)], xmm4	; word 15 becomes the "-1" word of the next block
-
-	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]	; 3 * centre word (low half)
-	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]	; 3 * centre word (high half)
-	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]	; left neighbour + 8
-	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]	; right neighbour + 7
-	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	xmm1,xmm7
-	paddw	xmm5,xmm3
-	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm0,xmm7
-	paddw	xmm2,xmm3
-	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm0,BYTE_BIT		; shift the odd outputs left by BYTE_BIT bits
-	psllw	xmm2,BYTE_BIT		; so the por below interleaves them with the even ones
-	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; final samples overwrite the intermediate data
-	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
-
-	; -- process the lower row
-
-	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]	; reload the intermediate data saved in the output row
-	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
-	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
-	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
-	movdqa	xmm0,xmm6
-	movdqa	xmm2,xmm4
-	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
-	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
-
-	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
-	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
-
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm4
-	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
-	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
-	movdqa	xmm3,xmm4
-	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
-
-	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
-	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
-
-	movdqa	XMMWORD [wk(1)], xmm3	; word 15 becomes the "-1" word of the next block
-
-	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]	; 3 * centre word (low half)
-	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; 3 * centre word (high half)
-	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]	; left neighbour + 8
-	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
-	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]	; right neighbour + 7
-	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
-
-	paddw	xmm1,xmm6
-	paddw	xmm0,xmm4
-	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-	paddw	xmm7,xmm6
-	paddw	xmm5,xmm4
-	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-	psllw	xmm7,BYTE_BIT		; shift the odd outputs left by BYTE_BIT bits
-	psllw	xmm5,BYTE_BIT		; so the por below interleaves them with the even ones
-	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1	; final samples overwrite the intermediate data
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
-
-	poppic	ebx		; ebx = inptr0 again
-
-	sub	eax, byte SIZEOF_XMMWORD	; one input block consumed
-	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
-	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
-	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
-	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
-	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	near .columnloop	; more than one block left
-	test	eax,eax
-	jnz	near .columnloop_last	; exactly one block left
-
-	pop	esi		; restore the values pushed at .rowloop
-	pop	edi
-	pop	ecx
-	pop	eax
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	near .rowloop		; continue while rowctr > 0 (signed)
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v1_upsample_sse2):	; x86-32 entry: repeat every input sample twice along each row
-	push	ebp
-	mov	ebp,esp
-;	push	ebx		; unused
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_XMMWORD)-1	; round the width up to a whole number
-	and	edx, byte -(2*SIZEOF_XMMWORD)	; of 2*SIZEOF_XMMWORD units
-	jz	short .return		; rounded width is 0: nothing to do
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	short .return		; no rows: nothing to do
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi		; save both row-pointer cursors; esi/edi are
-	push	esi		; reused as sample pointers inside the row
-
-	mov	esi, JSAMPROW [esi]		; inptr
-	mov	edi, JSAMPROW [edi]		; outptr
-	mov	eax,edx				; colctr
-	alignx	16,7
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]	; load one XMMWORD of input samples
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0		; xmm0 = low-half bytes, each one doubled
-	punpckhbw xmm1,xmm1		; xmm1 = high-half bytes, each one doubled
-
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0	; store two XMMWORDs of output
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	eax, byte 2*SIZEOF_XMMWORD	; colctr -= amount just stored
-	jz	short .nextrow		; exits only on exactly 0; relies on the rounding of edx above
-
-	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; second input XMMWORD, same treatment
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	eax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	edi, byte 4*SIZEOF_XMMWORD	; outptr
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi		; restore the row-pointer cursors saved at .rowloop
-	pop	edi
-
-	add	esi, byte SIZEOF_JSAMPROW	; input_data
-	add	edi, byte SIZEOF_JSAMPROW	; output_data
-	dec	ecx				; rowctr
-	jg	short .rowloop		; continue while rowctr > 0 (signed)
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-;	pop	ebx		; unused
-	pop	ebp
-	ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY * output_data_ptr);
-;
-
-%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
-%define output_width(b)	(b)+12		; JDIMENSION output_width
-%define input_data(b)		(b)+16		; JSAMPARRAY input_data
-%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
-
-	align	16
-	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
-
-EXTN(jsimd_h2v2_upsample_sse2):	; x86-32 entry: double every sample and write each row to two output rows
-	push	ebp
-	mov	ebp,esp
-	push	ebx		; ebx is used as outptr0 below
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	mov	edx, JDIMENSION [output_width(ebp)]
-	add	edx, byte (2*SIZEOF_XMMWORD)-1	; round the width up to a whole number
-	and	edx, byte -(2*SIZEOF_XMMWORD)	; of 2*SIZEOF_XMMWORD units
-	jz	near .return		; rounded width is 0: nothing to do
-
-	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
-	test	ecx,ecx
-	jz	near .return		; no rows: nothing to do
-
-	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
-	mov	edi, POINTER [output_data_ptr(ebp)]
-	mov	edi, JSAMPARRAY [edi]			; output_data
-	alignx	16,7
-.rowloop:
-	push	edi		; save both row-pointer cursors; esi/edi are
-	push	esi		; reused as sample pointers inside the row
-
-	mov	esi, JSAMPROW [esi]			; inptr
-	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
-	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
-	mov	eax,edx					; colctr
-	alignx	16,7
-.columnloop:
-
-	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]	; load one XMMWORD of input samples
-
-	movdqa    xmm1,xmm0
-	punpcklbw xmm0,xmm0		; xmm0 = low-half bytes, each one doubled
-	punpckhbw xmm1,xmm1		; xmm1 = high-half bytes, each one doubled
-
-	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0	; identical data goes to
-	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1	; both output rows
-	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-	sub	eax, byte 2*SIZEOF_XMMWORD	; colctr -= amount just stored per row
-	jz	short .nextrow		; exits only on exactly 0; relies on the rounding of edx above
-
-	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; second input XMMWORD, same treatment
-
-	movdqa    xmm3,xmm2
-	punpcklbw xmm2,xmm2
-	punpckhbw xmm3,xmm3
-
-	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
-	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-	sub	eax, byte 2*SIZEOF_XMMWORD
-	jz	short .nextrow
-
-	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
-	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0
-	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1
-	jmp	short .columnloop
-	alignx	16,7
-
-.nextrow:
-	pop	esi		; restore the row-pointer cursors saved at .rowloop
-	pop	edi
-
-	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
-	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
-	sub	ecx, byte 2			; rowctr
-	jg	short .rowloop		; continue while rowctr > 0 (signed)
-
-.return:
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jf3dnflt.asm b/simd/jf3dnflt.asm
deleted file mode 100644
index 432e304..0000000
--- a/simd/jf3dnflt.asm
+++ /dev/null
@@ -1,320 +0,0 @@
-;
-; jf3dnflt.asm - floating-point FDCT (3DNow!)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_float_3dnow) PRIVATE
-
-EXTN(jconst_fdct_float_3dnow):
-
-PD_0_382	times 2 dd  0.382683432365089771728460
-PD_0_707	times 2 dd  0.707106781186547524400844
-PD_0_541	times 2 dd  0.541196100146196984399723
-PD_1_306	times 2 dd  1.306562964876376527856643
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
-;
-
-%define data(b)		(b)+8		; FAST_FLOAT * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_float_3dnow) PRIVATE
-
-EXTN(jsimd_fdct_float_3dnow):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.rowloop:
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
-
-	movq      mm4,mm0		; transpose coefficients
-	punpckldq mm0,mm1		; mm0=(00 10)=data0
-	punpckhdq mm4,mm1		; mm4=(01 11)=data1
-	movq      mm5,mm2		; transpose coefficients
-	punpckldq mm2,mm3		; mm2=(06 16)=data6
-	punpckhdq mm5,mm3		; mm5=(07 17)=data7
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
-	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
-	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
-	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
-
-	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
-
-	movq      mm4,mm1		; transpose coefficients
-	punpckldq mm1,mm3		; mm1=(02 12)=data2
-	punpckhdq mm4,mm3		; mm4=(03 13)=data3
-	movq      mm0,mm2		; transpose coefficients
-	punpckldq mm2,mm5		; mm2=(04 14)=data4
-	punpckhdq mm0,mm5		; mm0=(05 15)=data5
-
-	movq	mm3,mm4
-	movq	mm5,mm1
-	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
-	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
-	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
-	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm2,mm7
-	movq	mm0,mm6
-	pfsub	mm7,mm4			; mm7=tmp13
-	pfsub	mm6,mm1			; mm6=tmp12
-	pfadd	mm2,mm4			; mm2=tmp10
-	pfadd	mm0,mm1			; mm0=tmp11
-
-	pfadd	mm6,mm7
-	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-	movq	mm4,mm2
-	movq	mm1,mm7
-	pfsub	mm2,mm0			; mm2=data4
-	pfsub	mm7,mm6			; mm7=data6
-	pfadd	mm4,mm0			; mm4=data0
-	pfadd	mm1,mm6			; mm1=data2
-
-	movq	MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
-	movq	MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
-
-	pfadd	mm3,mm5			; mm3=tmp10
-	pfadd	mm5,mm0			; mm5=tmp11
-	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
-
-	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-	movq	mm2,mm3			; mm2=tmp10
-	pfsub	mm3,mm0
-	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-	pfadd	mm2,mm3			; mm2=z2
-	pfadd	mm0,mm3			; mm0=z4
-
-	movq	mm7,mm6
-	pfsub	mm6,mm5			; mm6=z13
-	pfadd	mm7,mm5			; mm7=z11
-
-	movq	mm4,mm6
-	movq	mm1,mm7
-	pfsub	mm6,mm2			; mm6=data3
-	pfsub	mm7,mm0			; mm7=data7
-	pfadd	mm4,mm2			; mm4=data5
-	pfadd	mm1,mm0			; mm1=data1
-
-	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	add	edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/2
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
-
-	movq      mm4,mm0		; transpose coefficients
-	punpckldq mm0,mm1		; mm0=(00 01)=data0
-	punpckhdq mm4,mm1		; mm4=(10 11)=data1
-	movq      mm5,mm2		; transpose coefficients
-	punpckldq mm2,mm3		; mm2=(60 61)=data6
-	punpckhdq mm5,mm3		; mm5=(70 71)=data7
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
-	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
-	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
-	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
-
-	movq	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
-
-	movq      mm4,mm1		; transpose coefficients
-	punpckldq mm1,mm3		; mm1=(20 21)=data2
-	punpckhdq mm4,mm3		; mm4=(30 31)=data3
-	movq      mm0,mm2		; transpose coefficients
-	punpckldq mm2,mm5		; mm2=(40 41)=data4
-	punpckhdq mm0,mm5		; mm0=(50 51)=data5
-
-	movq	mm3,mm4
-	movq	mm5,mm1
-	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
-	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
-	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
-	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm2,mm7
-	movq	mm0,mm6
-	pfsub	mm7,mm4			; mm7=tmp13
-	pfsub	mm6,mm1			; mm6=tmp12
-	pfadd	mm2,mm4			; mm2=tmp10
-	pfadd	mm0,mm1			; mm0=tmp11
-
-	pfadd	mm6,mm7
-	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-	movq	mm4,mm2
-	movq	mm1,mm7
-	pfsub	mm2,mm0			; mm2=data4
-	pfsub	mm7,mm6			; mm7=data6
-	pfadd	mm4,mm0			; mm4=data0
-	pfadd	mm1,mm6			; mm1=data2
-
-	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
-	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
-
-	pfadd	mm3,mm5			; mm3=tmp10
-	pfadd	mm5,mm0			; mm5=tmp11
-	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
-
-	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-	movq	mm2,mm3			; mm2=tmp10
-	pfsub	mm3,mm0
-	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-	pfadd	mm2,mm3			; mm2=z2
-	pfadd	mm0,mm3			; mm0=z4
-
-	movq	mm7,mm6
-	pfsub	mm6,mm5			; mm6=z13
-	pfadd	mm7,mm5			; mm7=z11
-
-	movq	mm4,mm6
-	movq	mm1,mm7
-	pfsub	mm6,mm2			; mm6=data3
-	pfsub	mm7,mm0			; mm7=data7
-	pfadd	mm4,mm2			; mm4=data5
-	pfadd	mm1,mm0			; mm1=data1
-
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-	add	edx, byte 2*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .columnloop
-
-	femms		; empty MMX/3DNow! state
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfdctflt-3dn.asm b/simd/jfdctflt-3dn.asm
new file mode 100644
index 0000000..133fe4d
--- /dev/null
+++ b/simd/jfdctflt-3dn.asm
@@ -0,0 +1,320 @@
+;
+; jfdctflt-3dn.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_fdct_float_3dnow)   ; NOTE(review): the removed jf3dnflt.asm tagged this symbol PRIVATE - confirm the visibility change is intended
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382        times 2 dd  0.382683432365089771728460  ; cos(6*pi/16), duplicated for both dwords of an mmword
+PD_0_707        times 2 dd  0.707106781186547524400844  ; cos(4*pi/16) = 1/sqrt(2)
+PD_0_541        times 2 dd  0.541196100146196984399723  ; cos(2*pi/16) - cos(6*pi/16)
+PD_1_306        times 2 dd  1.306562964876376527856643  ; cos(2*pi/16) + cos(6*pi/16)
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow (FAST_FLOAT *data)
+;
+
+%define data(b)         (b)+8           ; FAST_FLOAT *data (first stack argument; b = address of the saved ebp)
+
+%define original_ebp    ebp+0           ; slot where the prologue stores the unaligned frame pointer
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2               ; two scratch slots: wk(0)=tmp6, wk(1)=tmp7
+
+        align   16
+        global  EXTN(jsimd_fdct_float_3dnow)    ; NOTE(review): the removed jf3dnflt.asm tagged this symbol PRIVATE - confirm the visibility change is intended
+
+EXTN(jsimd_fdct_float_3dnow):
+        push    ebp                             ; save the caller's frame pointer
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4                     ; make room for the 4-byte saved pointer before aligning
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; store the unaligned frame pointer at [original_ebp]
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area just below ebp
+        pushpic ebx                             ; ebx is used for the GOT address below
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+;       push    esi             ; unused
+;       push    edi             ; unused
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process rows.
+
+        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
+        mov     ecx, DCTSIZE/2                  ; loop count: two rows are handled per iteration
+        alignx  16,7
+.rowloop:
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+        ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+        movq      mm4,mm0               ; transpose coefficients
+        punpckldq mm0,mm1               ; mm0=(00 10)=data0
+        punpckhdq mm4,mm1               ; mm4=(01 11)=data1
+        movq      mm5,mm2               ; transpose coefficients
+        punpckldq mm2,mm3               ; mm2=(06 16)=data6
+        punpckhdq mm5,mm3               ; mm5=(07 17)=data7
+
+        movq    mm6,mm4                 ; mm6=data1 (copy)
+        movq    mm7,mm0                 ; mm7=data0 (copy)
+        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
+        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
+        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
+        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0
+
+        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+        ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+        movq      mm4,mm1               ; transpose coefficients
+        punpckldq mm1,mm3               ; mm1=(02 12)=data2
+        punpckhdq mm4,mm3               ; mm4=(03 13)=data3
+        movq      mm0,mm2               ; transpose coefficients
+        punpckldq mm2,mm5               ; mm2=(04 14)=data4
+        punpckhdq mm0,mm5               ; mm0=(05 15)=data5
+
+        movq    mm3,mm4                 ; mm3=data3 (copy)
+        movq    mm5,mm1                 ; mm5=data2 (copy)
+        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
+        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
+        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
+        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq    mm2,mm7                 ; mm2=tmp0 (copy)
+        movq    mm0,mm6                 ; mm0=tmp1 (copy)
+        pfsub   mm7,mm4                 ; mm7=tmp13
+        pfsub   mm6,mm1                 ; mm6=tmp12
+        pfadd   mm2,mm4                 ; mm2=tmp10
+        pfadd   mm0,mm1                 ; mm0=tmp11
+
+        pfadd   mm6,mm7                 ; mm6=tmp12+tmp13
+        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+        movq    mm4,mm2                 ; mm4=tmp10 (copy)
+        movq    mm1,mm7                 ; mm1=tmp13 (copy)
+        pfsub   mm2,mm0                 ; mm2=data4
+        pfsub   mm7,mm6                 ; mm7=data6
+        pfadd   mm4,mm0                 ; mm4=data0
+        pfadd   mm1,mm6                 ; mm1=data2
+
+        movq    MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2  ; store data4
+        movq    MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data6
+        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data0
+        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data2
+
+        ; -- Odd part
+
+        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
+        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+        pfadd   mm3,mm5                 ; mm3=tmp10
+        pfadd   mm5,mm0                 ; mm5=tmp11
+        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7
+
+        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+        movq    mm2,mm3                 ; mm2=tmp10
+        pfsub   mm3,mm0                 ; mm3=tmp10-tmp12
+        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+        pfadd   mm2,mm3                 ; mm2=z2
+        pfadd   mm0,mm3                 ; mm0=z4
+
+        movq    mm7,mm6                 ; mm7=tmp7 (copy)
+        pfsub   mm6,mm5                 ; mm6=z13
+        pfadd   mm7,mm5                 ; mm7=z11
+
+        movq    mm4,mm6                 ; mm4=z13 (copy)
+        movq    mm1,mm7                 ; mm1=z11 (copy)
+        pfsub   mm6,mm2                 ; mm6=data3
+        pfsub   mm7,mm0                 ; mm7=data7
+        pfadd   mm4,mm2                 ; mm4=data5
+        pfadd   mm1,mm0                 ; mm1=data1
+
+        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6  ; store data3
+        movq    MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data7
+        movq    MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data5
+        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data1
+
+        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; advance edx past the two rows just processed
+        dec     ecx
+        jnz     near .rowloop           ; repeat for the remaining row pairs
+
+        ; ---- Pass 2: process columns.
+
+        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
+        mov     ecx, DCTSIZE/2                  ; loop count: two columns are handled per iteration
+        alignx  16,7
+.columnloop:
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+        ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+        movq      mm4,mm0               ; transpose coefficients
+        punpckldq mm0,mm1               ; mm0=(00 01)=data0
+        punpckhdq mm4,mm1               ; mm4=(10 11)=data1
+        movq      mm5,mm2               ; transpose coefficients
+        punpckldq mm2,mm3               ; mm2=(60 61)=data6
+        punpckhdq mm5,mm3               ; mm5=(70 71)=data7
+
+        movq    mm6,mm4                 ; mm6=data1 (copy)
+        movq    mm7,mm0                 ; mm7=data0 (copy)
+        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
+        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
+        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
+        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0
+
+        movq    mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+        ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+        movq      mm4,mm1               ; transpose coefficients
+        punpckldq mm1,mm3               ; mm1=(20 21)=data2
+        punpckhdq mm4,mm3               ; mm4=(30 31)=data3
+        movq      mm0,mm2               ; transpose coefficients
+        punpckldq mm2,mm5               ; mm2=(40 41)=data4
+        punpckhdq mm0,mm5               ; mm0=(50 51)=data5
+
+        movq    mm3,mm4                 ; mm3=data3 (copy)
+        movq    mm5,mm1                 ; mm5=data2 (copy)
+        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
+        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
+        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
+        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq    mm2,mm7                 ; mm2=tmp0 (copy)
+        movq    mm0,mm6                 ; mm0=tmp1 (copy)
+        pfsub   mm7,mm4                 ; mm7=tmp13
+        pfsub   mm6,mm1                 ; mm6=tmp12
+        pfadd   mm2,mm4                 ; mm2=tmp10
+        pfadd   mm0,mm1                 ; mm0=tmp11
+
+        pfadd   mm6,mm7                 ; mm6=tmp12+tmp13
+        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+        movq    mm4,mm2                 ; mm4=tmp10 (copy)
+        movq    mm1,mm7                 ; mm1=tmp13 (copy)
+        pfsub   mm2,mm0                 ; mm2=data4
+        pfsub   mm7,mm6                 ; mm7=data6
+        pfadd   mm4,mm0                 ; mm4=data0
+        pfadd   mm1,mm6                 ; mm1=data2
+
+        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2  ; store data4
+        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data6
+        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data0
+        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data2
+
+        ; -- Odd part
+
+        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
+        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+        pfadd   mm3,mm5                 ; mm3=tmp10
+        pfadd   mm5,mm0                 ; mm5=tmp11
+        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7
+
+        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+        movq    mm2,mm3                 ; mm2=tmp10
+        pfsub   mm3,mm0                 ; mm3=tmp10-tmp12
+        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+        pfadd   mm2,mm3                 ; mm2=z2
+        pfadd   mm0,mm3                 ; mm0=z4
+
+        movq    mm7,mm6                 ; mm7=tmp7 (copy)
+        pfsub   mm6,mm5                 ; mm6=z13
+        pfadd   mm7,mm5                 ; mm7=z11
+
+        movq    mm4,mm6                 ; mm4=z13 (copy)
+        movq    mm1,mm7                 ; mm1=z11 (copy)
+        pfsub   mm6,mm2                 ; mm6=data3
+        pfsub   mm7,mm0                 ; mm7=data7
+        pfadd   mm4,mm2                 ; mm4=data5
+        pfadd   mm1,mm0                 ; mm1=data1
+
+        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6  ; store data3
+        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data7
+        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data5
+        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data1
+
+        add     edx, byte 2*SIZEOF_FAST_FLOAT   ; advance edx by two floats to the next column pair
+        dec     ecx
+        jnz     near .columnloop        ; repeat for the remaining column pairs
+
+        femms           ; empty MMX/3DNow! state
+
+;       pop     edi             ; unused
+;       pop     esi             ; unused
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp             ; restore the caller's frame pointer
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctflt-sse-64.asm b/simd/jfdctflt-sse-64.asm
new file mode 100644
index 0000000..02d5463
--- /dev/null
+++ b/simd/jfdctflt-sse-64.asm
@@ -0,0 +1,358 @@
+;
+; jfdctflt-sse-64.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+        shufps  %1,%2,0x44      ; 0x44 picks elements 0,1 of the destination, then elements 0,1 of the source
+%endmacro
+
+%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+        shufps  %1,%2,0xEE      ; 0xEE picks elements 2,3 of the destination, then elements 2,3 of the source
+%endmacro
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382        times 4 dd  0.382683432365089771728460  ; cos(6*pi/16), replicated in all four dwords of an xmmword
+PD_0_707        times 4 dd  0.707106781186547524400844  ; cos(4*pi/16) = 1/sqrt(2)
+PD_0_541        times 4 dd  0.541196100146196984399723  ; cos(2*pi/16) - cos(6*pi/16)
+PD_1_306        times 4 dd  1.306562964876376527856643  ; cos(2*pi/16) + cos(6*pi/16)
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+        push    rbp
+        mov     rax,rsp                         ; rax = original rbp
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]
+        collect_args
+
+        ; ---- Pass 1: process rows.
+
+        mov     rdx, r10        ; (FAST_FLOAT *)
+        mov     rcx, DCTSIZE/4
+.rowloop:
+
+        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
+        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
+        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
+        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
+
+        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
+        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
+        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
+        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
+
+        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
+        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
+        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
+        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
+
+        movaps  xmm0,xmm7
+        movaps  xmm5,xmm6
+        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
+        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
+        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
+
+        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
+        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
+        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
+        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
+
+        movaps  xmm2,xmm7
+        movaps  xmm3,xmm4
+        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
+        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
+        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps  xmm1,xmm5
+        movaps  xmm6,xmm0
+        subps   xmm5,xmm7               ; xmm5=tmp13
+        subps   xmm0,xmm4               ; xmm0=tmp12
+        addps   xmm1,xmm7               ; xmm1=tmp10
+        addps   xmm6,xmm4               ; xmm6=tmp11
+
+        addps   xmm0,xmm5
+        mulps   xmm0,[rel PD_0_707] ; xmm0=z1
+
+        movaps  xmm7,xmm1
+        movaps  xmm4,xmm5
+        subps   xmm1,xmm6               ; xmm1=data4
+        subps   xmm5,xmm0               ; xmm5=data6
+        addps   xmm7,xmm6               ; xmm7=data0
+        addps   xmm4,xmm0               ; xmm4=data2
+
+        movaps  XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        ; -- Odd part
+
+        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+        addps   xmm2,xmm3               ; xmm2=tmp10
+        addps   xmm3,xmm6               ; xmm3=tmp11
+        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
+
+        mulps   xmm3,[rel PD_0_707] ; xmm3=z3
+
+        movaps  xmm1,xmm2               ; xmm1=tmp10
+        subps   xmm2,xmm6
+        mulps   xmm2,[rel PD_0_382] ; xmm2=z5
+        mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps   xmm1,xmm2               ; xmm1=z2
+        addps   xmm6,xmm2               ; xmm6=z4
+
+        movaps  xmm5,xmm0
+        subps   xmm0,xmm3               ; xmm0=z13
+        addps   xmm5,xmm3               ; xmm5=z11
+
+        movaps  xmm7,xmm0
+        movaps  xmm4,xmm5
+        subps   xmm0,xmm1               ; xmm0=data3
+        subps   xmm5,xmm6               ; xmm5=data7
+        addps   xmm7,xmm1               ; xmm7=data5
+        addps   xmm4,xmm6               ; xmm4=data1
+
+        movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps  XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        add     rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+        dec     rcx
+        jnz     near .rowloop
+
+        ; ---- Pass 2: process columns.
+
+        mov     rdx, r10        ; (FAST_FLOAT *)
+        mov     rcx, DCTSIZE/4
+.columnloop:
+
+        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
+        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
+        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
+        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
+
+        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
+        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
+        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
+        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
+
+        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
+        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
+        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
+        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
+
+        movaps  xmm0,xmm7
+        movaps  xmm5,xmm6
+        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
+        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
+        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
+
+        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
+        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
+        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
+        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
+
+        movaps  xmm2,xmm7
+        movaps  xmm3,xmm4
+        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
+        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
+        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps  xmm1,xmm5
+        movaps  xmm6,xmm0
+        subps   xmm5,xmm7               ; xmm5=tmp13
+        subps   xmm0,xmm4               ; xmm0=tmp12
+        addps   xmm1,xmm7               ; xmm1=tmp10
+        addps   xmm6,xmm4               ; xmm6=tmp11
+
+        addps   xmm0,xmm5
+        mulps   xmm0,[rel PD_0_707] ; xmm0=z1
+
+        movaps  xmm7,xmm1
+        movaps  xmm4,xmm5
+        subps   xmm1,xmm6               ; xmm1=data4
+        subps   xmm5,xmm0               ; xmm5=data6
+        addps   xmm7,xmm6               ; xmm7=data0
+        addps   xmm4,xmm0               ; xmm4=data2
+
+        movaps  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        ; -- Odd part
+
+        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+        addps   xmm2,xmm3               ; xmm2=tmp10
+        addps   xmm3,xmm6               ; xmm3=tmp11
+        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
+
+        mulps   xmm3,[rel PD_0_707] ; xmm3=z3
+
+        movaps  xmm1,xmm2               ; xmm1=tmp10
+        subps   xmm2,xmm6
+        mulps   xmm2,[rel PD_0_382] ; xmm2=z5
+        mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps   xmm1,xmm2               ; xmm1=z2
+        addps   xmm6,xmm2               ; xmm6=z4
+
+        movaps  xmm5,xmm0
+        subps   xmm0,xmm3               ; xmm0=z13
+        addps   xmm5,xmm3               ; xmm5=z11
+
+        movaps  xmm7,xmm0
+        movaps  xmm4,xmm5
+        subps   xmm0,xmm1               ; xmm0=data3
+        subps   xmm5,xmm6               ; xmm5=data7
+        addps   xmm7,xmm1               ; xmm7=data5
+        addps   xmm4,xmm6               ; xmm4=data1
+
+        movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+        movaps  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+        movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+        add     rdx, byte 4*SIZEOF_FAST_FLOAT
+        dec     rcx
+        jnz     near .columnloop
+
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- original rbp
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctflt-sse.asm b/simd/jfdctflt-sse.asm
new file mode 100644
index 0000000..c2f61c8
--- /dev/null
+++ b/simd/jfdctflt-sse.asm
@@ -0,0 +1,370 @@
+;
+; jfdctflt-sse.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+        shufps  %1,%2,0x44      ; 0x44: low two floats of %1, then low two floats of %2
+%endmacro
+
+%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+        shufps  %1,%2,0xEE      ; 0xEE: high two floats of %1, then high two floats of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+; Packed single-precision multipliers; "times 4" repeats each value in all 4 lanes.
+        alignz  16
+        global  EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+; The PD_* labels below are read by mulps through GOTOFF(ebx,...).
+PD_0_382        times 4 dd  0.382683432365089771728460  ; scales (tmp10-tmp12) to give z5
+PD_0_707        times 4 dd  0.707106781186547524400844  ; scales the z1 and z3 inputs
+PD_0_541        times 4 dd  0.541196100146196984399723  ; scales tmp10 (z2 term)
+PD_1_306        times 4 dd  1.306562964876376527856643  ; scales tmp12 (z4 term)
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+; Perform the forward DCT on one block of samples, in place:
+; pass 1 processes rows, pass 2 processes columns (4 per loop iteration).
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT *data)
+;
+; data is accessed with movaps; presumably it must be 16-byte aligned (confirm).
+%define data(b)         (b)+8           ; FAST_FLOAT *data (above saved ebp and return address)
+
+%define original_ebp    ebp+0           ; slot that receives eax in the prologue
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+        push    ebp
+        mov     eax,esp                         ; eax = esp, pointing at the saved ebp
+        sub     esp, byte 4                     ; room for the copy of eax stored below
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; [original_ebp] = unaligned frame pointer
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area below ebp
+        pushpic ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+;       push    esi             ; unused
+;       push    edi             ; unused
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process rows.
+
+        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
+        mov     ecx, DCTSIZE/4                  ; loop count: 4 rows per iteration
+        alignx  16,7
+.rowloop:
+
+        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
+        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
+        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
+        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
+
+        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
+        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
+        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
+        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
+
+        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
+        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
+        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
+        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
+
+        movaps  xmm0,xmm7               ; xmm0=data1
+        movaps  xmm5,xmm6               ; xmm5=data0
+        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
+        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
+        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
+
+        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
+        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
+        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
+        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
+
+        movaps  xmm2,xmm7               ; xmm2=data3
+        movaps  xmm3,xmm4               ; xmm3=data2
+        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
+        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
+        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps  xmm1,xmm5               ; xmm1=tmp0
+        movaps  xmm6,xmm0               ; xmm6=tmp1
+        subps   xmm5,xmm7               ; xmm5=tmp13
+        subps   xmm0,xmm4               ; xmm0=tmp12
+        addps   xmm1,xmm7               ; xmm1=tmp10
+        addps   xmm6,xmm4               ; xmm6=tmp11
+
+        addps   xmm0,xmm5               ; xmm0=tmp12+tmp13
+        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+        movaps  xmm7,xmm1               ; xmm7=tmp10
+        movaps  xmm4,xmm5               ; xmm4=tmp13
+        subps   xmm1,xmm6               ; xmm1=data4
+        subps   xmm5,xmm0               ; xmm5=data6
+        addps   xmm7,xmm6               ; xmm7=data0
+        addps   xmm4,xmm0               ; xmm4=data2
+
+        movaps  XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1     ; data4
+        movaps  XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5     ; data6
+        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7     ; data0
+        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4     ; data2
+
+        ; -- Odd part
+
+        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+        addps   xmm2,xmm3               ; xmm2=tmp10
+        addps   xmm3,xmm6               ; xmm3=tmp11
+        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
+
+        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+        movaps  xmm1,xmm2               ; xmm1=tmp10
+        subps   xmm2,xmm6               ; xmm2=tmp10-tmp12
+        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps   xmm1,xmm2               ; xmm1=z2
+        addps   xmm6,xmm2               ; xmm6=z4
+
+        movaps  xmm5,xmm0               ; xmm5=tmp7
+        subps   xmm0,xmm3               ; xmm0=z13
+        addps   xmm5,xmm3               ; xmm5=z11
+
+        movaps  xmm7,xmm0               ; xmm7=z13
+        movaps  xmm4,xmm5               ; xmm4=z11
+        subps   xmm0,xmm1               ; xmm0=data3
+        subps   xmm5,xmm6               ; xmm5=data7
+        addps   xmm7,xmm1               ; xmm7=data5
+        addps   xmm4,xmm6               ; xmm4=data1
+
+        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0     ; data3
+        movaps  XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5     ; data7
+        movaps  XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7     ; data5
+        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4     ; data1
+
+        add     edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT        ; step to the next 4 rows
+        dec     ecx
+        jnz     near .rowloop
+
+        ; ---- Pass 2: process columns.
+
+        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *), eax still as set in the prologue
+        mov     ecx, DCTSIZE/4                  ; loop count: 4 columns per iteration
+        alignx  16,7
+.columnloop:
+
+        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
+        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
+        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
+        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
+        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
+        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
+
+        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
+        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
+        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
+        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
+        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
+        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
+
+        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
+        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
+        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
+        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
+        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
+
+        movaps  xmm0,xmm7               ; xmm0=data1
+        movaps  xmm5,xmm6               ; xmm5=data0
+        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
+        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
+        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
+
+        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
+        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
+        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
+        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
+
+        movaps  xmm2,xmm7               ; xmm2=data3
+        movaps  xmm3,xmm4               ; xmm3=data2
+        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
+        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
+        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movaps  xmm1,xmm5               ; xmm1=tmp0
+        movaps  xmm6,xmm0               ; xmm6=tmp1
+        subps   xmm5,xmm7               ; xmm5=tmp13
+        subps   xmm0,xmm4               ; xmm0=tmp12
+        addps   xmm1,xmm7               ; xmm1=tmp10
+        addps   xmm6,xmm4               ; xmm6=tmp11
+
+        addps   xmm0,xmm5               ; xmm0=tmp12+tmp13
+        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+        movaps  xmm7,xmm1               ; xmm7=tmp10
+        movaps  xmm4,xmm5               ; xmm4=tmp13
+        subps   xmm1,xmm6               ; xmm1=data4
+        subps   xmm5,xmm0               ; xmm5=data6
+        addps   xmm7,xmm6               ; xmm7=data0
+        addps   xmm4,xmm0               ; xmm4=data2
+
+        movaps  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1     ; data4
+        movaps  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5     ; data6
+        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7     ; data0
+        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4     ; data2
+
+        ; -- Odd part
+
+        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+        addps   xmm2,xmm3               ; xmm2=tmp10
+        addps   xmm3,xmm6               ; xmm3=tmp11
+        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
+
+        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+        movaps  xmm1,xmm2               ; xmm1=tmp10
+        subps   xmm2,xmm6               ; xmm2=tmp10-tmp12
+        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+        addps   xmm1,xmm2               ; xmm1=z2
+        addps   xmm6,xmm2               ; xmm6=z4
+
+        movaps  xmm5,xmm0               ; xmm5=tmp7
+        subps   xmm0,xmm3               ; xmm0=z13
+        addps   xmm5,xmm3               ; xmm5=z11
+
+        movaps  xmm7,xmm0               ; xmm7=z13
+        movaps  xmm4,xmm5               ; xmm4=z11
+        subps   xmm0,xmm1               ; xmm0=data3
+        subps   xmm5,xmm6               ; xmm5=data7
+        addps   xmm7,xmm1               ; xmm7=data5
+        addps   xmm4,xmm6               ; xmm4=data1
+
+        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0     ; data3
+        movaps  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5     ; data7
+        movaps  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7     ; data5
+        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4     ; data1
+
+        add     edx, byte 4*SIZEOF_FAST_FLOAT   ; step to the next 4 columns
+        dec     ecx
+        jnz     near .columnloop
+
+;       pop     edi             ; unused
+;       pop     esi             ; unused
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- value saved at [original_ebp] (points at pushed ebp)
+        pop     ebp             ; restore the caller's ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctfst-altivec.c b/simd/jfdctfst-altivec.c
new file mode 100644
index 0000000..c4cc26e
--- /dev/null
+++ b/simd/jfdctfst-altivec.c
@@ -0,0 +1,156 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382 98   /* FIX(0.382683433), i.e. value * 2^CONST_BITS, rounded */
+#define F_0_541 139  /* FIX(0.541196100) */
+#define F_0_707 181  /* FIX(0.707106781) */
+#define F_1_306 334  /* FIX(1.306562965) */
+/* Fixed-point scaling parameters for the vec_madds() multiplies below. */
+#define CONST_BITS 8  /* fractional bits carried by the F_* constants */
+#define PRE_MULTIPLY_SCALE_BITS 2  /* left shift applied to data before vec_madds */
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)  /* = 5; left shift applied to F_* */
+
+
+#define DO_FDCT()  \
+{  \
+  /* Even part: reads tmp0..tmp3, writes out0, out2, out4, out6 */  \
+  /* (works on the caller's locals; tmp10..tmp13 and z* are overwritten) */  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  /* pre-shift, then vec_madds by the shifted constant: z1 ~= z1 * 181/256 */  \
+  z1 = vec_add(tmp12, tmp13);  \
+  z1 = vec_sl(z1, pre_multiply_scale_bits);  \
+  z1 = vec_madds(z1, pw_0707, pw_zero);  \
+  \
+  out2 = vec_add(tmp13, z1);  \
+  out6 = vec_sub(tmp13, z1);  \
+  \
+  /* Odd part: reads tmp4..tmp7, writes out1, out3, out5, out7 */  \
+  \
+  tmp10 = vec_add(tmp4, tmp5);  \
+  tmp11 = vec_add(tmp5, tmp6);  \
+  tmp12 = vec_add(tmp6, tmp7);  \
+  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
+  z5 = vec_sub(tmp10, tmp12);  \
+  z5 = vec_madds(z5, pw_0382, pw_zero);  \
+  /* z5 is passed as the vec_madds addend, so z2 and z4 each include + z5 */  \
+  z2 = vec_madds(tmp10, pw_0541, z5);  \
+  z4 = vec_madds(tmp12, pw_1306, z5);  \
+  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
+  z3 = vec_madds(tmp11, pw_0707, pw_zero);  \
+  \
+  z11 = vec_add(tmp7, z3);  \
+  z13 = vec_sub(tmp7, z3);  \
+  \
+  out5 = vec_add(z13, z2);  \
+  out3 = vec_sub(z13, z2);  \
+  out1 = vec_add(z11, z4);  \
+  out7 = vec_sub(z11, z4);  \
+}
+
+
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  /* Fast integer forward DCT of the block at data; results overwrite the input. */
+  /* Constants (__8X presumably replicates its argument into all 8 lanes) */
+  __vector short pw_zero = { __8X(0) },
+    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+  /* Pass 1: process rows */
+  /* Load eight vectors at 16-byte strides; data presumably 16-byte aligned. */
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+  /* TRANSPOSE is defined outside this file; presumably row0..7 -> col0..7. */
+  TRANSPOSE(row, col);
+  /* Sums and differences that DO_FDCT() consumes as tmp0..tmp7. */
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT();
+
+  /* Pass 2: process columns */
+  /* The pass-1 results out0..out7 are the input here, landing in row0..row7. */
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT();
+  /* Write out0..out7 back to the same offsets the rows were loaded from. */
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/simd/jfdctfst-mmx.asm b/simd/jfdctfst-mmx.asm
new file mode 100644
index 0000000..41ba00e
--- /dev/null
+++ b/simd/jfdctfst-mmx.asm
@@ -0,0 +1,397 @@
+;
+; jfdctfst-mmx.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      8       ; fractional bits of the F_* constants; 14 is also OK.
+; With CONST_BITS == 8 the values are written out directly (value * 2^8, rounded).
+%if CONST_BITS == 8
+F_0_382 equ      98             ; FIX(0.382683433)
+F_0_541 equ     139             ; FIX(0.541196100)
+F_0_707 equ     181             ; FIX(0.707106781)
+F_1_306 equ     334             ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))  ; rounding right shift by n; inputs below are value * 2^30
+F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
+F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2     ; psllw count applied to data before pmulhw
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)     ; = 6 when CONST_BITS == 8
+
+        alignz  16
+        global  EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+; Each constant is emitted as four identical 16-bit words ("times 4 dw").
+PW_F0707        times 4 dw  F_0_707 << CONST_SHIFT      ; pmulhw multiplier for z1 and z3
+PW_F0382        times 4 dw  F_0_382 << CONST_SHIFT      ; pmulhw multiplier for z5
+PW_F0541        times 4 dw  F_0_541 << CONST_SHIFT      ; pmulhw multiplier for tmp10
+PW_F1306        times 4 dw  F_1_306 << CONST_SHIFT      ; pmulhw multiplier for tmp12
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx (DCTELEM *data)
+;
+
+%define data(b)         (b)+8           ; DCTELEM *data
+
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]
+        pushpic ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+;       push    esi             ; unused
+;       push    edi             ; unused
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process rows.
+
+        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
+        mov     ecx, DCTSIZE/4
+        alignx  16,7
+.rowloop:
+
+        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+        movq      mm4,mm0               ; transpose coefficients(phase 1)
+        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
+        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
+        movq      mm5,mm2               ; transpose coefficients(phase 1)
+        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
+        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
+
+        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+        movq      mm4,mm6               ; transpose coefficients(phase 1)
+        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
+        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
+        movq      mm2,mm1               ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
+        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
+
+        movq      mm7,mm6               ; transpose coefficients(phase 2)
+        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
+        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
+        movq      mm3,mm2               ; transpose coefficients(phase 2)
+        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
+        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
+
+        movq    mm0,mm7
+        movq    mm5,mm6
+        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
+        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
+        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
+        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
+
+        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+        movq      mm7,mm4               ; transpose coefficients(phase 2)
+        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
+        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
+        movq      mm6,mm1               ; transpose coefficients(phase 2)
+        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
+        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
+
+        movq    mm2,mm7
+        movq    mm3,mm4
+        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
+        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
+        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
+        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq    mm1,mm5
+        movq    mm6,mm0
+        psubw   mm5,mm7                 ; mm5=tmp13
+        psubw   mm0,mm4                 ; mm0=tmp12
+        paddw   mm1,mm7                 ; mm1=tmp10
+        paddw   mm6,mm4                 ; mm6=tmp11
+
+        paddw   mm0,mm5
+        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+        movq    mm7,mm1
+        movq    mm4,mm5
+        psubw   mm1,mm6                 ; mm1=data4
+        psubw   mm5,mm0                 ; mm5=data6
+        paddw   mm7,mm6                 ; mm7=data0
+        paddw   mm4,mm0                 ; mm4=data2
+
+        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+        ; -- Odd part
+
+        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
+        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+        paddw   mm2,mm3                 ; mm2=tmp10
+        paddw   mm3,mm6                 ; mm3=tmp11
+        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7
+
+        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
+        psllw   mm6,PRE_MULTIPLY_SCALE_BITS
+
+        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+        movq    mm1,mm2                 ; mm1=tmp10
+        psubw   mm2,mm6
+        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+        paddw   mm1,mm2                 ; mm1=z2
+        paddw   mm6,mm2                 ; mm6=z4
+
+        movq    mm5,mm0
+        psubw   mm0,mm3                 ; mm0=z13
+        paddw   mm5,mm3                 ; mm5=z11
+
+        movq    mm7,mm0
+        movq    mm4,mm5
+        psubw   mm0,mm1                 ; mm0=data3
+        psubw   mm5,mm6                 ; mm5=data7
+        paddw   mm7,mm1                 ; mm7=data5
+        paddw   mm4,mm6                 ; mm4=data1
+
+        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+        dec     ecx
+        jnz     near .rowloop
+
+        ; ---- Pass 2: process columns.
+
+        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
+        mov     ecx, DCTSIZE/4
+        alignx  16,7
+.columnloop:
+
+        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+        movq      mm4,mm0               ; transpose coefficients(phase 1)
+        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
+        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
+        movq      mm5,mm2               ; transpose coefficients(phase 1)
+        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
+        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
+
+        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+        movq      mm4,mm6               ; transpose coefficients(phase 1)
+        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
+        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
+        movq      mm2,mm1               ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
+        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
+
+        movq      mm7,mm6               ; transpose coefficients(phase 2)
+        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
+        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
+        movq      mm3,mm2               ; transpose coefficients(phase 2)
+        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
+        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
+
+        movq    mm0,mm7
+        movq    mm5,mm6
+        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
+        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
+        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
+        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
+
+        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+        movq      mm7,mm4               ; transpose coefficients(phase 2)
+        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
+        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
+        movq      mm6,mm1               ; transpose coefficients(phase 2)
+        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
+        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
+
+        movq    mm2,mm7
+        movq    mm3,mm4
+        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
+        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
+        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
+        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq    mm1,mm5
+        movq    mm6,mm0
+        psubw   mm5,mm7                 ; mm5=tmp13
+        psubw   mm0,mm4                 ; mm0=tmp12
+        paddw   mm1,mm7                 ; mm1=tmp10
+        paddw   mm6,mm4                 ; mm6=tmp11
+
+        paddw   mm0,mm5
+        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+        movq    mm7,mm1
+        movq    mm4,mm5
+        psubw   mm1,mm6                 ; mm1=data4
+        psubw   mm5,mm0                 ; mm5=data6
+        paddw   mm7,mm6                 ; mm7=data0
+        paddw   mm4,mm0                 ; mm4=data2
+
+        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+        ; -- Odd part
+
+        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
+        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+        paddw   mm2,mm3                 ; mm2=tmp10
+        paddw   mm3,mm6                 ; mm3=tmp11
+        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7
+
+        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
+        psllw   mm6,PRE_MULTIPLY_SCALE_BITS
+
+        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+        movq    mm1,mm2                 ; mm1=tmp10
+        psubw   mm2,mm6
+        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+        paddw   mm1,mm2                 ; mm1=z2
+        paddw   mm6,mm2                 ; mm6=z4
+
+        movq    mm5,mm0
+        psubw   mm0,mm3                 ; mm0=z13
+        paddw   mm5,mm3                 ; mm5=z11
+
+        movq    mm7,mm0
+        movq    mm4,mm5
+        psubw   mm0,mm1                 ; mm0=data3
+        psubw   mm5,mm6                 ; mm5=data7
+        paddw   mm7,mm1                 ; mm7=data5
+        paddw   mm4,mm6                 ; mm4=data1
+
+        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+        add     edx, byte 4*SIZEOF_DCTELEM
+        dec     ecx
+        jnz     near .columnloop
+
+        emms            ; empty MMX state
+
+;       pop     edi             ; unused
+;       pop     esi             ; unused
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctfst-sse2-64.asm b/simd/jfdctfst-sse2-64.asm
new file mode 100644
index 0000000..f9b1551
--- /dev/null
+++ b/simd/jfdctfst-sse2-64.asm
@@ -0,0 +1,392 @@
+;
+; jfdctfst-sse2-64.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      8       ; fraction bits of the F_* constants (14 is also OK)
+; FIX(x) in the comments below denotes x scaled by 2^CONST_BITS, rounded to an integer.
+%if CONST_BITS == 8
+F_0_382 equ      98             ; FIX(0.382683433)
+F_0_541 equ     139             ; FIX(0.541196100)
+F_0_707 equ     181             ; FIX(0.707106781)
+F_1_306 equ     334             ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is given pre-scaled by 2^30 and DESCALE rounds it down to CONST_BITS bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
+F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+; Word multipliers used with pmulhw; each PW_* entry repeats one constant in 8 words.
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+; pmulhw keeps the high 16 bits of each signed product, so (x << PRE_MULTIPLY_SCALE_BITS) * (F << CONST_SHIFT) >> 16 == (x * F) >> CONST_BITS.
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+        alignz  16
+        global  EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT      ; FIX(0.707106781) << CONST_SHIFT
+PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT      ; FIX(0.382683433) << CONST_SHIFT
+PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT      ; FIX(0.541196100) << CONST_SHIFT
+PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT      ; FIX(1.306562965) << CONST_SHIFT
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM *data)
+;
+; The 8x8 block at data is transformed in place: pass 1 handles rows, pass 2 columns.
+; r10 = DCTELEM *data
+; NOTE(review): data is accessed with movdqa, so it presumably must be 16-byte aligned.
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = original rsp (points at saved rbp)
+        sub     rsp, byte 4                     ; step below the saved rbp before aligning
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; keep original rsp at the aligned frame base
+        mov     rbp,rsp                         ; rbp = aligned frame base
+        lea     rsp, [wk(0)]                    ; reserve WK_NUM xmmwords below rbp
+        collect_args
+
+        ; ---- Pass 1: process rows.
+
+        mov     rdx, r10        ; (DCTELEM *)
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
+        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
+
+        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+        ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+        ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
+        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
+
+        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
+        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
+        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
+
+        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
+        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
+        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
+        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+        movdqa  xmm6,xmm1
+        movdqa  xmm3,xmm0
+        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
+        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
+        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
+        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
+
+        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
+        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
+        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
+        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+        movdqa  xmm2,xmm1
+        movdqa  xmm5,xmm7
+        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
+        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
+        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
+        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm4,xmm3
+        movdqa  xmm0,xmm6
+        psubw   xmm3,xmm1               ; xmm3=tmp13
+        psubw   xmm6,xmm7               ; xmm6=tmp12
+        paddw   xmm4,xmm1               ; xmm4=tmp10
+        paddw   xmm0,xmm7               ; xmm0=tmp11
+
+        paddw   xmm6,xmm3               ; xmm6=tmp12+tmp13
+        psllw   xmm6,PRE_MULTIPLY_SCALE_BITS    ; pre-scale for pmulhw
+        pmulhw  xmm6,[rel PW_F0707] ; xmm6=z1
+
+        movdqa  xmm1,xmm4
+        movdqa  xmm7,xmm3
+        psubw   xmm4,xmm0               ; xmm4=data4
+        psubw   xmm3,xmm6               ; xmm3=data6
+        paddw   xmm1,xmm0               ; xmm1=data0
+        paddw   xmm7,xmm6               ; xmm7=data2
+
+        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+        ; -- Odd part
+
+        paddw   xmm2,xmm5               ; xmm2=tmp10
+        paddw   xmm5,xmm0               ; xmm5=tmp11
+        paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
+
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS    ; pre-scale tmp10 for pmulhw
+        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS    ; pre-scale tmp12 for pmulhw
+
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS    ; pre-scale tmp11 for pmulhw
+        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z3
+
+        movdqa  xmm4,xmm2               ; xmm4=tmp10
+        psubw   xmm2,xmm0               ; xmm2=tmp10-tmp12
+        pmulhw  xmm2,[rel PW_F0382] ; xmm2=z5
+        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+        pmulhw  xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+        paddw   xmm4,xmm2               ; xmm4=z2
+        paddw   xmm0,xmm2               ; xmm0=z4
+
+        movdqa  xmm3,xmm6               ; xmm3=tmp7
+        psubw   xmm6,xmm5               ; xmm6=z13
+        paddw   xmm3,xmm5               ; xmm3=z11
+
+        movdqa  xmm2,xmm6
+        movdqa  xmm5,xmm3
+        psubw   xmm6,xmm4               ; xmm6=data3
+        psubw   xmm3,xmm0               ; xmm3=data7
+        paddw   xmm2,xmm4               ; xmm2=data5
+        paddw   xmm5,xmm0               ; xmm5=data1
+
+        ; ---- Pass 2: process columns.
+
+        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+        movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
+        punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
+        movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
+        punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
+        punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+        movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
+        punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
+        movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
+        punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
+
+        movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
+        punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
+        punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
+        movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
+        punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
+        punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
+
+        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+        movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
+        punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
+        movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
+        punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
+
+        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
+        punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
+        movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
+        punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+        movdqa  xmm5,xmm6
+        movdqa  xmm3,xmm1
+        psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
+        psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
+        paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
+        paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+        movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
+        punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
+        movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
+        punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+        movdqa  xmm7,xmm6
+        movdqa  xmm0,xmm2
+        paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
+        paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
+        psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
+        psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm4,xmm3
+        movdqa  xmm1,xmm5
+        psubw   xmm3,xmm6               ; xmm3=tmp13
+        psubw   xmm5,xmm2               ; xmm5=tmp12
+        paddw   xmm4,xmm6               ; xmm4=tmp10
+        paddw   xmm1,xmm2               ; xmm1=tmp11
+
+        paddw   xmm5,xmm3               ; xmm5=tmp12+tmp13
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS    ; pre-scale for pmulhw
+        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z1
+
+        movdqa  xmm6,xmm4
+        movdqa  xmm2,xmm3
+        psubw   xmm4,xmm1               ; xmm4=data4
+        psubw   xmm3,xmm5               ; xmm3=data6
+        paddw   xmm6,xmm1               ; xmm6=data0
+        paddw   xmm2,xmm5               ; xmm2=data2
+
+        movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+        movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+        movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+        movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+        ; -- Odd part
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+        paddw   xmm7,xmm0               ; xmm7=tmp10
+        paddw   xmm0,xmm1               ; xmm0=tmp11
+        paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
+
+        psllw   xmm7,PRE_MULTIPLY_SCALE_BITS    ; pre-scale tmp10 for pmulhw
+        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS    ; pre-scale tmp12 for pmulhw
+
+        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS    ; pre-scale tmp11 for pmulhw
+        pmulhw  xmm0,[rel PW_F0707] ; xmm0=z3
+
+        movdqa  xmm4,xmm7               ; xmm4=tmp10
+        psubw   xmm7,xmm1               ; xmm7=tmp10-tmp12
+        pmulhw  xmm7,[rel PW_F0382] ; xmm7=z5
+        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+        pmulhw  xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+        paddw   xmm4,xmm7               ; xmm4=z2
+        paddw   xmm1,xmm7               ; xmm1=z4
+
+        movdqa  xmm3,xmm5               ; xmm3=tmp7
+        psubw   xmm5,xmm0               ; xmm5=z13
+        paddw   xmm3,xmm0               ; xmm3=z11
+
+        movdqa  xmm6,xmm5
+        movdqa  xmm2,xmm3
+        psubw   xmm5,xmm4               ; xmm5=data3
+        psubw   xmm3,xmm1               ; xmm3=data7
+        paddw   xmm6,xmm4               ; xmm6=data5
+        paddw   xmm2,xmm1               ; xmm2=data1
+
+        movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+        movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+        movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+        movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned frame base
+        pop     rsp             ; rsp <- original rsp (saved in the prologue)
+        pop     rbp             ; restore the caller's rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctfst-sse2.asm b/simd/jfdctfst-sse2.asm
new file mode 100644
index 0000000..ebbadad
--- /dev/null
+++ b/simd/jfdctfst-sse2.asm
@@ -0,0 +1,404 @@
+;
+; jfdctfst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      8       ; 14 is also OK.
+; FIX(x) = round(x * 2^CONST_BITS); the %else branch derives it from x * 2^30.
+%if CONST_BITS == 8
+F_0_382 equ      98             ; FIX(0.382683433)
+F_0_541 equ     139             ; FIX(0.541196100)
+F_0_707 equ     181             ; FIX(0.707106781)
+F_1_306 equ     334             ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
+F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+; pmulhw returns the high word of each signed 16x16 product: (a * b) >> 16.
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+; Hence (x << PRE_MULTIPLY_SCALE_BITS) pmulhw PW_F* == (x * F_*) >> CONST_BITS.
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+        alignz  16
+        global  EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+; Each constant is replicated across all 8 words of an xmmword.
+PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
+PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
+PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
+PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+; Perform the forward DCT on one block of samples, in place.  'data' is
+; accessed with movdqa and must therefore be 16-byte aligned.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM *data)
+;
+
+%define data(b)         (b)+8           ; DCTELEM *data
+
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp (esp after push)
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]
+        pushpic ebx
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+;       push    esi             ; unused
+;       push    edi             ; unused
+
+        get_GOT ebx             ; get GOT address (base for GOTOFF below)
+
+        ; ---- Pass 1: process rows.
+
+        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
+        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
+
+        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+        ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+        ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
+        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
+
+        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
+        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
+        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
+
+        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
+        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
+        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
+        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+        movdqa  xmm6,xmm1
+        movdqa  xmm3,xmm0
+        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
+        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
+        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
+        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
+
+        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
+        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
+        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
+        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+        movdqa  xmm2,xmm1
+        movdqa  xmm5,xmm7
+        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
+        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
+        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
+        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm4,xmm3
+        movdqa  xmm0,xmm6
+        psubw   xmm3,xmm1               ; xmm3=tmp13
+        psubw   xmm6,xmm7               ; xmm6=tmp12
+        paddw   xmm4,xmm1               ; xmm4=tmp10
+        paddw   xmm0,xmm7               ; xmm0=tmp11
+
+        paddw   xmm6,xmm3
+        psllw   xmm6,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+        movdqa  xmm1,xmm4
+        movdqa  xmm7,xmm3
+        psubw   xmm4,xmm0               ; xmm4=data4
+        psubw   xmm3,xmm6               ; xmm3=data6
+        paddw   xmm1,xmm0               ; xmm1=data0
+        paddw   xmm7,xmm6               ; xmm7=data2
+
+        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+        ; -- Odd part
+
+        paddw   xmm2,xmm5               ; xmm2=tmp10
+        paddw   xmm5,xmm0               ; xmm5=tmp11
+        paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
+
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
+
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+        movdqa  xmm4,xmm2               ; xmm4=tmp10
+        psubw   xmm2,xmm0
+        pmulhw  xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+        pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+        pmulhw  xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+        paddw   xmm4,xmm2               ; xmm4=z2
+        paddw   xmm0,xmm2               ; xmm0=z4
+
+        movdqa  xmm3,xmm6
+        psubw   xmm6,xmm5               ; xmm6=z13
+        paddw   xmm3,xmm5               ; xmm3=z11
+
+        movdqa  xmm2,xmm6
+        movdqa  xmm5,xmm3
+        psubw   xmm6,xmm4               ; xmm6=data3
+        psubw   xmm3,xmm0               ; xmm3=data7
+        paddw   xmm2,xmm4               ; xmm2=data5
+        paddw   xmm5,xmm0               ; xmm5=data1
+
+        ; ---- Pass 2: process columns.
+
+;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
+
+        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+        movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
+        punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
+        movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
+        punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
+        punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+        movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
+        punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
+        movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
+        punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
+
+        movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
+        punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
+        punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
+        movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
+        punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
+        punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
+
+        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+        movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
+        punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
+        movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
+        punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
+
+        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
+        punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
+        movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
+        punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+        movdqa  xmm5,xmm6
+        movdqa  xmm3,xmm1
+        psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
+        psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
+        paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
+        paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+        movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
+        punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
+        movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
+        punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+        movdqa  xmm7,xmm6
+        movdqa  xmm0,xmm2
+        paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
+        paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
+        psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
+        psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm4,xmm3
+        movdqa  xmm1,xmm5
+        psubw   xmm3,xmm6               ; xmm3=tmp13
+        psubw   xmm5,xmm2               ; xmm5=tmp12
+        paddw   xmm4,xmm6               ; xmm4=tmp10
+        paddw   xmm1,xmm2               ; xmm1=tmp11
+
+        paddw   xmm5,xmm3
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+        movdqa  xmm6,xmm4
+        movdqa  xmm2,xmm3
+        psubw   xmm4,xmm1               ; xmm4=data4
+        psubw   xmm3,xmm5               ; xmm3=data6
+        paddw   xmm6,xmm1               ; xmm6=data0
+        paddw   xmm2,xmm5               ; xmm2=data2
+
+        movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+        movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+        movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+        movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+        ; -- Odd part
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+        paddw   xmm7,xmm0               ; xmm7=tmp10
+        paddw   xmm0,xmm1               ; xmm0=tmp11
+        paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
+
+        psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
+        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
+
+        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+        movdqa  xmm4,xmm7               ; xmm4=tmp10
+        psubw   xmm7,xmm1
+        pmulhw  xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+        pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+        pmulhw  xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+        paddw   xmm4,xmm7               ; xmm4=z2
+        paddw   xmm1,xmm7               ; xmm1=z4
+
+        movdqa  xmm3,xmm5
+        psubw   xmm5,xmm0               ; xmm5=z13
+        paddw   xmm3,xmm0               ; xmm3=z11
+
+        movdqa  xmm6,xmm5
+        movdqa  xmm2,xmm3
+        psubw   xmm5,xmm4               ; xmm5=data3
+        psubw   xmm3,xmm1               ; xmm3=data7
+        paddw   xmm6,xmm4               ; xmm6=data5
+        paddw   xmm2,xmm1               ; xmm2=data1
+
+        movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+        movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+        movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+        movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;       pop     edi             ; unused
+;       pop     esi             ; unused
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
new file mode 100644
index 0000000..c13850a
--- /dev/null
+++ b/simd/jfdctint-altivec.c
@@ -0,0 +1,262 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+/* FIX(x) is x * 2^CONST_BITS (x * 8192), rounded to the nearest integer */
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS)  \
+{  \
+  /* Rest of the even part: out2 and out6.  (Original)  \
+   * z1 = (tmp12 + tmp13) * 0.541196100;  \
+   * data2 = z1 + tmp13 * 0.765366865;  \
+   * data6 = z1 + tmp12 * -1.847759065;  \
+   *  \
+   * (This implementation)  \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
+   */  \
+  /* Pair tmp13/tmp12 so vec_msums forms both products per 32-bit lane */  \
+  tmp1312l = vec_mergeh(tmp13, tmp12);  \
+  tmp1312h = vec_mergel(tmp13, tmp12);  \
+  /* The accumulator operand supplies the rounding term for the shift */  \
+  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS);  \
+  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS);  \
+  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS);  \
+  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS);  \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS);  \
+  out2h = vec_sra(out2h, descale_p##PASS);  \
+  out6l = vec_sra(out6l, descale_p##PASS);  \
+  out6h = vec_sra(out6h, descale_p##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  /* Odd part */  \
+  \
+  z3 = vec_add(tmp4, tmp6);  \
+  z4 = vec_add(tmp5, tmp7);  \
+  \
+  /* (Original)  \
+   * z5 = (z3 + z4) * 1.175875602;  \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
+   * z3 += z5;  z4 += z5;  \
+   *  \
+   * (This implementation)  \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
+   */  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
+  \
+  /* (Original)  \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
+   *  \
+   * (This implementation)  \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;  \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;  \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);  \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);  \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4;  \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4;  \
+   */  \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7);  \
+  tmp47h = vec_mergel(tmp4, tmp7);  \
+  /* z3/z4 already carry the rounding term and are passed as accumulators */  \
+  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l);  \
+  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h);  \
+  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l);  \
+  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h);  \
+  \
+  out7l = vec_sra(out7l, descale_p##PASS);  \
+  out7h = vec_sra(out7h, descale_p##PASS);  \
+  out1l = vec_sra(out1l, descale_p##PASS);  \
+  out1h = vec_sra(out1h, descale_p##PASS);  \
+  \
+  out7 = vec_pack(out7l, out7h);  \
+  out1 = vec_pack(out1l, out1h);  \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6);  \
+  tmp56h = vec_mergel(tmp5, tmp6);  \
+  \
+  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l);  \
+  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h);  \
+  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l);  \
+  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h);  \
+  \
+  out5l = vec_sra(out5l, descale_p##PASS);  \
+  out5h = vec_sra(out5h, descale_p##PASS);  \
+  out3l = vec_sra(out3l, descale_p##PASS);  \
+  out3h = vec_sra(out3h, descale_p##PASS);  \
+  \
+  out5 = vec_pack(out5l, out5h);  \
+  out3 = vec_pack(out3l, out3h);  \
+}
+
+#define DO_FDCT_PASS1()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  /* out0/out4 need no multiply; just scale them up by PASS1_BITS */  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_sl(out0, pass1_bits);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_sl(out4, pass1_bits);  \
+  /* Remaining outputs, descaled by DESCALE_P1 */  \
+  DO_FDCT_COMMON(1);  \
+}
+
+#define DO_FDCT_PASS2()  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  /* out0/out4: add the rounding term, then scale down by PASS1_BITS */  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_add(out0, pw_descale_p2x);  \
+  out0  = vec_sra(out0, pass1_bits);  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_add(out4, pw_descale_p2x);  \
+  out4  = vec_sra(out4, pass1_bits);  \
+  /* Remaining outputs, descaled by DESCALE_P2 */  \
+  DO_FDCT_COMMON(2);  \
+}
+
+/* Slow integer forward DCT of the 8x8 block at 'data', computed in place */
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  /* Constants (coefficient pairs for vec_msums, rounding terms, shifts) */
+  __vector short
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) };
+
+  /* Pass 1: process rows */
+  /* NOTE: vec_ld/vec_st drop the low 4 address bits (16-byte alignment) */
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col);
+  /* Butterflies: sums feed the even part, differences the odd part */
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_PASS1();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_PASS2();
+  /* Store the pass 2 results over the input block */
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/simd/jfdctint-mmx.asm b/simd/jfdctint-mmx.asm
new file mode 100644
index 0000000..47f6041
--- /dev/null
+++ b/simd/jfdctint-mmx.asm
@@ -0,0 +1,622 @@
+;
+; jfdctint-mmx.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13      ; fraction bits of the F_* fixed-point constants
+%define PASS1_BITS      2       ; extra scale bits carried from pass 1 into pass 2
+
+%define DESCALE_P1      (CONST_BITS-PASS1_BITS)  ; right-shift count used in pass 1
+%define DESCALE_P2      (CONST_BITS+PASS1_BITS)  ; right-shift count used in pass 2
+; F_a_bcd below = FIX(a.bcd...): the constant scaled by 2^CONST_BITS, rounded.
+%if CONST_BITS == 13
+F_0_298 equ      2446           ; FIX(0.298631336)
+F_0_390 equ      3196           ; FIX(0.390180644)
+F_0_541 equ      4433           ; FIX(0.541196100)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_175 equ      9633           ; FIX(1.175875602)
+F_1_501 equ     12299           ; FIX(1.501321110)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_1_961 equ     16069           ; FIX(1.961570560)
+F_2_053 equ     16819           ; FIX(2.053119869)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_072 equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time floating-point math; values here are x * 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))  ; x >> n, rounded to nearest
+F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_fdct_islow_mmx)
+; PW_<a>_<b>: multiplier pair (a, b) for pmaddwd; Fnnn ~ +n.nn, MFnnn ~ -n.nn.
+EXTN(jconst_fdct_islow_mmx):
+; PD_DESCALE_*/PW_DESCALE_*: rounding terms (half the divisor) added before shifts.
+PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS-1)
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform the forward DCT on one block of samples, in place: pass 1
+; processes rows, pass 2 processes columns, both writing back to data[].
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx (DCTELEM *data)
+;
+%define data(b)         (b)+8           ; DCTELEM *data (above saved ebp + ret addr)
+
+%define original_ebp    ebp+0           ; slot holding the unaligned frame base
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2               ; number of mmword scratch slots
+
+        align   16
+        global  EXTN(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+        push    ebp
+        mov     eax,esp                         ; eax = unaligned frame base (-> saved ebp)
+        sub     esp, byte 4                     ; room for the saved frame base
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; [original_ebp] = unaligned frame base
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; allocate wk[WK_NUM] below ebp
+        pushpic ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+;       push    esi             ; unused
+;       push    edi             ; unused
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process rows.
+
+        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
+        mov     ecx, DCTSIZE/4                  ; loop count: 4 rows per iteration
+        alignx  16,7
+.rowloop:
+        ; Each iteration transforms 4 rows; a row is two mmwords of 4 words each.
+        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+        movq      mm4,mm0               ; transpose coefficients(phase 1)
+        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
+        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
+        movq      mm5,mm2               ; transpose coefficients(phase 1)
+        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
+        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
+
+        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+        movq      mm4,mm6               ; transpose coefficients(phase 1)
+        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
+        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
+        movq      mm2,mm1               ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
+        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
+
+        movq      mm7,mm6               ; transpose coefficients(phase 2)
+        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
+        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
+        movq      mm3,mm2               ; transpose coefficients(phase 2)
+        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
+        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
+
+        movq    mm0,mm7
+        movq    mm5,mm6
+        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
+        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
+        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
+        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
+
+        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+        movq      mm7,mm4               ; transpose coefficients(phase 2)
+        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
+        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
+        movq      mm6,mm1               ; transpose coefficients(phase 2)
+        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
+        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
+
+        movq    mm2,mm7
+        movq    mm3,mm4
+        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
+        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
+        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
+        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq    mm1,mm5
+        movq    mm6,mm0
+        paddw   mm5,mm7                 ; mm5=tmp10
+        paddw   mm0,mm4                 ; mm0=tmp11
+        psubw   mm1,mm7                 ; mm1=tmp13
+        psubw   mm6,mm4                 ; mm6=tmp12
+
+        movq    mm7,mm5
+        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
+        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
+
+        psllw   mm5,PASS1_BITS          ; mm5=data0
+        psllw   mm7,PASS1_BITS          ; mm7=data4
+        ; Outputs are stored as transposed 4x4 tiles; pass 2 transposes them back.
+        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+        ; (Original)
+        ; z1 = (tmp12 + tmp13) * 0.541196100;
+        ; data2 = z1 + tmp13 * 0.765366865;
+        ; data6 = z1 + tmp12 * -1.847759065;
+        ;
+        ; (This implementation)
+        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+        movq      mm4,mm1               ; mm1=tmp13
+        movq      mm0,mm1
+        punpcklwd mm4,mm6               ; mm6=tmp12
+        punpckhwd mm0,mm6
+        movq      mm1,mm4
+        movq      mm6,mm0
+        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
+        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
+        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
+        ; Descale: add the rounding term, then shift right by DESCALE_P1.
+        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   mm4,DESCALE_P1
+        psrad   mm0,DESCALE_P1
+        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   mm1,DESCALE_P1
+        psrad   mm6,DESCALE_P1
+        ; Pack the dword results back into words (signed saturation).
+        packssdw  mm4,mm0               ; mm4=data2
+        packssdw  mm1,mm6               ; mm1=data6
+
+        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+        ; -- Odd part
+
+        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
+        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+        movq    mm0,mm2                 ; mm2=tmp4
+        movq    mm6,mm3                 ; mm3=tmp5
+        paddw   mm0,mm5                 ; mm0=z3
+        paddw   mm6,mm7                 ; mm6=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movq      mm4,mm0
+        movq      mm1,mm0
+        punpcklwd mm4,mm6
+        punpckhwd mm1,mm6
+        movq      mm0,mm4
+        movq      mm6,mm1
+        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
+        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
+        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
+        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+        ; (Original)
+        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+        movq      mm4,mm2
+        movq      mm1,mm2
+        punpcklwd mm4,mm7
+        punpckhwd mm1,mm7
+        movq      mm2,mm4
+        movq      mm7,mm1
+        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
+        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
+
+        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
+        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
+        paddd   mm2,mm0                 ; mm2=data1L
+        paddd   mm7,mm6                 ; mm7=data1H
+
+        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   mm4,DESCALE_P1
+        psrad   mm1,DESCALE_P1
+        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   mm2,DESCALE_P1
+        psrad   mm7,DESCALE_P1
+
+        packssdw  mm4,mm1               ; mm4=data7
+        packssdw  mm2,mm7               ; mm2=data1
+
+        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+        movq      mm1,mm3
+        movq      mm7,mm3
+        punpcklwd mm1,mm5
+        punpckhwd mm7,mm5
+        movq      mm3,mm1
+        movq      mm5,mm7
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
+        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
+        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
+        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
+
+        paddd   mm1,mm0                 ; mm1=data5L
+        paddd   mm7,mm6                 ; mm7=data5H
+        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
+        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   mm1,DESCALE_P1
+        psrad   mm7,DESCALE_P1
+        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   mm3,DESCALE_P1
+        psrad   mm5,DESCALE_P1
+
+        packssdw  mm1,mm7               ; mm1=data5
+        packssdw  mm3,mm5               ; mm3=data3
+
+        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next 4 rows
+        dec     ecx
+        jnz     near .rowloop
+
+        ; ---- Pass 2: process columns.
+
+        mov     edx, POINTER [data(eax)]        ; (DCTELEM *) reloaded from the stack arg
+        mov     ecx, DCTSIZE/4                  ; loop count: 4 columns per iteration
+        alignx  16,7
+.columnloop:
+        ; Each iteration transforms 4 columns, re-transposing the tiles stored by pass 1.
+        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+        movq      mm4,mm0               ; transpose coefficients(phase 1)
+        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
+        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
+        movq      mm5,mm2               ; transpose coefficients(phase 1)
+        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
+        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
+
+        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+        movq      mm4,mm6               ; transpose coefficients(phase 1)
+        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
+        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
+        movq      mm2,mm1               ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
+        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
+
+        movq      mm7,mm6               ; transpose coefficients(phase 2)
+        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
+        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
+        movq      mm3,mm2               ; transpose coefficients(phase 2)
+        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
+        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
+
+        movq    mm0,mm7
+        movq    mm5,mm6
+        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
+        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
+        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
+        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
+
+        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+        movq      mm7,mm4               ; transpose coefficients(phase 2)
+        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
+        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
+        movq      mm6,mm1               ; transpose coefficients(phase 2)
+        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
+        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
+
+        movq    mm2,mm7
+        movq    mm3,mm4
+        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
+        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
+        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
+        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movq    mm1,mm5
+        movq    mm6,mm0
+        paddw   mm5,mm7                 ; mm5=tmp10
+        paddw   mm0,mm4                 ; mm0=tmp11
+        psubw   mm1,mm7                 ; mm1=tmp13
+        psubw   mm6,mm4                 ; mm6=tmp12
+
+        movq    mm7,mm5
+        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
+        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
+        ; Round, then shift out the PASS1_BITS scaling that pass 1 shifted in.
+        paddw   mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        paddw   mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        psraw   mm5,PASS1_BITS          ; mm5=data0
+        psraw   mm7,PASS1_BITS          ; mm7=data4
+
+        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+        ; (Original)
+        ; z1 = (tmp12 + tmp13) * 0.541196100;
+        ; data2 = z1 + tmp13 * 0.765366865;
+        ; data6 = z1 + tmp12 * -1.847759065;
+        ;
+        ; (This implementation)
+        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+        movq      mm4,mm1               ; mm1=tmp13
+        movq      mm0,mm1
+        punpcklwd mm4,mm6               ; mm6=tmp12
+        punpckhwd mm0,mm6
+        movq      mm1,mm4
+        movq      mm6,mm0
+        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
+        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
+        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
+        ; Descale: add the rounding term, then shift right by DESCALE_P2.
+        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   mm4,DESCALE_P2
+        psrad   mm0,DESCALE_P2
+        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   mm1,DESCALE_P2
+        psrad   mm6,DESCALE_P2
+        ; Pack the dword results back into words (signed saturation).
+        packssdw  mm4,mm0               ; mm4=data2
+        packssdw  mm1,mm6               ; mm1=data6
+
+        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+        ; -- Odd part
+
+        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
+        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+        movq    mm0,mm2                 ; mm2=tmp4
+        movq    mm6,mm3                 ; mm3=tmp5
+        paddw   mm0,mm5                 ; mm0=z3
+        paddw   mm6,mm7                 ; mm6=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movq      mm4,mm0
+        movq      mm1,mm0
+        punpcklwd mm4,mm6
+        punpckhwd mm1,mm6
+        movq      mm0,mm4
+        movq      mm6,mm1
+        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
+        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
+        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
+
+        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
+        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+        ; (Original)
+        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+        movq      mm4,mm2
+        movq      mm1,mm2
+        punpcklwd mm4,mm7
+        punpckhwd mm1,mm7
+        movq      mm2,mm4
+        movq      mm7,mm1
+        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
+        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
+
+        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
+        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
+        paddd   mm2,mm0                 ; mm2=data1L
+        paddd   mm7,mm6                 ; mm7=data1H
+
+        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   mm4,DESCALE_P2
+        psrad   mm1,DESCALE_P2
+        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   mm2,DESCALE_P2
+        psrad   mm7,DESCALE_P2
+
+        packssdw  mm4,mm1               ; mm4=data7
+        packssdw  mm2,mm7               ; mm2=data1
+
+        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+        movq      mm1,mm3
+        movq      mm7,mm3
+        punpcklwd mm1,mm5
+        punpckhwd mm7,mm5
+        movq      mm3,mm1
+        movq      mm5,mm7
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
+        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
+        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
+        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
+
+        paddd   mm1,mm0                 ; mm1=data5L
+        paddd   mm7,mm6                 ; mm7=data5H
+        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
+        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   mm1,DESCALE_P2
+        psrad   mm7,DESCALE_P2
+        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   mm3,DESCALE_P2
+        psrad   mm5,DESCALE_P2
+
+        packssdw  mm1,mm7               ; mm1=data5
+        packssdw  mm3,mm5               ; mm3=data3
+
+        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+        add     edx, byte 4*SIZEOF_DCTELEM      ; next 4 columns
+        dec     ecx
+        jnz     near .columnloop
+
+        emms            ; empty MMX state
+
+;       pop     edi             ; unused
+;       pop     esi             ; unused
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp (discards wk[])
+        pop     esp             ; esp <- unaligned frame base saved at [ebp]
+        pop     ebp             ; restore the caller's ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctint-sse2-64.asm b/simd/jfdctint-sse2-64.asm
new file mode 100644
index 0000000..c23fcfb
--- /dev/null
+++ b/simd/jfdctint-sse2-64.asm
@@ -0,0 +1,622 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13      ; fraction bits of the F_* fixed-point constants
+%define PASS1_BITS      2       ; extra scale bits kept in the pass 1 output
+
+%define DESCALE_P1      (CONST_BITS-PASS1_BITS) ; shift after pass 1 products
+%define DESCALE_P2      (CONST_BITS+PASS1_BITS) ; shift after pass 2 products
+
+%if CONST_BITS == 13
+F_0_298 equ      2446           ; FIX(0.298631336)
+F_0_390 equ      3196           ; FIX(0.390180644)
+F_0_541 equ      4433           ; FIX(0.541196100)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_175 equ      9633           ; FIX(1.175875602)
+F_1_501 equ     12299           ; FIX(1.501321110)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_1_961 equ     16069           ; FIX(1.961570560)
+F_2_053 equ     16819           ; FIX(2.053119869)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_072 equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time FP arithmetic; values below are scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n)) ; x >> n, rounded to nearest
+F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16      ; entries below are used as 128-bit memory operands
+        global  EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541  ; data2 <- (tmp13,tmp12)
+PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)  ; data6 <- (tmp13,tmp12)
+PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175  ; z3    <- (z3,z4)
+PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)  ; z4    <- (z3,z4)
+PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899  ; tmp4  <- (tmp4,tmp7)
+PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)  ; tmp7  <- (tmp4,tmp7)
+PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562  ; tmp5  <- (tmp5,tmp6)
+PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)  ; tmp6  <- (tmp5,tmp6)
+PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1) ; rounding bias, psrad DESCALE_P1
+PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1) ; rounding bias, psrad DESCALE_P2
+PW_DESCALE_P2X  times 8 dw  1 << (PASS1_BITS-1) ; rounding bias, psraw PASS1_BITS
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Perform the forward DCT on one 8x8 block of samples, in place: pass 1
+; transforms rows (scaled up by 2^PASS1_BITS), pass 2 columns (scaling removed).
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM *data)
+;
+; r10 = DCTELEM *data  (accessed with movdqa, so presumably 16-byte aligned)
+
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          6       ; number of scratch xmmwords below rbp
+
+        align   16
+        global  EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+        push    rbp                             ; save caller's rbp
+        mov     rax,rsp                         ; rax = address of saved rbp
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; keep that address at [rbp]
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
+        collect_args            ; macro defined elsewhere; presumably sets r10
+
+        ; ---- Pass 1: process rows.
+
+        mov     rdx, r10        ; (DCTELEM *)
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
+        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
+
+        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+        ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+        ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
+        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
+
+        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
+        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
+        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
+
+        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
+        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
+        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
+        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+        movdqa  xmm6,xmm1               ; xmm6=data1
+        movdqa  xmm3,xmm0               ; xmm3=data0
+        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
+        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
+        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
+        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
+
+        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+        movdqa  xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
+        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
+        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
+        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+        movdqa  xmm2,xmm1               ; xmm2=data3
+        movdqa  xmm5,xmm7               ; xmm5=data2
+        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
+        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
+        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
+        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm4,xmm3               ; xmm4=tmp0
+        movdqa  xmm0,xmm6               ; xmm0=tmp1
+        paddw   xmm3,xmm1               ; xmm3=tmp10
+        paddw   xmm6,xmm7               ; xmm6=tmp11
+        psubw   xmm4,xmm1               ; xmm4=tmp13
+        psubw   xmm0,xmm7               ; xmm0=tmp12
+
+        movdqa  xmm1,xmm3               ; xmm1=tmp10
+        paddw   xmm3,xmm6               ; xmm3=tmp10+tmp11
+        psubw   xmm1,xmm6               ; xmm1=tmp10-tmp11
+
+        psllw   xmm3,PASS1_BITS         ; xmm3=data0
+        psllw   xmm1,PASS1_BITS         ; xmm1=data4
+
+        movdqa  XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+        movdqa  XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+        ; (Original)
+        ; z1 = (tmp12 + tmp13) * 0.541196100;
+        ; data2 = z1 + tmp13 * 0.765366865;
+        ; data6 = z1 + tmp12 * -1.847759065;
+        ;
+        ; (This implementation)
+        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+        movdqa    xmm7,xmm4             ; xmm4=tmp13
+        movdqa    xmm6,xmm4
+        punpcklwd xmm7,xmm0             ; xmm0=tmp12
+        punpckhwd xmm6,xmm0             ; xmm7,xmm6=(tmp13,tmp12) pairs L/H
+        movdqa    xmm4,xmm7             ; duplicate the pairs for data6
+        movdqa    xmm0,xmm6
+        pmaddwd   xmm7,[rel PW_F130_F054]       ; xmm7=data2L
+        pmaddwd   xmm6,[rel PW_F130_F054]       ; xmm6=data2H
+        pmaddwd   xmm4,[rel PW_F054_MF130]      ; xmm4=data6L
+        pmaddwd   xmm0,[rel PW_F054_MF130]      ; xmm0=data6H
+
+        paddd   xmm7,[rel PD_DESCALE_P1]        ; round, then descale
+        paddd   xmm6,[rel PD_DESCALE_P1]
+        psrad   xmm7,DESCALE_P1
+        psrad   xmm6,DESCALE_P1
+        paddd   xmm4,[rel PD_DESCALE_P1]
+        paddd   xmm0,[rel PD_DESCALE_P1]
+        psrad   xmm4,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+
+        packssdw  xmm7,xmm6             ; xmm7=data2
+        packssdw  xmm4,xmm0             ; xmm4=data6
+
+        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+        ; -- Odd part
+
+        movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+        movdqa  xmm6,xmm2               ; xmm2=tmp4
+        movdqa  xmm0,xmm5               ; xmm5=tmp5
+        paddw   xmm6,xmm3               ; xmm6=z3
+        paddw   xmm0,xmm1               ; xmm0=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm7,xmm6
+        movdqa    xmm4,xmm6
+        punpcklwd xmm7,xmm0
+        punpckhwd xmm4,xmm0             ; xmm7,xmm4=(z3,z4) pairs L/H
+        movdqa    xmm6,xmm7
+        movdqa    xmm0,xmm4
+        pmaddwd   xmm7,[rel PW_MF078_F117]      ; xmm7=z3L
+        pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3H
+        pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4L
+        pmaddwd   xmm0,[rel PW_F117_F078]       ; xmm0=z4H
+
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+        ; (Original)
+        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+        movdqa    xmm7,xmm2
+        movdqa    xmm4,xmm2
+        punpcklwd xmm7,xmm1
+        punpckhwd xmm4,xmm1             ; xmm7,xmm4=(tmp4,tmp7) pairs L/H
+        movdqa    xmm2,xmm7
+        movdqa    xmm1,xmm4
+        pmaddwd   xmm7,[rel PW_MF060_MF089]     ; xmm7=tmp4L
+        pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4H
+        pmaddwd   xmm2,[rel PW_MF089_F060]      ; xmm2=tmp7L
+        pmaddwd   xmm1,[rel PW_MF089_F060]      ; xmm1=tmp7H
+
+        paddd   xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+        paddd   xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+        paddd   xmm2,xmm6               ; xmm2=data1L
+        paddd   xmm1,xmm0               ; xmm1=data1H
+
+        paddd   xmm7,[rel PD_DESCALE_P1]        ; round, then descale
+        paddd   xmm4,[rel PD_DESCALE_P1]
+        psrad   xmm7,DESCALE_P1
+        psrad   xmm4,DESCALE_P1
+        paddd   xmm2,[rel PD_DESCALE_P1]
+        paddd   xmm1,[rel PD_DESCALE_P1]
+        psrad   xmm2,DESCALE_P1
+        psrad   xmm1,DESCALE_P1
+
+        packssdw  xmm7,xmm4             ; xmm7=data7
+        packssdw  xmm2,xmm1             ; xmm2=data1
+
+        movdqa    xmm4,xmm5
+        movdqa    xmm1,xmm5
+        punpcklwd xmm4,xmm3
+        punpckhwd xmm1,xmm3             ; xmm4,xmm1=(tmp5,tmp6) pairs L/H
+        movdqa    xmm5,xmm4
+        movdqa    xmm3,xmm1
+        pmaddwd   xmm4,[rel PW_MF050_MF256]     ; xmm4=tmp5L
+        pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5H
+        pmaddwd   xmm5,[rel PW_MF256_F050]      ; xmm5=tmp6L
+        pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6H
+
+        paddd   xmm4,xmm6               ; xmm4=data5L
+        paddd   xmm1,xmm0               ; xmm1=data5H
+        paddd   xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+        paddd   xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+        paddd   xmm4,[rel PD_DESCALE_P1]        ; round, then descale
+        paddd   xmm1,[rel PD_DESCALE_P1]
+        psrad   xmm4,DESCALE_P1
+        psrad   xmm1,DESCALE_P1
+        paddd   xmm5,[rel PD_DESCALE_P1]
+        paddd   xmm3,[rel PD_DESCALE_P1]
+        psrad   xmm5,DESCALE_P1
+        psrad   xmm3,DESCALE_P1
+
+        packssdw  xmm4,xmm1             ; xmm4=data5
+        packssdw  xmm5,xmm3             ; xmm5=data3
+
+        ; ---- Pass 2: process columns.
+
+        movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+        movdqa  xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+        ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+        ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+        movdqa    xmm1,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm2             ; xmm6=(00 01 10 11 20 21 30 31)
+        punpckhwd xmm1,xmm2             ; xmm1=(40 41 50 51 60 61 70 71)
+        movdqa    xmm3,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm5             ; xmm0=(02 03 12 13 22 23 32 33)
+        punpckhwd xmm3,xmm5             ; xmm3=(42 43 52 53 62 63 72 73)
+
+        movdqa  xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+        movdqa  xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+        ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+        ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+        movdqa    xmm0,xmm2             ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm4             ; xmm2=(04 05 14 15 24 25 34 35)
+        punpckhwd xmm0,xmm4             ; xmm0=(44 45 54 55 64 65 74 75)
+        movdqa    xmm3,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm7             ; xmm5=(06 07 16 17 26 27 36 37)
+        punpckhwd xmm3,xmm7             ; xmm3=(46 47 56 57 66 67 76 77)
+
+        movdqa    xmm4,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm5             ; xmm2=(04 05 06 07 14 15 16 17)
+        punpckhdq xmm4,xmm5             ; xmm4=(24 25 26 27 34 35 36 37)
+        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm3             ; xmm0=(44 45 46 47 54 55 56 57)
+        punpckhdq xmm7,xmm3             ; xmm7=(64 65 66 67 74 75 76 77)
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+        movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+        movdqa  XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+        movdqa    xmm4,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm5             ; xmm6=(00 01 02 03 10 11 12 13)
+        punpckhdq xmm4,xmm5             ; xmm4=(20 21 22 23 30 31 32 33)
+        movdqa    xmm0,xmm1             ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm3             ; xmm1=(40 41 42 43 50 51 52 53)
+        punpckhdq xmm0,xmm3             ; xmm0=(60 61 62 63 70 71 72 73)
+
+        movdqa     xmm5,xmm6            ; transpose coefficients(phase 3)
+        punpcklqdq xmm6,xmm2            ; xmm6=(00 01 02 03 04 05 06 07)=data0
+        punpckhqdq xmm5,xmm2            ; xmm5=(10 11 12 13 14 15 16 17)=data1
+        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm7            ; xmm0=(60 61 62 63 64 65 66 67)=data6
+        punpckhqdq xmm3,xmm7            ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+        movdqa  xmm2,xmm5               ; xmm2=data1
+        movdqa  xmm7,xmm6               ; xmm7=data0
+        psubw   xmm5,xmm0               ; xmm5=data1-data6=tmp6
+        psubw   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+        paddw   xmm2,xmm0               ; xmm2=data1+data6=tmp1
+        paddw   xmm7,xmm3               ; xmm7=data0+data7=tmp0
+
+        movdqa  xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+        movdqa  xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+        movdqa     xmm5,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm0            ; xmm4=(20 21 22 23 24 25 26 27)=data2
+        punpckhqdq xmm5,xmm0            ; xmm5=(30 31 32 33 34 35 36 37)=data3
+        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm3            ; xmm1=(40 41 42 43 44 45 46 47)=data4
+        punpckhqdq xmm6,xmm3            ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+        movdqa  xmm0,xmm5               ; xmm0=data3
+        movdqa  xmm3,xmm4               ; xmm3=data2
+        paddw   xmm5,xmm1               ; xmm5=data3+data4=tmp3
+        paddw   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+        psubw   xmm0,xmm1               ; xmm0=data3-data4=tmp4
+        psubw   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm1,xmm7               ; xmm1=tmp0
+        movdqa  xmm6,xmm2               ; xmm6=tmp1
+        paddw   xmm7,xmm5               ; xmm7=tmp10
+        paddw   xmm2,xmm4               ; xmm2=tmp11
+        psubw   xmm1,xmm5               ; xmm1=tmp13
+        psubw   xmm6,xmm4               ; xmm6=tmp12
+
+        movdqa  xmm5,xmm7               ; xmm5=tmp10
+        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
+        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
+
+        paddw   xmm7,[rel PW_DESCALE_P2X]       ; round before >> PASS1_BITS
+        paddw   xmm5,[rel PW_DESCALE_P2X]
+        psraw   xmm7,PASS1_BITS         ; xmm7=data0
+        psraw   xmm5,PASS1_BITS         ; xmm5=data4
+
+        movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7        ; data0
+        movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5        ; data4
+
+        ; (Original)
+        ; z1 = (tmp12 + tmp13) * 0.541196100;
+        ; data2 = z1 + tmp13 * 0.765366865;
+        ; data6 = z1 + tmp12 * -1.847759065;
+        ;
+        ; (This implementation)
+        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+        movdqa    xmm4,xmm1             ; xmm1=tmp13
+        movdqa    xmm2,xmm1
+        punpcklwd xmm4,xmm6             ; xmm6=tmp12
+        punpckhwd xmm2,xmm6             ; xmm4,xmm2=(tmp13,tmp12) pairs L/H
+        movdqa    xmm1,xmm4             ; duplicate the pairs for data6
+        movdqa    xmm6,xmm2
+        pmaddwd   xmm4,[rel PW_F130_F054]       ; xmm4=data2L
+        pmaddwd   xmm2,[rel PW_F130_F054]       ; xmm2=data2H
+        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=data6L
+        pmaddwd   xmm6,[rel PW_F054_MF130]      ; xmm6=data6H
+
+        paddd   xmm4,[rel PD_DESCALE_P2]        ; round, then descale
+        paddd   xmm2,[rel PD_DESCALE_P2]
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm2,DESCALE_P2
+        paddd   xmm1,[rel PD_DESCALE_P2]
+        paddd   xmm6,[rel PD_DESCALE_P2]
+        psrad   xmm1,DESCALE_P2
+        psrad   xmm6,DESCALE_P2
+
+        packssdw  xmm4,xmm2             ; xmm4=data2
+        packssdw  xmm1,xmm6             ; xmm1=data6
+
+        movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4        ; data2
+        movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1        ; data6
+
+        ; -- Odd part
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+        movdqa  xmm2,xmm0               ; xmm0=tmp4
+        movdqa  xmm6,xmm3               ; xmm3=tmp5
+        paddw   xmm2,xmm7               ; xmm2=z3
+        paddw   xmm6,xmm5               ; xmm6=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm4,xmm2
+        movdqa    xmm1,xmm2
+        punpcklwd xmm4,xmm6
+        punpckhwd xmm1,xmm6             ; xmm4,xmm1=(z3,z4) pairs L/H
+        movdqa    xmm2,xmm4
+        movdqa    xmm6,xmm1
+        pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3L
+        pmaddwd   xmm1,[rel PW_MF078_F117]      ; xmm1=z3H
+        pmaddwd   xmm2,[rel PW_F117_F078]       ; xmm2=z4L
+        pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4H
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+        ; (Original)
+        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+        movdqa    xmm4,xmm0
+        movdqa    xmm1,xmm0
+        punpcklwd xmm4,xmm5
+        punpckhwd xmm1,xmm5             ; xmm4,xmm1=(tmp4,tmp7) pairs L/H
+        movdqa    xmm0,xmm4
+        movdqa    xmm5,xmm1
+        pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4L
+        pmaddwd   xmm1,[rel PW_MF060_MF089]     ; xmm1=tmp4H
+        pmaddwd   xmm0,[rel PW_MF089_F060]      ; xmm0=tmp7L
+        pmaddwd   xmm5,[rel PW_MF089_F060]      ; xmm5=tmp7H
+
+        paddd   xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
+        paddd   xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
+        paddd   xmm0,xmm2               ; xmm0=data1L
+        paddd   xmm5,xmm6               ; xmm5=data1H
+
+        paddd   xmm4,[rel PD_DESCALE_P2]        ; round, then descale
+        paddd   xmm1,[rel PD_DESCALE_P2]
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm1,DESCALE_P2
+        paddd   xmm0,[rel PD_DESCALE_P2]
+        paddd   xmm5,[rel PD_DESCALE_P2]
+        psrad   xmm0,DESCALE_P2
+        psrad   xmm5,DESCALE_P2
+
+        packssdw  xmm4,xmm1             ; xmm4=data7
+        packssdw  xmm0,xmm5             ; xmm0=data1
+
+        movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4        ; data7
+        movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0        ; data1
+
+        movdqa    xmm1,xmm3
+        movdqa    xmm5,xmm3
+        punpcklwd xmm1,xmm7
+        punpckhwd xmm5,xmm7             ; xmm1,xmm5=(tmp5,tmp6) pairs L/H
+        movdqa    xmm3,xmm1
+        movdqa    xmm7,xmm5
+        pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5L
+        pmaddwd   xmm5,[rel PW_MF050_MF256]     ; xmm5=tmp5H
+        pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6L
+        pmaddwd   xmm7,[rel PW_MF256_F050]      ; xmm7=tmp6H
+
+        paddd   xmm1,xmm2               ; xmm1=data5L
+        paddd   xmm5,xmm6               ; xmm5=data5H
+        paddd   xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+        paddd   xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+        paddd   xmm1,[rel PD_DESCALE_P2]        ; round, then descale
+        paddd   xmm5,[rel PD_DESCALE_P2]
+        psrad   xmm1,DESCALE_P2
+        psrad   xmm5,DESCALE_P2
+        paddd   xmm3,[rel PD_DESCALE_P2]
+        paddd   xmm7,[rel PD_DESCALE_P2]
+        psrad   xmm3,DESCALE_P2
+        psrad   xmm7,DESCALE_P2
+
+        packssdw  xmm1,xmm5             ; xmm1=data5
+        packssdw  xmm3,xmm7             ; xmm3=data3
+
+        movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1        ; data5
+        movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3        ; data3
+
+        uncollect_args  ; presumably undoes collect_args (macro defined elsewhere)
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- address of saved rbp (stored in prologue)
+        pop     rbp             ; restore caller's rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfdctint-sse2.asm b/simd/jfdctint-sse2.asm
new file mode 100644
index 0000000..6b42ce5
--- /dev/null
+++ b/simd/jfdctint-sse2.asm
@@ -0,0 +1,634 @@
+;
+; jfdctint-sse2.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13      ; fraction bits of the F_* fixed-point constants
+%define PASS1_BITS      2       ; extra scale (left shift) carried by pass 1 results
+
+%define DESCALE_P1      (CONST_BITS-PASS1_BITS)        ; right shift applied to pass 1 products
+%define DESCALE_P2      (CONST_BITS+PASS1_BITS)        ; right shift applied to pass 2 products
+
+%if CONST_BITS == 13
+F_0_298 equ      2446           ; FIX(0.298631336)
+F_0_390 equ      3196           ; FIX(0.390180644)
+F_0_541 equ      4433           ; FIX(0.541196100)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_175 equ      9633           ; FIX(1.175875602)
+F_1_501 equ     12299           ; FIX(1.501321110)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_1_961 equ     16069           ; FIX(1.961570560)
+F_2_053 equ     16819           ; FIX(2.053119869)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_072 equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is an integer scaled by 2^30 that DESCALE reduces to CONST_BITS.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))        ; add half of 2^n, then drop n bits (round to nearest)
+F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541  ; data2 = tmp13*a + tmp12*b
+PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)  ; data6 = tmp13*a + tmp12*b
+PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175  ; z3 = z3*a + z4*b
+PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)  ; z4 = z3*a + z4*b
+PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899  ; tmp4 = tmp4*a + tmp7*b
+PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)  ; tmp7 = tmp4*a + tmp7*b
+PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562  ; tmp5 = tmp5*a + tmp6*b
+PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)  ; tmp6 = tmp5*a + tmp6*b
+PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)         ; rounding term added before psrad DESCALE_P1
+PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)         ; rounding term added before psrad DESCALE_P2
+PW_DESCALE_P2X  times 8 dw  1 << (PASS1_BITS-1)         ; rounding term added before psraw PASS1_BITS
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform the forward DCT on one block of samples.
+; The block is read from *data and the result is written back to *data.
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM *data)
+; Pass 1 results stay in registers and wk[]; only pass 2 stores to *data.
+
+%define data(b)         (b)+8           ; DCTELEM *data (above saved ebp and return address)
+
+%define original_ebp    ebp+0           ; [ebp+0] holds the pre-alignment esp saved by the prologue
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          6               ; wk(0)..wk(5) are used below
+
+        align   16
+        global  EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = esp after the push (points at saved ebp)
+        sub     esp, byte 4                     ; room for the saved stack pointer
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; keep pre-alignment esp at [ebp+0]
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below aligned ebp
+        pushpic ebx
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+;       push    esi             ; unused
+;       push    edi             ; unused
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process rows.
+
+        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
+        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
+
+        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+        ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+        ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
+        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
+
+        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
+        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
+        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
+
+        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
+        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
+        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
+        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+        movdqa  xmm6,xmm1
+        movdqa  xmm3,xmm0
+        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
+        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
+        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
+        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
+
+        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+        movdqa  xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
+        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
+        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
+        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+        movdqa  xmm2,xmm1
+        movdqa  xmm5,xmm7
+        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
+        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
+        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
+        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm4,xmm3
+        movdqa  xmm0,xmm6
+        paddw   xmm3,xmm1               ; xmm3=tmp10
+        paddw   xmm6,xmm7               ; xmm6=tmp11
+        psubw   xmm4,xmm1               ; xmm4=tmp13
+        psubw   xmm0,xmm7               ; xmm0=tmp12
+
+        movdqa  xmm1,xmm3
+        paddw   xmm3,xmm6               ; xmm3=tmp10+tmp11
+        psubw   xmm1,xmm6               ; xmm1=tmp10-tmp11
+
+        psllw   xmm3,PASS1_BITS         ; xmm3=data0
+        psllw   xmm1,PASS1_BITS         ; xmm1=data4
+
+        movdqa  XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+        movdqa  XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+        ; (Original)
+        ; z1 = (tmp12 + tmp13) * 0.541196100;
+        ; data2 = z1 + tmp13 * 0.765366865;
+        ; data6 = z1 + tmp12 * -1.847759065;
+        ;
+        ; (This implementation)
+        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+        movdqa    xmm7,xmm4             ; xmm4=tmp13
+        movdqa    xmm6,xmm4
+        punpcklwd xmm7,xmm0             ; xmm0=tmp12
+        punpckhwd xmm6,xmm0
+        movdqa    xmm4,xmm7
+        movdqa    xmm0,xmm6
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]       ; xmm7=data2L
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]       ; xmm6=data2H
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm4=data6L
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm0=data6H
+
+        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   xmm7,DESCALE_P1
+        psrad   xmm6,DESCALE_P1
+        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   xmm4,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+
+        packssdw  xmm7,xmm6             ; xmm7=data2
+        packssdw  xmm4,xmm0             ; xmm4=data6
+
+        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+        ; -- Odd part
+
+        movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+        movdqa  xmm6,xmm2               ; xmm2=tmp4
+        movdqa  xmm0,xmm5               ; xmm5=tmp5
+        paddw   xmm6,xmm3               ; xmm6=z3
+        paddw   xmm0,xmm1               ; xmm0=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm7,xmm6
+        movdqa    xmm4,xmm6
+        punpcklwd xmm7,xmm0
+        punpckhwd xmm4,xmm0
+        movdqa    xmm6,xmm7
+        movdqa    xmm0,xmm4
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm7=z3L
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm4=z3H
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]       ; xmm6=z4L
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]       ; xmm0=z4H
+
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+        ; (Original)
+        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+        movdqa    xmm7,xmm2
+        movdqa    xmm4,xmm2
+        punpcklwd xmm7,xmm1
+        punpckhwd xmm4,xmm1
+        movdqa    xmm2,xmm7
+        movdqa    xmm1,xmm4
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm7=tmp4L
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm4=tmp4H
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm2=tmp7L
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm1=tmp7H
+
+        paddd   xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+        paddd   xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+        paddd   xmm2,xmm6               ; xmm2=data1L
+        paddd   xmm1,xmm0               ; xmm1=data1H
+
+        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   xmm7,DESCALE_P1
+        psrad   xmm4,DESCALE_P1
+        paddd   xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   xmm2,DESCALE_P1
+        psrad   xmm1,DESCALE_P1
+
+        packssdw  xmm7,xmm4             ; xmm7=data7
+        packssdw  xmm2,xmm1             ; xmm2=data1
+
+        movdqa    xmm4,xmm5
+        movdqa    xmm1,xmm5
+        punpcklwd xmm4,xmm3
+        punpckhwd xmm1,xmm3
+        movdqa    xmm5,xmm4
+        movdqa    xmm3,xmm1
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm4=tmp5L
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm1=tmp5H
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm5=tmp6L
+        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm3=tmp6H
+
+        paddd   xmm4,xmm6               ; xmm4=data5L
+        paddd   xmm1,xmm0               ; xmm1=data5H
+        paddd   xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+        paddd   xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   xmm4,DESCALE_P1
+        psrad   xmm1,DESCALE_P1
+        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+        paddd   xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+        psrad   xmm5,DESCALE_P1
+        psrad   xmm3,DESCALE_P1
+
+        packssdw  xmm4,xmm1             ; xmm4=data5
+        packssdw  xmm5,xmm3             ; xmm5=data3
+
+        ; ---- Pass 2: process columns.
+
+;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
+
+        movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+        movdqa  xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+        ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+        ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+        movdqa    xmm1,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm2             ; xmm6=(00 01 10 11 20 21 30 31)
+        punpckhwd xmm1,xmm2             ; xmm1=(40 41 50 51 60 61 70 71)
+        movdqa    xmm3,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm5             ; xmm0=(02 03 12 13 22 23 32 33)
+        punpckhwd xmm3,xmm5             ; xmm3=(42 43 52 53 62 63 72 73)
+
+        movdqa  xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+        movdqa  xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+        ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+        ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+        movdqa    xmm0,xmm2             ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm4             ; xmm2=(04 05 14 15 24 25 34 35)
+        punpckhwd xmm0,xmm4             ; xmm0=(44 45 54 55 64 65 74 75)
+        movdqa    xmm3,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm7             ; xmm5=(06 07 16 17 26 27 36 37)
+        punpckhwd xmm3,xmm7             ; xmm3=(46 47 56 57 66 67 76 77)
+
+        movdqa    xmm4,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm5             ; xmm2=(04 05 06 07 14 15 16 17)
+        punpckhdq xmm4,xmm5             ; xmm4=(24 25 26 27 34 35 36 37)
+        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm3             ; xmm0=(44 45 46 47 54 55 56 57)
+        punpckhdq xmm7,xmm3             ; xmm7=(64 65 66 67 74 75 76 77)
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+        movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+        movdqa  XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+        movdqa    xmm4,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm5             ; xmm6=(00 01 02 03 10 11 12 13)
+        punpckhdq xmm4,xmm5             ; xmm4=(20 21 22 23 30 31 32 33)
+        movdqa    xmm0,xmm1             ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm3             ; xmm1=(40 41 42 43 50 51 52 53)
+        punpckhdq xmm0,xmm3             ; xmm0=(60 61 62 63 70 71 72 73)
+
+        movdqa     xmm5,xmm6            ; transpose coefficients(phase 3)
+        punpcklqdq xmm6,xmm2            ; xmm6=(00 01 02 03 04 05 06 07)=data0
+        punpckhqdq xmm5,xmm2            ; xmm5=(10 11 12 13 14 15 16 17)=data1
+        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm7            ; xmm0=(60 61 62 63 64 65 66 67)=data6
+        punpckhqdq xmm3,xmm7            ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+        movdqa  xmm2,xmm5
+        movdqa  xmm7,xmm6
+        psubw   xmm5,xmm0               ; xmm5=data1-data6=tmp6
+        psubw   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+        paddw   xmm2,xmm0               ; xmm2=data1+data6=tmp1
+        paddw   xmm7,xmm3               ; xmm7=data0+data7=tmp0
+
+        movdqa  xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+        movdqa  xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+        movdqa     xmm5,xmm4            ; transpose coefficients(phase 3)
+        punpcklqdq xmm4,xmm0            ; xmm4=(20 21 22 23 24 25 26 27)=data2
+        punpckhqdq xmm5,xmm0            ; xmm5=(30 31 32 33 34 35 36 37)=data3
+        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm3            ; xmm1=(40 41 42 43 44 45 46 47)=data4
+        punpckhqdq xmm6,xmm3            ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+        movdqa  xmm0,xmm5
+        movdqa  xmm3,xmm4
+        paddw   xmm5,xmm1               ; xmm5=data3+data4=tmp3
+        paddw   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+        psubw   xmm0,xmm1               ; xmm0=data3-data4=tmp4
+        psubw   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+        ; -- Even part
+
+        movdqa  xmm1,xmm7
+        movdqa  xmm6,xmm2
+        paddw   xmm7,xmm5               ; xmm7=tmp10
+        paddw   xmm2,xmm4               ; xmm2=tmp11
+        psubw   xmm1,xmm5               ; xmm1=tmp13
+        psubw   xmm6,xmm4               ; xmm6=tmp12
+
+        movdqa  xmm5,xmm7
+        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
+        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
+
+        paddw   xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        paddw   xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+        psraw   xmm7,PASS1_BITS         ; xmm7=data0
+        psraw   xmm5,PASS1_BITS         ; xmm5=data4
+
+        movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+        movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+        ; (Original)
+        ; z1 = (tmp12 + tmp13) * 0.541196100;
+        ; data2 = z1 + tmp13 * 0.765366865;
+        ; data6 = z1 + tmp12 * -1.847759065;
+        ;
+        ; (This implementation)
+        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+        movdqa    xmm4,xmm1             ; xmm1=tmp13
+        movdqa    xmm2,xmm1
+        punpcklwd xmm4,xmm6             ; xmm6=tmp12
+        punpckhwd xmm2,xmm6
+        movdqa    xmm1,xmm4
+        movdqa    xmm6,xmm2
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]       ; xmm4=data2L
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]       ; xmm2=data2H
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=data6L
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm6=data6H
+
+        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm2,DESCALE_P2
+        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   xmm1,DESCALE_P2
+        psrad   xmm6,DESCALE_P2
+
+        packssdw  xmm4,xmm2             ; xmm4=data2
+        packssdw  xmm1,xmm6             ; xmm1=data6
+
+        movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+        movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+        ; -- Odd part
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+        movdqa  xmm2,xmm0               ; xmm0=tmp4
+        movdqa  xmm6,xmm3               ; xmm3=tmp5
+        paddw   xmm2,xmm7               ; xmm2=z3
+        paddw   xmm6,xmm5               ; xmm6=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm4,xmm2
+        movdqa    xmm1,xmm2
+        punpcklwd xmm4,xmm6
+        punpckhwd xmm1,xmm6
+        movdqa    xmm2,xmm4
+        movdqa    xmm6,xmm1
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm4=z3L
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm1=z3H
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]       ; xmm2=z4L
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]       ; xmm6=z4H
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+        ; (Original)
+        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+        movdqa    xmm4,xmm0
+        movdqa    xmm1,xmm0
+        punpcklwd xmm4,xmm5
+        punpckhwd xmm1,xmm5
+        movdqa    xmm0,xmm4
+        movdqa    xmm5,xmm1
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm4=tmp4L
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm1=tmp4H
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm0=tmp7L
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm5=tmp7H
+
+        paddd   xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
+        paddd   xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
+        paddd   xmm0,xmm2               ; xmm0=data1L
+        paddd   xmm5,xmm6               ; xmm5=data1H
+
+        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm1,DESCALE_P2
+        paddd   xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   xmm0,DESCALE_P2
+        psrad   xmm5,DESCALE_P2
+
+        packssdw  xmm4,xmm1             ; xmm4=data7
+        packssdw  xmm0,xmm5             ; xmm0=data1
+
+        movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+        movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+        movdqa    xmm1,xmm3
+        movdqa    xmm5,xmm3
+        punpcklwd xmm1,xmm7
+        punpckhwd xmm5,xmm7
+        movdqa    xmm3,xmm1
+        movdqa    xmm7,xmm5
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm1=tmp5L
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm5=tmp5H
+        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm3=tmp6L
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm7=tmp6H
+
+        paddd   xmm1,xmm2               ; xmm1=data5L
+        paddd   xmm5,xmm6               ; xmm5=data5H
+        paddd   xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+        paddd   xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   xmm1,DESCALE_P2
+        psrad   xmm5,DESCALE_P2
+        paddd   xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+        psrad   xmm3,DESCALE_P2
+        psrad   xmm7,DESCALE_P2
+
+        packssdw  xmm1,xmm5             ; xmm1=data5
+        packssdw  xmm3,xmm7             ; xmm3=data3
+
+        movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+        movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;       pop     edi             ; unused
+;       pop     esi             ; unused
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- pre-alignment esp (points at saved ebp)
+        pop     ebp             ; restore caller's ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jfmmxfst.asm b/simd/jfmmxfst.asm
deleted file mode 100644
index 146e8c3..0000000
--- a/simd/jfmmxfst.asm
+++ /dev/null
@@ -1,397 +0,0 @@
-;
-; jfmmxfst.asm - fast integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382	equ	 98		; FIX(0.382683433)
-F_0_541	equ	139		; FIX(0.541196100)
-F_0_707	equ	181		; FIX(0.707106781)
-F_1_306	equ	334		; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
-F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_fdct_ifast_mmx) PRIVATE
-
-EXTN(jconst_fdct_ifast_mmx):
-
-PW_F0707	times 4 dw  F_0_707 << CONST_SHIFT
-PW_F0382	times 4 dw  F_0_382 << CONST_SHIFT
-PW_F0541	times 4 dw  F_0_541 << CONST_SHIFT
-PW_F1306	times 4 dw  F_1_306 << CONST_SHIFT
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_mmx (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_ifast_mmx) PRIVATE
-
-EXTN(jsimd_fdct_ifast_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.rowloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(20 21 22 23), mm2=(24 25 26 27)
-	; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
-	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
-	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 01 02 03), mm1=(04 05 06 07)
-	; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
-	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
-	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
-	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
-	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
-	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
-	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	psubw	mm5,mm7			; mm5=tmp13
-	psubw	mm0,mm4			; mm0=tmp12
-	paddw	mm1,mm7			; mm1=tmp10
-	paddw	mm6,mm4			; mm6=tmp11
-
-	paddw	mm0,mm5
-	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-	movq	mm7,mm1
-	movq	mm4,mm5
-	psubw	mm1,mm6			; mm1=data4
-	psubw	mm5,mm0			; mm5=data6
-	paddw	mm7,mm6			; mm7=data0
-	paddw	mm4,mm0			; mm4=data2
-
-	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-	; -- Odd part
-
-	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
-	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
-
-	paddw	mm2,mm3			; mm2=tmp10
-	paddw	mm3,mm6			; mm3=tmp11
-	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
-
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-	movq	mm1,mm2			; mm1=tmp10
-	psubw	mm2,mm6
-	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-	paddw	mm1,mm2			; mm1=z2
-	paddw	mm6,mm2			; mm6=z4
-
-	movq	mm5,mm0
-	psubw	mm0,mm3			; mm0=z13
-	paddw	mm5,mm3			; mm5=z11
-
-	movq	mm7,mm0
-	movq	mm4,mm5
-	psubw	mm0,mm1			; mm0=data3
-	psubw	mm5,mm6			; mm5=data7
-	paddw	mm7,mm1			; mm7=data5
-	paddw	mm4,mm6			; mm4=data1
-
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(02 12 22 32), mm2=(42 52 62 72)
-	; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
-	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
-	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 10 20 30), mm1=(40 50 60 70)
-	; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
-	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
-	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
-	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
-	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
-	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
-	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	psubw	mm5,mm7			; mm5=tmp13
-	psubw	mm0,mm4			; mm0=tmp12
-	paddw	mm1,mm7			; mm1=tmp10
-	paddw	mm6,mm4			; mm6=tmp11
-
-	paddw	mm0,mm5
-	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-	movq	mm7,mm1
-	movq	mm4,mm5
-	psubw	mm1,mm6			; mm1=data4
-	psubw	mm5,mm0			; mm5=data6
-	paddw	mm7,mm6			; mm7=data0
-	paddw	mm4,mm0			; mm4=data2
-
-	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-	; -- Odd part
-
-	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
-	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
-
-	paddw	mm2,mm3			; mm2=tmp10
-	paddw	mm3,mm6			; mm3=tmp11
-	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
-
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-	movq	mm1,mm2			; mm1=tmp10
-	psubw	mm2,mm6
-	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-	paddw	mm1,mm2			; mm1=z2
-	paddw	mm6,mm2			; mm6=z4
-
-	movq	mm5,mm0
-	psubw	mm0,mm3			; mm0=z13
-	paddw	mm5,mm3			; mm5=z11
-
-	movq	mm7,mm0
-	movq	mm4,mm5
-	psubw	mm0,mm1			; mm0=data3
-	psubw	mm5,mm6			; mm5=data7
-	paddw	mm7,mm1			; mm7=data5
-	paddw	mm4,mm6			; mm4=data1
-
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-	add	edx, byte 4*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .columnloop
-
-	emms		; empty MMX state
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfmmxint.asm b/simd/jfmmxint.asm
deleted file mode 100644
index e5593f8..0000000
--- a/simd/jfmmxint.asm
+++ /dev/null
@@ -1,622 +0,0 @@
-;
-; jfmmxint.asm - accurate integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_islow_mmx) PRIVATE
-
-EXTN(jconst_fdct_islow_mmx):
-
-PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X	times 4 dw  1 << (PASS1_BITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_mmx (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_islow_mmx) PRIVATE
-
-EXTN(jsimd_fdct_islow_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.rowloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(20 21 22 23), mm2=(24 25 26 27)
-	; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
-	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
-	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 01 02 03), mm1=(04 05 06 07)
-	; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
-	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
-	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
-	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
-	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
-	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
-	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	paddw	mm5,mm7			; mm5=tmp10
-	paddw	mm0,mm4			; mm0=tmp11
-	psubw	mm1,mm7			; mm1=tmp13
-	psubw	mm6,mm4			; mm6=tmp12
-
-	movq	mm7,mm5
-	paddw	mm5,mm0			; mm5=tmp10+tmp11
-	psubw	mm7,mm0			; mm7=tmp10-tmp11
-
-	psllw	mm5,PASS1_BITS		; mm5=data0
-	psllw	mm7,PASS1_BITS		; mm7=data4
-
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movq      mm4,mm1		; mm1=tmp13
-	movq      mm0,mm1
-	punpcklwd mm4,mm6		; mm6=tmp12
-	punpckhwd mm0,mm6
-	movq      mm1,mm4
-	movq      mm6,mm0
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm4,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm1,DESCALE_P1
-	psrad	mm6,DESCALE_P1
-
-	packssdw  mm4,mm0		; mm4=data2
-	packssdw  mm1,mm6		; mm1=data6
-
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
-
-	; -- Odd part
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
-
-	movq	mm0,mm2			; mm2=tmp4
-	movq	mm6,mm3			; mm3=tmp5
-	paddw	mm0,mm5			; mm0=z3
-	paddw	mm6,mm7			; mm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm4,mm0
-	movq      mm1,mm0
-	punpcklwd mm4,mm6
-	punpckhwd mm1,mm6
-	movq      mm0,mm4
-	movq      mm6,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
-	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movq      mm4,mm2
-	movq      mm1,mm2
-	punpcklwd mm4,mm7
-	punpckhwd mm1,mm7
-	movq      mm2,mm4
-	movq      mm7,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
-
-	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
-	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
-	paddd	mm2,mm0			; mm2=data1L
-	paddd	mm7,mm6			; mm7=data1H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm4,DESCALE_P1
-	psrad	mm1,DESCALE_P1
-	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm2,DESCALE_P1
-	psrad	mm7,DESCALE_P1
-
-	packssdw  mm4,mm1		; mm4=data7
-	packssdw  mm2,mm7		; mm2=data1
-
-	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-	movq      mm1,mm3
-	movq      mm7,mm3
-	punpcklwd mm1,mm5
-	punpckhwd mm7,mm5
-	movq      mm3,mm1
-	movq      mm5,mm7
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
-
-	paddd	mm1,mm0			; mm1=data5L
-	paddd	mm7,mm6			; mm7=data5H
-	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
-	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
-
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm1,DESCALE_P1
-	psrad	mm7,DESCALE_P1
-	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	mm3,DESCALE_P1
-	psrad	mm5,DESCALE_P1
-
-	packssdw  mm1,mm7		; mm1=data5
-	packssdw  mm3,mm5		; mm3=data3
-
-	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.columnloop:
-
-	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; mm0=(02 12 22 32), mm2=(42 52 62 72)
-	; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-	movq      mm4,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
-	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
-	movq      mm5,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
-	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
-
-	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-	; mm6=(00 10 20 30), mm1=(40 50 60 70)
-	; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
-	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
-
-	movq      mm4,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
-	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
-	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
-
-	movq      mm7,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
-	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
-	movq      mm3,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
-	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
-
-	movq	mm0,mm7
-	movq	mm5,mm6
-	psubw	mm7,mm2			; mm7=data1-data6=tmp6
-	psubw	mm6,mm3			; mm6=data0-data7=tmp7
-	paddw	mm0,mm2			; mm0=data1+data6=tmp1
-	paddw	mm5,mm3			; mm5=data0+data7=tmp0
-
-	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
-	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
-	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
-
-	movq      mm7,mm4		; transpose coefficients(phase 2)
-	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
-	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
-	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
-
-	movq	mm2,mm7
-	movq	mm3,mm4
-	paddw	mm7,mm1			; mm7=data3+data4=tmp3
-	paddw	mm4,mm6			; mm4=data2+data5=tmp2
-	psubw	mm2,mm1			; mm2=data3-data4=tmp4
-	psubw	mm3,mm6			; mm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movq	mm1,mm5
-	movq	mm6,mm0
-	paddw	mm5,mm7			; mm5=tmp10
-	paddw	mm0,mm4			; mm0=tmp11
-	psubw	mm1,mm7			; mm1=tmp13
-	psubw	mm6,mm4			; mm6=tmp12
-
-	movq	mm7,mm5
-	paddw	mm5,mm0			; mm5=tmp10+tmp11
-	psubw	mm7,mm0			; mm7=tmp10-tmp11
-
-	paddw	mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	paddw	mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	psraw	mm5,PASS1_BITS		; mm5=data0
-	psraw	mm7,PASS1_BITS		; mm7=data4
-
-	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movq      mm4,mm1		; mm1=tmp13
-	movq      mm0,mm1
-	punpcklwd mm4,mm6		; mm6=tmp12
-	punpckhwd mm0,mm6
-	movq      mm1,mm4
-	movq      mm6,mm0
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm4,DESCALE_P2
-	psrad	mm0,DESCALE_P2
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm1,DESCALE_P2
-	psrad	mm6,DESCALE_P2
-
-	packssdw  mm4,mm0		; mm4=data2
-	packssdw  mm1,mm6		; mm1=data6
-
-	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
-
-	; -- Odd part
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
-
-	movq	mm0,mm2			; mm2=tmp4
-	movq	mm6,mm3			; mm3=tmp5
-	paddw	mm0,mm5			; mm0=z3
-	paddw	mm6,mm7			; mm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm4,mm0
-	movq      mm1,mm0
-	punpcklwd mm4,mm6
-	punpckhwd mm1,mm6
-	movq      mm0,mm4
-	movq      mm6,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
-
-	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
-	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movq      mm4,mm2
-	movq      mm1,mm2
-	punpcklwd mm4,mm7
-	punpckhwd mm1,mm7
-	movq      mm2,mm4
-	movq      mm7,mm1
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
-
-	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
-	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
-	paddd	mm2,mm0			; mm2=data1L
-	paddd	mm7,mm6			; mm7=data1H
-
-	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm4,DESCALE_P2
-	psrad	mm1,DESCALE_P2
-	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm2,DESCALE_P2
-	psrad	mm7,DESCALE_P2
-
-	packssdw  mm4,mm1		; mm4=data7
-	packssdw  mm2,mm7		; mm2=data1
-
-	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
-	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-	movq      mm1,mm3
-	movq      mm7,mm3
-	punpcklwd mm1,mm5
-	punpckhwd mm7,mm5
-	movq      mm3,mm1
-	movq      mm5,mm7
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
-
-	paddd	mm1,mm0			; mm1=data5L
-	paddd	mm7,mm6			; mm7=data5H
-	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
-	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
-
-	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm1,DESCALE_P2
-	psrad	mm7,DESCALE_P2
-	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	mm3,DESCALE_P2
-	psrad	mm5,DESCALE_P2
-
-	packssdw  mm1,mm7		; mm1=data5
-	packssdw  mm3,mm5		; mm3=data3
-
-	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
-	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-	add	edx, byte 4*SIZEOF_DCTELEM
-	dec	ecx
-	jnz	near .columnloop
-
-	emms		; empty MMX state
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2fst-64.asm b/simd/jfss2fst-64.asm
deleted file mode 100644
index 16a62f2..0000000
--- a/simd/jfss2fst-64.asm
+++ /dev/null
@@ -1,392 +0,0 @@
-;
-; jfss2fst-64.asm - fast integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382	equ	 98		; FIX(0.382683433)
-F_0_541	equ	139		; FIX(0.541196100)
-F_0_707	equ	181		; FIX(0.707106781)
-F_1_306	equ	334		; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
-F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM * data)
-;
-
-; r10 = DCTELEM * data
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_fdct_ifast_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process rows.
-
-	mov	rdx, r10	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	psubw	xmm3,xmm1		; xmm3=tmp13
-	psubw	xmm6,xmm7		; xmm6=tmp12
-	paddw	xmm4,xmm1		; xmm4=tmp10
-	paddw	xmm0,xmm7		; xmm0=tmp11
-
-	paddw	xmm6,xmm3
-	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm6,[rel PW_F0707] ; xmm6=z1
-
-	movdqa	xmm1,xmm4
-	movdqa	xmm7,xmm3
-	psubw	xmm4,xmm0		; xmm4=data4
-	psubw	xmm3,xmm6		; xmm3=data6
-	paddw	xmm1,xmm0		; xmm1=data0
-	paddw	xmm7,xmm6		; xmm7=data2
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
-
-	; -- Odd part
-
-	paddw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm5,xmm0		; xmm5=tmp11
-	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z3
-
-	movdqa	xmm4,xmm2		; xmm4=tmp10
-	psubw	xmm2,xmm0
-	pmulhw	xmm2,[rel PW_F0382] ; xmm2=z5
-	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm2		; xmm4=z2
-	paddw	xmm0,xmm2		; xmm0=z4
-
-	movdqa	xmm3,xmm6
-	psubw	xmm6,xmm5		; xmm6=z13
-	paddw	xmm3,xmm5		; xmm3=z11
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm5,xmm3
-	psubw	xmm6,xmm4		; xmm6=data3
-	psubw	xmm3,xmm0		; xmm3=data7
-	paddw	xmm2,xmm4		; xmm2=data5
-	paddw	xmm5,xmm0		; xmm5=data1
-
-	; ---- Pass 2: process columns.
-
-	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
-	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
-
-	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
-	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
-	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
-	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
-	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm3,xmm1
-	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
-	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
-	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
-	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
-
-	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm7,xmm6
-	movdqa	xmm0,xmm2
-	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
-	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
-	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
-	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm1,xmm5
-	psubw	xmm3,xmm6		; xmm3=tmp13
-	psubw	xmm5,xmm2		; xmm5=tmp12
-	paddw	xmm4,xmm6		; xmm4=tmp10
-	paddw	xmm1,xmm2		; xmm1=tmp11
-
-	paddw	xmm5,xmm3
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z1
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm2,xmm3
-	psubw	xmm4,xmm1		; xmm4=data4
-	psubw	xmm3,xmm5		; xmm3=data6
-	paddw	xmm6,xmm1		; xmm6=data0
-	paddw	xmm2,xmm5		; xmm2=data2
-
-	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-	; -- Odd part
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	paddw	xmm7,xmm0		; xmm7=tmp10
-	paddw	xmm0,xmm1		; xmm0=tmp11
-	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
-
-	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm0,[rel PW_F0707] ; xmm0=z3
-
-	movdqa	xmm4,xmm7		; xmm4=tmp10
-	psubw	xmm7,xmm1
-	pmulhw	xmm7,[rel PW_F0382] ; xmm7=z5
-	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm7		; xmm4=z2
-	paddw	xmm1,xmm7		; xmm1=z4
-
-	movdqa	xmm3,xmm5
-	psubw	xmm5,xmm0		; xmm5=z13
-	paddw	xmm3,xmm0		; xmm3=z11
-
-	movdqa	xmm6,xmm5
-	movdqa	xmm2,xmm3
-	psubw	xmm5,xmm4		; xmm5=data3
-	psubw	xmm3,xmm1		; xmm3=data7
-	paddw	xmm6,xmm4		; xmm6=data5
-	paddw	xmm2,xmm1		; xmm2=data1
-
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
-	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2fst.asm b/simd/jfss2fst.asm
deleted file mode 100644
index 3232db5..0000000
--- a/simd/jfss2fst.asm
+++ /dev/null
@@ -1,404 +0,0 @@
-;
-; jfss2fst.asm - fast integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382	equ	 98		; FIX(0.382683433)
-F_0_541	equ	139		; FIX(0.541196100)
-F_0_707	equ	181		; FIX(0.707106781)
-F_1_306	equ	334		; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
-F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_fdct_ifast_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	psubw	xmm3,xmm1		; xmm3=tmp13
-	psubw	xmm6,xmm7		; xmm6=tmp12
-	paddw	xmm4,xmm1		; xmm4=tmp10
-	paddw	xmm0,xmm7		; xmm0=tmp11
-
-	paddw	xmm6,xmm3
-	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
-
-	movdqa	xmm1,xmm4
-	movdqa	xmm7,xmm3
-	psubw	xmm4,xmm0		; xmm4=data4
-	psubw	xmm3,xmm6		; xmm3=data6
-	paddw	xmm1,xmm0		; xmm1=data0
-	paddw	xmm7,xmm6		; xmm7=data2
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
-
-	; -- Odd part
-
-	paddw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm5,xmm0		; xmm5=tmp11
-	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
-
-	movdqa	xmm4,xmm2		; xmm4=tmp10
-	psubw	xmm2,xmm0
-	pmulhw	xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm2		; xmm4=z2
-	paddw	xmm0,xmm2		; xmm0=z4
-
-	movdqa	xmm3,xmm6
-	psubw	xmm6,xmm5		; xmm6=z13
-	paddw	xmm3,xmm5		; xmm3=z11
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm5,xmm3
-	psubw	xmm6,xmm4		; xmm6=data3
-	psubw	xmm3,xmm0		; xmm3=data7
-	paddw	xmm2,xmm4		; xmm2=data5
-	paddw	xmm5,xmm0		; xmm5=data1
-
-	; ---- Pass 2: process columns.
-
-;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
-	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
-
-	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
-	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
-	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
-	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
-	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm3,xmm1
-	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
-	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
-	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
-	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
-	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
-
-	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm7,xmm6
-	movdqa	xmm0,xmm2
-	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
-	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
-	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
-	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm1,xmm5
-	psubw	xmm3,xmm6		; xmm3=tmp13
-	psubw	xmm5,xmm2		; xmm5=tmp12
-	paddw	xmm4,xmm6		; xmm4=tmp10
-	paddw	xmm1,xmm2		; xmm1=tmp11
-
-	paddw	xmm5,xmm3
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm2,xmm3
-	psubw	xmm4,xmm1		; xmm4=data4
-	psubw	xmm3,xmm5		; xmm3=data6
-	paddw	xmm6,xmm1		; xmm6=data0
-	paddw	xmm2,xmm5		; xmm2=data2
-
-	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
-
-	; -- Odd part
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	paddw	xmm7,xmm0		; xmm7=tmp10
-	paddw	xmm0,xmm1		; xmm0=tmp11
-	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
-
-	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
-
-	movdqa	xmm4,xmm7		; xmm4=tmp10
-	psubw	xmm7,xmm1
-	pmulhw	xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-	pmulhw	xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-	paddw	xmm4,xmm7		; xmm4=z2
-	paddw	xmm1,xmm7		; xmm1=z4
-
-	movdqa	xmm3,xmm5
-	psubw	xmm5,xmm0		; xmm5=z13
-	paddw	xmm3,xmm0		; xmm3=z11
-
-	movdqa	xmm6,xmm5
-	movdqa	xmm2,xmm3
-	psubw	xmm5,xmm4		; xmm5=data3
-	psubw	xmm3,xmm1		; xmm3=data7
-	paddw	xmm6,xmm4		; xmm6=data5
-	paddw	xmm2,xmm1		; xmm2=data1
-
-	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
-	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
-	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
-	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2int-64.asm b/simd/jfss2int-64.asm
deleted file mode 100644
index 0b710f2..0000000
--- a/simd/jfss2int-64.asm
+++ /dev/null
@@ -1,622 +0,0 @@
-;
-; jfss2int-64.asm - accurate integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM * data)
-;
-
-; r10 = DCTELEM * data
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		6
-
-	align	16
-	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
-
-EXTN(jsimd_fdct_islow_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process rows.
-
-	mov	rdx, r10	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	paddw	xmm3,xmm1		; xmm3=tmp10
-	paddw	xmm6,xmm7		; xmm6=tmp11
-	psubw	xmm4,xmm1		; xmm4=tmp13
-	psubw	xmm0,xmm7		; xmm0=tmp12
-
-	movdqa	xmm1,xmm3
-	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
-	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
-
-	psllw	xmm3,PASS1_BITS		; xmm3=data0
-	psllw	xmm1,PASS1_BITS		; xmm1=data4
-
-	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
-	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm7,xmm4		; xmm4=tmp13
-	movdqa    xmm6,xmm4
-	punpcklwd xmm7,xmm0		; xmm0=tmp12
-	punpckhwd xmm6,xmm0
-	movdqa    xmm4,xmm7
-	movdqa    xmm0,xmm6
-	pmaddwd   xmm7,[rel PW_F130_F054]	; xmm7=data2L
-	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=data2H
-	pmaddwd   xmm4,[rel PW_F054_MF130]	; xmm4=data6L
-	pmaddwd   xmm0,[rel PW_F054_MF130]	; xmm0=data6H
-
-	paddd	xmm7,[rel PD_DESCALE_P1]
-	paddd	xmm6,[rel PD_DESCALE_P1]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-	paddd	xmm4,[rel PD_DESCALE_P1]
-	paddd	xmm0,[rel PD_DESCALE_P1]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm7,xmm6		; xmm7=data2
-	packssdw  xmm4,xmm0		; xmm4=data6
-
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
-
-	; -- Odd part
-
-	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
-
-	movdqa	xmm6,xmm2		; xmm2=tmp4
-	movdqa	xmm0,xmm5		; xmm5=tmp5
-	paddw	xmm6,xmm3		; xmm6=z3
-	paddw	xmm0,xmm1		; xmm0=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm7,xmm6
-	movdqa    xmm4,xmm6
-	punpcklwd xmm7,xmm0
-	punpckhwd xmm4,xmm0
-	movdqa    xmm6,xmm7
-	movdqa    xmm0,xmm4
-	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3L
-	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3H
-	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4L
-	pmaddwd   xmm0,[rel PW_F117_F078]	; xmm0=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm7,xmm2
-	movdqa    xmm4,xmm2
-	punpcklwd xmm7,xmm1
-	punpckhwd xmm4,xmm1
-	movdqa    xmm2,xmm7
-	movdqa    xmm1,xmm4
-	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp4L
-	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4H
-	pmaddwd   xmm2,[rel PW_MF089_F060]	; xmm2=tmp7L
-	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp7H
-
-	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
-	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
-	paddd	xmm2,xmm6		; xmm2=data1L
-	paddd	xmm1,xmm0		; xmm1=data1H
-
-	paddd	xmm7,[rel PD_DESCALE_P1]
-	paddd	xmm4,[rel PD_DESCALE_P1]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm2,[rel PD_DESCALE_P1]
-	paddd	xmm1,[rel PD_DESCALE_P1]
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-
-	packssdw  xmm7,xmm4		; xmm7=data7
-	packssdw  xmm2,xmm1		; xmm2=data1
-
-	movdqa    xmm4,xmm5
-	movdqa    xmm1,xmm5
-	punpcklwd xmm4,xmm3
-	punpckhwd xmm1,xmm3
-	movdqa    xmm5,xmm4
-	movdqa    xmm3,xmm1
-	pmaddwd   xmm4,[rel PW_MF050_MF256]	; xmm4=tmp5L
-	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5H
-	pmaddwd   xmm5,[rel PW_MF256_F050]	; xmm5=tmp6L
-	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6H
-
-	paddd	xmm4,xmm6		; xmm4=data5L
-	paddd	xmm1,xmm0		; xmm1=data5H
-	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
-	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
-
-	paddd	xmm4,[rel PD_DESCALE_P1]
-	paddd	xmm1,[rel PD_DESCALE_P1]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-	paddd	xmm5,[rel PD_DESCALE_P1]
-	paddd	xmm3,[rel PD_DESCALE_P1]
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-
-	packssdw  xmm4,xmm1		; xmm4=data5
-	packssdw  xmm5,xmm3		; xmm5=data3
-
-	; ---- Pass 2: process columns.
-
-	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
-	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
-
-	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
-	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
-	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
-	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
-
-	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
-	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
-	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm7,xmm6
-	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
-	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
-	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
-
-	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
-	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm3,xmm4
-	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
-	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
-	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm6,xmm2
-	paddw	xmm7,xmm5		; xmm7=tmp10
-	paddw	xmm2,xmm4		; xmm2=tmp11
-	psubw	xmm1,xmm5		; xmm1=tmp13
-	psubw	xmm6,xmm4		; xmm6=tmp12
-
-	movdqa	xmm5,xmm7
-	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
-	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
-
-	paddw	xmm7,[rel PW_DESCALE_P2X]
-	paddw	xmm5,[rel PW_DESCALE_P2X]
-	psraw	xmm7,PASS1_BITS		; xmm7=data0
-	psraw	xmm5,PASS1_BITS		; xmm5=data4
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
-	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm4,xmm1		; xmm1=tmp13
-	movdqa    xmm2,xmm1
-	punpcklwd xmm4,xmm6		; xmm6=tmp12
-	punpckhwd xmm2,xmm6
-	movdqa    xmm1,xmm4
-	movdqa    xmm6,xmm2
-	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=data2L
-	pmaddwd   xmm2,[rel PW_F130_F054]	; xmm2=data2H
-	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=data6L
-	pmaddwd   xmm6,[rel PW_F054_MF130]	; xmm6=data6H
-
-	paddd	xmm4,[rel PD_DESCALE_P2]
-	paddd	xmm2,[rel PD_DESCALE_P2]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm1,[rel PD_DESCALE_P2]
-	paddd	xmm6,[rel PD_DESCALE_P2]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm6,DESCALE_P2
-
-	packssdw  xmm4,xmm2		; xmm4=data2
-	packssdw  xmm1,xmm6		; xmm1=data6
-
-	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
-
-	; -- Odd part
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	movdqa	xmm2,xmm0		; xmm0=tmp4
-	movdqa	xmm6,xmm3		; xmm3=tmp5
-	paddw	xmm2,xmm7		; xmm2=z3
-	paddw	xmm6,xmm5		; xmm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm1,xmm2
-	punpcklwd xmm4,xmm6
-	punpckhwd xmm1,xmm6
-	movdqa    xmm2,xmm4
-	movdqa    xmm6,xmm1
-	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3L
-	pmaddwd   xmm1,[rel PW_MF078_F117]	; xmm1=z3H
-	pmaddwd   xmm2,[rel PW_F117_F078]	; xmm2=z4L
-	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm1,xmm0
-	punpcklwd xmm4,xmm5
-	punpckhwd xmm1,xmm5
-	movdqa    xmm0,xmm4
-	movdqa    xmm5,xmm1
-	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4L
-	pmaddwd   xmm1,[rel PW_MF060_MF089]	; xmm1=tmp4H
-	pmaddwd   xmm0,[rel PW_MF089_F060]	; xmm0=tmp7L
-	pmaddwd   xmm5,[rel PW_MF089_F060]	; xmm5=tmp7H
-
-	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
-	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
-	paddd	xmm0,xmm2		; xmm0=data1L
-	paddd	xmm5,xmm6		; xmm5=data1H
-
-	paddd	xmm4,[rel PD_DESCALE_P2]
-	paddd	xmm1,[rel PD_DESCALE_P2]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm0,[rel PD_DESCALE_P2]
-	paddd	xmm5,[rel PD_DESCALE_P2]
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-
-	packssdw  xmm4,xmm1		; xmm4=data7
-	packssdw  xmm0,xmm5		; xmm0=data1
-
-	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
-
-	movdqa    xmm1,xmm3
-	movdqa    xmm5,xmm3
-	punpcklwd xmm1,xmm7
-	punpckhwd xmm5,xmm7
-	movdqa    xmm3,xmm1
-	movdqa    xmm7,xmm5
-	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5L
-	pmaddwd   xmm5,[rel PW_MF050_MF256]	; xmm5=tmp5H
-	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6L
-	pmaddwd   xmm7,[rel PW_MF256_F050]	; xmm7=tmp6H
-
-	paddd	xmm1,xmm2		; xmm1=data5L
-	paddd	xmm5,xmm6		; xmm5=data5H
-	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
-	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
-
-	paddd	xmm1,[rel PD_DESCALE_P2]
-	paddd	xmm5,[rel PD_DESCALE_P2]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-	paddd	xmm3,[rel PD_DESCALE_P2]
-	paddd	xmm7,[rel PD_DESCALE_P2]
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm1,xmm5		; xmm1=data5
-	packssdw  xmm3,xmm7		; xmm3=data3
-
-	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfss2int.asm b/simd/jfss2int.asm
deleted file mode 100644
index 1f73163..0000000
--- a/simd/jfss2int.asm
+++ /dev/null
@@ -1,634 +0,0 @@
-;
-; jfss2int.asm - accurate integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM * data)
-;
-
-%define data(b)		(b)+8		; DCTELEM * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		6
-
-	align	16
-	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
-
-EXTN(jsimd_fdct_islow_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
-	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
-	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
-
-	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
-	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
-	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
-
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
-	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
-	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
-	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
-	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-	movdqa	xmm6,xmm1
-	movdqa	xmm3,xmm0
-	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
-	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
-	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
-	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
-
-	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
-	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
-
-	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
-	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
-	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
-	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-	movdqa	xmm2,xmm1
-	movdqa	xmm5,xmm7
-	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
-	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
-	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
-	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm0,xmm6
-	paddw	xmm3,xmm1		; xmm3=tmp10
-	paddw	xmm6,xmm7		; xmm6=tmp11
-	psubw	xmm4,xmm1		; xmm4=tmp13
-	psubw	xmm0,xmm7		; xmm0=tmp12
-
-	movdqa	xmm1,xmm3
-	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
-	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
-
-	psllw	xmm3,PASS1_BITS		; xmm3=data0
-	psllw	xmm1,PASS1_BITS		; xmm1=data4
-
-	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
-	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm7,xmm4		; xmm4=tmp13
-	movdqa    xmm6,xmm4
-	punpcklwd xmm7,xmm0		; xmm0=tmp12
-	punpckhwd xmm6,xmm0
-	movdqa    xmm4,xmm7
-	movdqa    xmm0,xmm6
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]	; xmm7=data2L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=data2H
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]	; xmm4=data6L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]	; xmm0=data6H
-
-	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm7,xmm6		; xmm7=data2
-	packssdw  xmm4,xmm0		; xmm4=data6
-
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
-
-	; -- Odd part
-
-	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
-
-	movdqa	xmm6,xmm2		; xmm2=tmp4
-	movdqa	xmm0,xmm5		; xmm5=tmp5
-	paddw	xmm6,xmm3		; xmm6=z3
-	paddw	xmm0,xmm1		; xmm0=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm7,xmm6
-	movdqa    xmm4,xmm6
-	punpcklwd xmm7,xmm0
-	punpckhwd xmm4,xmm0
-	movdqa    xmm6,xmm7
-	movdqa    xmm0,xmm4
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3H
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]	; xmm0=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm7,xmm2
-	movdqa    xmm4,xmm2
-	punpcklwd xmm7,xmm1
-	punpckhwd xmm4,xmm1
-	movdqa    xmm2,xmm7
-	movdqa    xmm1,xmm4
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp4L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4H
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]	; xmm2=tmp7L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp7H
-
-	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
-	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
-	paddd	xmm2,xmm6		; xmm2=data1L
-	paddd	xmm1,xmm0		; xmm1=data1H
-
-	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-
-	packssdw  xmm7,xmm4		; xmm7=data7
-	packssdw  xmm2,xmm1		; xmm2=data1
-
-	movdqa    xmm4,xmm5
-	movdqa    xmm1,xmm5
-	punpcklwd xmm4,xmm3
-	punpckhwd xmm1,xmm3
-	movdqa    xmm5,xmm4
-	movdqa    xmm3,xmm1
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm4=tmp5L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5H
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]	; xmm5=tmp6L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6H
-
-	paddd	xmm4,xmm6		; xmm4=data5L
-	paddd	xmm1,xmm0		; xmm1=data5H
-	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
-	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
-
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm1,DESCALE_P1
-	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-
-	packssdw  xmm4,xmm1		; xmm4=data5
-	packssdw  xmm5,xmm3		; xmm5=data3
-
-	; ---- Pass 2: process columns.
-
-;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
-
-	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
-	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
-
-	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
-	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
-	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
-	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
-
-	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
-	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
-
-	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
-
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
-	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
-	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
-	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
-
-	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
-	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
-	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
-	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
-	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
-	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
-
-	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
-	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
-	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
-
-	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
-	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
-	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm7,xmm6
-	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
-	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
-	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
-
-	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
-	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
-	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
-	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
-	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
-	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm3,xmm4
-	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
-	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
-	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movdqa	xmm1,xmm7
-	movdqa	xmm6,xmm2
-	paddw	xmm7,xmm5		; xmm7=tmp10
-	paddw	xmm2,xmm4		; xmm2=tmp11
-	psubw	xmm1,xmm5		; xmm1=tmp13
-	psubw	xmm6,xmm4		; xmm6=tmp12
-
-	movdqa	xmm5,xmm7
-	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
-	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
-
-	paddw	xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	paddw	xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-	psraw	xmm7,PASS1_BITS		; xmm7=data0
-	psraw	xmm5,PASS1_BITS		; xmm5=data4
-
-	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
-	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
-
-	; (Original)
-	; z1 = (tmp12 + tmp13) * 0.541196100;
-	; data2 = z1 + tmp13 * 0.765366865;
-	; data6 = z1 + tmp12 * -1.847759065;
-	;
-	; (This implementation)
-	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-	movdqa    xmm4,xmm1		; xmm1=tmp13
-	movdqa    xmm2,xmm1
-	punpcklwd xmm4,xmm6		; xmm6=tmp12
-	punpckhwd xmm2,xmm6
-	movdqa    xmm1,xmm4
-	movdqa    xmm6,xmm2
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=data2L
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]	; xmm2=data2H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=data6L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]	; xmm6=data6H
-
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm6,DESCALE_P2
-
-	packssdw  xmm4,xmm2		; xmm4=data2
-	packssdw  xmm1,xmm6		; xmm1=data6
-
-	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
-
-	; -- Odd part
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
-
-	movdqa	xmm2,xmm0		; xmm0=tmp4
-	movdqa	xmm6,xmm3		; xmm3=tmp5
-	paddw	xmm2,xmm7		; xmm2=z3
-	paddw	xmm6,xmm5		; xmm6=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm4,xmm2
-	movdqa    xmm1,xmm2
-	punpcklwd xmm4,xmm6
-	punpckhwd xmm1,xmm6
-	movdqa    xmm2,xmm4
-	movdqa    xmm6,xmm1
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]	; xmm1=z3H
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]	; xmm2=z4L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
-	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
-
-	; (Original)
-	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-	;
-	; (This implementation)
-	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm1,xmm0
-	punpcklwd xmm4,xmm5
-	punpckhwd xmm1,xmm5
-	movdqa    xmm0,xmm4
-	movdqa    xmm5,xmm1
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4L
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm1=tmp4H
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]	; xmm0=tmp7L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]	; xmm5=tmp7H
-
-	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
-	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
-	paddd	xmm0,xmm2		; xmm0=data1L
-	paddd	xmm5,xmm6		; xmm5=data1H
-
-	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-
-	packssdw  xmm4,xmm1		; xmm4=data7
-	packssdw  xmm0,xmm5		; xmm0=data1
-
-	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
-	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
-
-	movdqa    xmm1,xmm3
-	movdqa    xmm5,xmm3
-	punpcklwd xmm1,xmm7
-	punpckhwd xmm5,xmm7
-	movdqa    xmm3,xmm1
-	movdqa    xmm7,xmm5
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm5=tmp5H
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]	; xmm7=tmp6H
-
-	paddd	xmm1,xmm2		; xmm1=data5L
-	paddd	xmm5,xmm6		; xmm5=data5H
-	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
-	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
-
-	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm1,DESCALE_P2
-	psrad	xmm5,DESCALE_P2
-	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm1,xmm5		; xmm1=data5
-	packssdw  xmm3,xmm7		; xmm3=data3
-
-	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
-	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfsseflt-64.asm b/simd/jfsseflt-64.asm
deleted file mode 100644
index b5de0c4..0000000
--- a/simd/jfsseflt-64.asm
+++ /dev/null
@@ -1,358 +0,0 @@
-;
-; jfsseflt-64.asm - floating-point FDCT (64-bit SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_float_sse) PRIVATE
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382	times 4 dd  0.382683432365089771728460
-PD_0_707	times 4 dd  0.707106781186547524400844
-PD_0_541	times 4 dd  0.541196100146196984399723
-PD_1_306	times 4 dd  1.306562964876376527856643
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT * data)
-;
-
-; r10 = FAST_FLOAT * data
-
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_float_sse) PRIVATE
-
-EXTN(jsimd_fdct_float_sse):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process rows.
-
-	mov	rdx, r10	; (FAST_FLOAT *)
-	mov	rcx, DCTSIZE/4
-.rowloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
-	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
-	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
-	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
-	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	rcx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	rdx, r10	; (FAST_FLOAT *)
-	mov	rcx, DCTSIZE/4
-.columnloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
-	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
-	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
-	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
-	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
-	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	rdx, byte 4*SIZEOF_FAST_FLOAT
-	dec	rcx
-	jnz	near .columnloop
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jfsseflt.asm b/simd/jfsseflt.asm
deleted file mode 100644
index dc52c32..0000000
--- a/simd/jfsseflt.asm
+++ /dev/null
@@ -1,370 +0,0 @@
-;
-; jfsseflt.asm - floating-point FDCT (SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_fdct_float_sse) PRIVATE
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382	times 4 dd  0.382683432365089771728460
-PD_0_707	times 4 dd  0.707106781186547524400844
-PD_0_541	times 4 dd  0.541196100146196984399723
-PD_1_306	times 4 dd  1.306562964876376527856643
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT * data)
-;
-
-%define data(b)		(b)+8		; FAST_FLOAT * data
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_fdct_float_sse) PRIVATE
-
-EXTN(jsimd_fdct_float_sse):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-;	push	edi		; unused
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process rows.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.rowloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
-	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
-	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
-	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .rowloop
-
-	; ---- Pass 2: process columns.
-
-	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
-	mov	ecx, DCTSIZE/4
-	alignx	16,7
-.columnloop:
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
-	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
-	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
-	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
-	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
-	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
-
-	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
-	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
-
-	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
-	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
-	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
-	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
-	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
-	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
-
-	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
-	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
-	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
-	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
-	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
-
-	movaps	xmm0,xmm7
-	movaps	xmm5,xmm6
-	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
-	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
-	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
-	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
-
-	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
-	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
-	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
-
-	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
-	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
-	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
-	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
-
-	movaps	xmm2,xmm7
-	movaps	xmm3,xmm4
-	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
-	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
-	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
-	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
-
-	; -- Even part
-
-	movaps	xmm1,xmm5
-	movaps	xmm6,xmm0
-	subps	xmm5,xmm7		; xmm5=tmp13
-	subps	xmm0,xmm4		; xmm0=tmp12
-	addps	xmm1,xmm7		; xmm1=tmp10
-	addps	xmm6,xmm4		; xmm6=tmp11
-
-	addps	xmm0,xmm5
-	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-	movaps	xmm7,xmm1
-	movaps	xmm4,xmm5
-	subps	xmm1,xmm6		; xmm1=data4
-	subps	xmm5,xmm0		; xmm5=data6
-	addps	xmm7,xmm6		; xmm7=data0
-	addps	xmm4,xmm0		; xmm4=data2
-
-	movaps	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	; -- Odd part
-
-	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
-	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
-
-	addps	xmm2,xmm3		; xmm2=tmp10
-	addps	xmm3,xmm6		; xmm3=tmp11
-	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
-
-	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-	movaps	xmm1,xmm2		; xmm1=tmp10
-	subps	xmm2,xmm6
-	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-	addps	xmm1,xmm2		; xmm1=z2
-	addps	xmm6,xmm2		; xmm6=z4
-
-	movaps	xmm5,xmm0
-	subps	xmm0,xmm3		; xmm0=z13
-	addps	xmm5,xmm3		; xmm5=z11
-
-	movaps	xmm7,xmm0
-	movaps	xmm4,xmm5
-	subps	xmm0,xmm1		; xmm0=data3
-	subps	xmm5,xmm6		; xmm5=data7
-	addps	xmm7,xmm1		; xmm7=data5
-	addps	xmm4,xmm6		; xmm4=data1
-
-	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-	add	edx, byte 4*SIZEOF_FAST_FLOAT
-	dec	ecx
-	jnz	near .columnloop
-
-;	pop	edi		; unused
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/ji3dnflt.asm b/simd/ji3dnflt.asm
deleted file mode 100644
index 30ff49d..0000000
--- a/simd/ji3dnflt.asm
+++ /dev/null
@@ -1,452 +0,0 @@
-;
-; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_3dnow) PRIVATE
-
-EXTN(jconst_idct_float_3dnow):
-
-PD_1_414	times 2 dd  1.414213562373095048801689
-PD_1_847	times 2 dd  1.847759065022573512256366
-PD_1_082	times 2 dd  1.082392200292393968799446
-PD_2_613	times 2 dd  2.613125929752753055713286
-PD_RNDINT_MAGIC	times 2 dd  100663296.0	; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
-;                         JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_3dnow) PRIVATE
-
-EXTN(jsimd_idct_float_3dnow):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; FAST_FLOAT * wsptr
-	mov	ecx, DCTSIZE/2				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	pushpic	ebx		; save GOT address
-	mov	ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	mov	eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	or	ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	or	ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	or	eax,ebx
-	poppic	ebx		; restore GOT address
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd mm0,mm0
-	psrad     mm0,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm0,mm0
-
-	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0
-	punpckhdq mm1,mm1
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd mm0,mm0
-	punpcklwd mm1,mm1
-	psrad     mm0,(DWORD_BIT-WORD_BIT)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm0,mm0
-	pi2fd     mm1,mm1
-
-	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	punpcklwd mm2,mm2
-	punpcklwd mm3,mm3
-	psrad     mm2,(DWORD_BIT-WORD_BIT)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm2,mm2
-	pi2fd     mm3,mm3
-
-	pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pfsub	mm0,mm2			; mm0=tmp11
-	pfsub	mm1,mm3
-	pfadd	mm4,mm2			; mm4=tmp10
-	pfadd	mm5,mm3			; mm5=tmp13
-
-	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
-	pfsub	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm5			; mm4=tmp3
-	pfsub	mm0,mm1			; mm0=tmp2
-	pfadd	mm6,mm5			; mm6=tmp0
-	pfadd	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; tmp3
-	movq	MMWORD [wk(0)], mm0	; tmp2
-
-	; -- Odd part
-
-	movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd mm2,mm2
-	punpcklwd mm3,mm3
-	psrad     mm2,(DWORD_BIT-WORD_BIT)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm2,mm2
-	pi2fd     mm3,mm3
-
-	pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	punpcklwd mm5,mm5
-	punpcklwd mm1,mm1
-	psrad     mm5,(DWORD_BIT-WORD_BIT)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)
-	pi2fd     mm5,mm5
-	pi2fd     mm1,mm1
-
-	pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	pfadd	mm2,mm1			; mm2=z11
-	pfadd	mm5,mm3			; mm5=z13
-	pfsub	mm4,mm1			; mm4=z12
-	pfsub	mm0,mm3			; mm0=z10
-
-	movq	mm1,mm2
-	pfsub	mm2,mm5
-	pfadd	mm1,mm5			; mm1=tmp7
-
-	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
-
-	movq	mm3,mm0
-	pfadd	mm0,mm4
-	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
-	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
-	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
-	pfsubr	mm3,mm0			; mm3=tmp12
-	pfsub	mm4,mm0			; mm4=tmp10
-
-	; -- Final output stage
-
-	pfsub	mm3,mm1			; mm3=tmp6
-	movq	mm5,mm6
-	movq	mm0,mm7
-	pfadd	mm6,mm1			; mm6=data0=(00 01)
-	pfadd	mm7,mm3			; mm7=data1=(10 11)
-	pfsub	mm5,mm1			; mm5=data7=(70 71)
-	pfsub	mm0,mm3			; mm0=data6=(60 61)
-	pfsub	mm2,mm3			; mm2=tmp5
-
-	movq      mm1,mm6		; transpose coefficients
-	punpckldq mm6,mm7		; mm6=(00 10)
-	punpckhdq mm1,mm7		; mm1=(01 11)
-	movq      mm3,mm0		; transpose coefficients
-	punpckldq mm0,mm5		; mm0=(60 70)
-	punpckhdq mm3,mm5		; mm3=(61 71)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
-	movq	mm5, MMWORD [wk(1)]	; mm5=tmp3
-
-	pfadd	mm4,mm2			; mm4=tmp4
-	movq	mm6,mm7
-	movq	mm1,mm5
-	pfadd	mm7,mm2			; mm7=data2=(20 21)
-	pfadd	mm5,mm4			; mm5=data4=(40 41)
-	pfsub	mm6,mm2			; mm6=data5=(50 51)
-	pfsub	mm1,mm4			; mm1=data3=(30 31)
-
-	movq      mm0,mm7		; transpose coefficients
-	punpckldq mm7,mm1		; mm7=(20 30)
-	punpckhdq mm0,mm1		; mm0=(21 31)
-	movq      mm3,mm5		; transpose coefficients
-	punpckldq mm5,mm6		; mm5=(40 50)
-	punpckhdq mm3,mm6		; mm3=(41 51)
-
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
-	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
-
-.nextcolumn:
-	add	esi, byte 2*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 2*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; -- Prefetch the next coefficient block
-
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; FAST_FLOAT * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/2				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	pfsub	mm0,mm2			; mm0=tmp11
-	pfsub	mm1,mm3
-	pfadd	mm4,mm2			; mm4=tmp10
-	pfadd	mm5,mm3			; mm5=tmp13
-
-	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
-	pfsub	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	pfsub	mm4,mm5			; mm4=tmp3
-	pfsub	mm0,mm1			; mm0=tmp2
-	pfadd	mm6,mm5			; mm6=tmp0
-	pfadd	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; tmp3
-	movq	MMWORD [wk(0)], mm0	; tmp2
-
-	; -- Odd part
-
-	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	pfadd	mm2,mm1			; mm2=z11
-	pfadd	mm5,mm3			; mm5=z13
-	pfsub	mm4,mm1			; mm4=z12
-	pfsub	mm0,mm3			; mm0=z10
-
-	movq	mm1,mm2
-	pfsub	mm2,mm5
-	pfadd	mm1,mm5			; mm1=tmp7
-
-	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
-
-	movq	mm3,mm0
-	pfadd	mm0,mm4
-	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
-	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
-	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
-	pfsubr	mm3,mm0			; mm3=tmp12
-	pfsub	mm4,mm0			; mm4=tmp10
-
-	; -- Final output stage
-
-	pfsub	mm3,mm1			; mm3=tmp6
-	movq	mm5,mm6
-	movq	mm0,mm7
-	pfadd	mm6,mm1			; mm6=data0=(00 10)
-	pfadd	mm7,mm3			; mm7=data1=(01 11)
-	pfsub	mm5,mm1			; mm5=data7=(07 17)
-	pfsub	mm0,mm3			; mm0=data6=(06 16)
-	pfsub	mm2,mm3			; mm2=tmp5
-
-	movq	mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm1=[PD_RNDINT_MAGIC]
-	pcmpeqd	mm3,mm3
-	psrld	mm3,WORD_BIT		; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
-
-	pfadd	mm6,mm1			; mm6=roundint(data0/8)=(00 ** 10 **)
-	pfadd	mm7,mm1			; mm7=roundint(data1/8)=(01 ** 11 **)
-	pfadd	mm0,mm1			; mm0=roundint(data6/8)=(06 ** 16 **)
-	pfadd	mm5,mm1			; mm5=roundint(data7/8)=(07 ** 17 **)
-
-	pand	mm6,mm3			; mm6=(00 -- 10 --)
-	pslld	mm7,WORD_BIT		; mm7=(-- 01 -- 11)
-	pand	mm0,mm3			; mm0=(06 -- 16 --)
-	pslld	mm5,WORD_BIT		; mm5=(-- 07 -- 17)
-	por	mm6,mm7			; mm6=(00 01 10 11)
-	por	mm0,mm5			; mm0=(06 07 16 17)
-
-	movq	mm1, MMWORD [wk(0)]	; mm1=tmp2
-	movq	mm3, MMWORD [wk(1)]	; mm3=tmp3
-
-	pfadd	mm4,mm2			; mm4=tmp4
-	movq	mm7,mm1
-	movq	mm5,mm3
-	pfadd	mm1,mm2			; mm1=data2=(02 12)
-	pfadd	mm3,mm4			; mm3=data4=(04 14)
-	pfsub	mm7,mm2			; mm7=data5=(05 15)
-	pfsub	mm5,mm4			; mm5=data3=(03 13)
-
-	movq	mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm2=[PD_RNDINT_MAGIC]
-	pcmpeqd	mm4,mm4
-	psrld	mm4,WORD_BIT		; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
-
-	pfadd	mm3,mm2			; mm3=roundint(data4/8)=(04 ** 14 **)
-	pfadd	mm7,mm2			; mm7=roundint(data5/8)=(05 ** 15 **)
-	pfadd	mm1,mm2			; mm1=roundint(data2/8)=(02 ** 12 **)
-	pfadd	mm5,mm2			; mm5=roundint(data3/8)=(03 ** 13 **)
-
-	pand	mm3,mm4			; mm3=(04 -- 14 --)
-	pslld	mm7,WORD_BIT		; mm7=(-- 05 -- 15)
-	pand	mm1,mm4			; mm1=(02 -- 12 --)
-	pslld	mm5,WORD_BIT		; mm5=(-- 03 -- 13)
-	por	mm3,mm7			; mm3=(04 05 14 15)
-	por	mm1,mm5			; mm1=(02 03 12 13)
-
-	movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm2=[PB_CENTERJSAMP]
-
-	packsswb  mm6,mm3		; mm6=(00 01 10 11 04 05 14 15)
-	packsswb  mm1,mm0		; mm1=(02 03 12 13 06 07 16 17)
-	paddb     mm6,mm2
-	paddb     mm1,mm2
-
-	movq      mm4,mm6		; transpose coefficients(phase 2)
-	punpcklwd mm6,mm1		; mm6=(00 01 02 03 10 11 12 13)
-	punpckhwd mm4,mm1		; mm4=(04 05 06 07 14 15 16 17)
-
-	movq      mm7,mm6		; transpose coefficients(phase 3)
-	punpckldq mm6,mm4		; mm6=(00 01 02 03 04 05 06 07)
-	punpckhdq mm7,mm4		; mm7=(10 11 12 13 14 15 16 17)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 2*SIZEOF_FAST_FLOAT	; wsptr
-	add	edi, byte 2*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	femms		; empty MMX/3DNow! state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jidctflt-3dn.asm b/simd/jidctflt-3dn.asm
new file mode 100644
index 0000000..24bd105
--- /dev/null
+++ b/simd/jidctflt-3dn.asm
@@ -0,0 +1,452 @@
+;
+; jidctflt-3dn.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST       ; constant table, read below through GOTOFF(ebx,...)
+
+        alignz  16
+        global  EXTN(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414        times 2 dd  1.414213562373095048801689      ; sqrt(2) = 2*cos(4*pi/16), one copy per float lane
+PD_1_847        times 2 dd  1.847759065022573512256366      ; 2*cos(2*pi/16)
+PD_1_082        times 2 dd  1.082392200292393968799446      ; 2*(cos(2*pi/16) - cos(6*pi/16))
+PD_2_613        times 2 dd  2.613125929752753055713286      ; 2*(cos(2*pi/16) + cos(6*pi/16))
+PD_RNDINT_MAGIC times 2 dd  100663296.0 ; (float)(0x00C00000 << 3) = 1.5 * 2^26; added in pass 2 to round data/8 into the low mantissa bits
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE                   ; added byte-wise (paddb) to the packed output samples
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block,
+;                         JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)    (b)+8           ; void *dct_table (b = unaligned frame pointer: +0 saved ebp, +4 return address)
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+%define original_ebp    ebp+0           ; slot where the prologue stores the unaligned frame pointer
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2               ; two scratch slots: wk(0) holds tmp2, wk(1) holds tmp3
+%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2], located just below wk[]
+
+        align   16
+        global  EXTN(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+        push    ebp                             ; save caller's ebp
+        mov     eax,esp                         ; eax = original ebp (esp right after the push, i.e. the unaligned frame pointer)
+        sub     esp, byte 4                     ; make room for one dword before aligning
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; store the unaligned frame pointer at [original_ebp]
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [workspace]                ; reserve wk[] and workspace[] below ebp
+        push    ebx                             ; saved here, restored in the epilogue
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input, store into work array.
+
+;       mov     eax, [original_ebp]     ; reload left disabled: the loads below use eax as set in the prologue
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     ecx, DCTSIZE/2                          ; ctr (each iteration handles two columns)
+        alignx  16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]      ; shortcut test: OR together the AC dwords (DWBLOCK 1..7) of this column pair
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        jnz     short .columnDCT                                ; something non-zero already: do the full transform
+
+        pushpic ebx             ; save GOT address
+        mov     ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]      ; ebx is borrowed as a second accumulator
+        mov     eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        or      ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        or      ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        or      eax,ebx                                         ; ZF set only if all of them were zero
+        poppic  ebx             ; restore GOT address
+        jnz     short .columnDCT                                ; tests the flags of the OR above (poppic presumably leaves flags intact - confirm in jsimdext.inc)
+
+        ; -- AC terms all zero
+
+        movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]    ; the two DC coefficients (16 bits each) of this column pair
+
+        punpcklwd mm0,mm0                                       ; duplicate each word into both halves of a dword
+        psrad     mm0,(DWORD_BIT-WORD_BIT)                      ; arithmetic shift => sign-extended dwords (assuming the count is 16)
+        pi2fd     mm0,mm0                                       ; packed int32 -> packed float
+
+        pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize with the table entries at quantptr
+
+        movq      mm1,mm0
+        punpckldq mm0,mm0                                       ; mm0 = low float (first column) in both lanes
+        punpckhdq mm1,mm1                                       ; mm1 = high float (second column) in both lanes
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0        ; 4 mmwords = 8 floats of the first column's value
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1        ; 4 mmwords = 8 floats of the second column's value
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+        jmp     near .nextcolumn
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]    ; each dword = two 16-bit coefficients (one per column of the pair)
+        movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+        punpcklwd mm0,mm0               ; duplicate each word into both halves of a dword
+        punpcklwd mm1,mm1
+        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; sign-extend to dwords (assuming the count is 16)
+        psrad     mm1,(DWORD_BIT-WORD_BIT)
+        pi2fd     mm0,mm0               ; packed int32 -> packed float
+        pi2fd     mm1,mm1
+
+        pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize with the table entries at quantptr
+        pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        punpcklwd mm2,mm2
+        punpcklwd mm3,mm3
+        psrad     mm2,(DWORD_BIT-WORD_BIT)
+        psrad     mm3,(DWORD_BIT-WORD_BIT)
+        pi2fd     mm2,mm2
+        pi2fd     mm3,mm3
+
+        pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movq    mm4,mm0
+        movq    mm5,mm1
+        pfsub   mm0,mm2                 ; mm0=tmp11
+        pfsub   mm1,mm3
+        pfadd   mm4,mm2                 ; mm4=tmp10
+        pfadd   mm5,mm3                 ; mm5=tmp13
+
+        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]      ; scale the difference by 1.414213562
+        pfsub   mm1,mm5                 ; mm1=tmp12
+
+        movq    mm6,mm4
+        movq    mm7,mm0
+        pfsub   mm4,mm5                 ; mm4=tmp3
+        pfsub   mm0,mm1                 ; mm0=tmp2
+        pfadd   mm6,mm5                 ; mm6=tmp0
+        pfadd   mm7,mm1                 ; mm7=tmp1
+
+        movq    MMWORD [wk(1)], mm4     ; tmp3
+        movq    MMWORD [wk(0)], mm0     ; tmp2
+
+        ; -- Odd part
+
+        movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+        punpcklwd mm2,mm2               ; same word -> sign-extended dword -> float conversion as in the even part
+        punpcklwd mm3,mm3
+        psrad     mm2,(DWORD_BIT-WORD_BIT)
+        psrad     mm3,(DWORD_BIT-WORD_BIT)
+        pi2fd     mm2,mm2
+        pi2fd     mm3,mm3
+
+        pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        punpcklwd mm5,mm5
+        punpcklwd mm1,mm1
+        psrad     mm5,(DWORD_BIT-WORD_BIT)
+        psrad     mm1,(DWORD_BIT-WORD_BIT)
+        pi2fd     mm5,mm5
+        pi2fd     mm1,mm1
+
+        pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movq    mm4,mm2
+        movq    mm0,mm5
+        pfadd   mm2,mm1                 ; mm2=z11
+        pfadd   mm5,mm3                 ; mm5=z13
+        pfsub   mm4,mm1                 ; mm4=z12
+        pfsub   mm0,mm3                 ; mm0=z10
+
+        movq    mm1,mm2
+        pfsub   mm2,mm5
+        pfadd   mm1,mm5                 ; mm1=tmp7
+
+        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11
+
+        movq    mm3,mm0
+        pfadd   mm0,mm4
+        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
+        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
+        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
+        pfsubr  mm3,mm0                 ; mm3=tmp12 (reverse subtract: mm0 - mm3)
+        pfsub   mm4,mm0                 ; mm4=tmp10
+
+        ; -- Final output stage
+
+        pfsub   mm3,mm1                 ; mm3=tmp6
+        movq    mm5,mm6
+        movq    mm0,mm7
+        pfadd   mm6,mm1                 ; mm6=data0=(00 01)
+        pfadd   mm7,mm3                 ; mm7=data1=(10 11)
+        pfsub   mm5,mm1                 ; mm5=data7=(70 71)
+        pfsub   mm0,mm3                 ; mm0=data6=(60 61)
+        pfsub   mm2,mm3                 ; mm2=tmp5
+
+        movq      mm1,mm6               ; transpose coefficients
+        punpckldq mm6,mm7               ; mm6=(00 10)
+        punpckhdq mm1,mm7               ; mm1=(01 11)
+        movq      mm3,mm0               ; transpose coefficients
+        punpckldq mm0,mm5               ; mm0=(60 70)
+        punpckhdq mm3,mm5               ; mm3=(61 71)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6        ; store the transposed pairs into the work array
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
+        movq    mm5, MMWORD [wk(1)]     ; mm5=tmp3
+
+        pfadd   mm4,mm2                 ; mm4=tmp4
+        movq    mm6,mm7
+        movq    mm1,mm5
+        pfadd   mm7,mm2                 ; mm7=data2=(20 21)
+        pfadd   mm5,mm4                 ; mm5=data4=(40 41)
+        pfsub   mm6,mm2                 ; mm6=data5=(50 51)
+        pfsub   mm1,mm4                 ; mm1=data3=(30 31)
+
+        movq      mm0,mm7               ; transpose coefficients
+        punpckldq mm7,mm1               ; mm7=(20 30)
+        punpckhdq mm0,mm1               ; mm0=(21 31)
+        movq      mm3,mm5               ; transpose coefficients
+        punpckldq mm5,mm6               ; mm5=(40 50)
+        punpckhdq mm3,mm6               ; mm3=(41 51)
+
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+.nextcolumn:
+        add     esi, byte 2*SIZEOF_JCOEF                ; coef_block: advance two columns
+        add     edx, byte 2*SIZEOF_FLOAT_MULT_TYPE      ; quantptr: advance two columns
+        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr: advance two transposed lines
+        dec     ecx                                     ; ctr
+        jnz     near .columnloop
+
+        ; -- Prefetch the next coefficient block
+
+        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]       ; esi has advanced DCTSIZE coefs; with DCTSIZE == 8 (assumed) this is the end of the current block
+        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]       ; four prefetches, 32 bytes apart
+        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     eax, [original_ebp]                     ; reload the unaligned frame pointer (eax is reused during pass 1)
+        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]       ; eax = output_col from here on
+        mov     ecx, DCTSIZE/2                          ; ctr (each iteration produces two output rows)
+        alignx  16,7
+.rowloop:
+
+        ; -- Even part
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]        ; inputs are already floats from pass 1: no conversion or dequantization here
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+        movq    mm4,mm0
+        movq    mm5,mm1
+        pfsub   mm0,mm2                 ; mm0=tmp11
+        pfsub   mm1,mm3
+        pfadd   mm4,mm2                 ; mm4=tmp10
+        pfadd   mm5,mm3                 ; mm5=tmp13
+
+        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]      ; scale the difference by 1.414213562
+        pfsub   mm1,mm5                 ; mm1=tmp12
+
+        movq    mm6,mm4
+        movq    mm7,mm0
+        pfsub   mm4,mm5                 ; mm4=tmp3
+        pfsub   mm0,mm1                 ; mm0=tmp2
+        pfadd   mm6,mm5                 ; mm6=tmp0
+        pfadd   mm7,mm1                 ; mm7=tmp1
+
+        movq    MMWORD [wk(1)], mm4     ; tmp3
+        movq    MMWORD [wk(0)], mm0     ; tmp2
+
+        ; -- Odd part
+
+        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+        movq    mm4,mm2
+        movq    mm0,mm5
+        pfadd   mm2,mm1                 ; mm2=z11
+        pfadd   mm5,mm3                 ; mm5=z13
+        pfsub   mm4,mm1                 ; mm4=z12
+        pfsub   mm0,mm3                 ; mm0=z10
+
+        movq    mm1,mm2
+        pfsub   mm2,mm5
+        pfadd   mm1,mm5                 ; mm1=tmp7
+
+        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11
+
+        movq    mm3,mm0
+        pfadd   mm0,mm4
+        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
+        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
+        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
+        pfsubr  mm3,mm0                 ; mm3=tmp12 (reverse subtract: mm0 - mm3)
+        pfsub   mm4,mm0                 ; mm4=tmp10
+
+        ; -- Final output stage
+
+        pfsub   mm3,mm1                 ; mm3=tmp6
+        movq    mm5,mm6
+        movq    mm0,mm7
+        pfadd   mm6,mm1                 ; mm6=data0=(00 10)
+        pfadd   mm7,mm3                 ; mm7=data1=(01 11)
+        pfsub   mm5,mm1                 ; mm5=data7=(07 17)
+        pfsub   mm0,mm3                 ; mm0=data6=(06 16)
+        pfsub   mm2,mm3                 ; mm2=tmp5
+
+        movq    mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm1=[PD_RNDINT_MAGIC]
+        pcmpeqd mm3,mm3                 ; mm3=all ones
+        psrld   mm3,WORD_BIT            ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+        pfadd   mm6,mm1                 ; mm6=roundint(data0/8)=(00 ** 10 **)
+        pfadd   mm7,mm1                 ; mm7=roundint(data1/8)=(01 ** 11 **)
+        pfadd   mm0,mm1                 ; mm0=roundint(data6/8)=(06 ** 16 **)
+        pfadd   mm5,mm1                 ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+        pand    mm6,mm3                 ; mm6=(00 -- 10 --)
+        pslld   mm7,WORD_BIT            ; mm7=(-- 01 -- 11)
+        pand    mm0,mm3                 ; mm0=(06 -- 16 --)
+        pslld   mm5,WORD_BIT            ; mm5=(-- 07 -- 17)
+        por     mm6,mm7                 ; mm6=(00 01 10 11)
+        por     mm0,mm5                 ; mm0=(06 07 16 17)
+
+        movq    mm1, MMWORD [wk(0)]     ; mm1=tmp2
+        movq    mm3, MMWORD [wk(1)]     ; mm3=tmp3
+
+        pfadd   mm4,mm2                 ; mm4=tmp4
+        movq    mm7,mm1
+        movq    mm5,mm3
+        pfadd   mm1,mm2                 ; mm1=data2=(02 12)
+        pfadd   mm3,mm4                 ; mm3=data4=(04 14)
+        pfsub   mm7,mm2                 ; mm7=data5=(05 15)
+        pfsub   mm5,mm4                 ; mm5=data3=(03 13)
+
+        movq    mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm2=[PD_RNDINT_MAGIC]
+        pcmpeqd mm4,mm4                 ; mm4=all ones
+        psrld   mm4,WORD_BIT            ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+        pfadd   mm3,mm2                 ; mm3=roundint(data4/8)=(04 ** 14 **)
+        pfadd   mm7,mm2                 ; mm7=roundint(data5/8)=(05 ** 15 **)
+        pfadd   mm1,mm2                 ; mm1=roundint(data2/8)=(02 ** 12 **)
+        pfadd   mm5,mm2                 ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+        pand    mm3,mm4                 ; mm3=(04 -- 14 --)
+        pslld   mm7,WORD_BIT            ; mm7=(-- 05 -- 15)
+        pand    mm1,mm4                 ; mm1=(02 -- 12 --)
+        pslld   mm5,WORD_BIT            ; mm5=(-- 03 -- 13)
+        por     mm3,mm7                 ; mm3=(04 05 14 15)
+        por     mm1,mm5                 ; mm1=(02 03 12 13)
+
+        movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm2=[PB_CENTERJSAMP]
+
+        packsswb  mm6,mm3               ; mm6=(00 01 10 11 04 05 14 15)
+        packsswb  mm1,mm0               ; mm1=(02 03 12 13 06 07 16 17)
+        paddb     mm6,mm2               ; add CENTERJSAMPLE to every byte (after the signed-saturating pack)
+        paddb     mm1,mm2
+
+        movq      mm4,mm6               ; transpose coefficients(phase 2)
+        punpcklwd mm6,mm1               ; mm6=(00 01 02 03 10 11 12 13)
+        punpckhwd mm4,mm1               ; mm4=(04 05 06 07 14 15 16 17)
+
+        movq      mm7,mm6               ; transpose coefficients(phase 3)
+        punpckldq mm6,mm4               ; mm6=(00 01 02 03 04 05 06 07)
+        punpckhdq mm7,mm4               ; mm7=(10 11 12 13 14 15 16 17)
+
+        pushpic ebx                     ; save GOT address
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; edx = first output row pointer of this pair
+        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; ebx = second output row pointer (ebx borrowed as scratch)
+        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6    ; store 8 samples starting at output_col (eax)
+        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+        poppic  ebx                     ; restore GOT address
+
+        add     esi, byte 2*SIZEOF_FAST_FLOAT   ; wsptr
+        add     edi, byte 2*SIZEOF_JSAMPROW     ; advance to the next pair of output row pointers
+        dec     ecx                             ; ctr
+        jnz     near .rowloop
+
+        femms           ; empty MMX/3DNow! state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctflt-sse.asm b/simd/jidctflt-sse.asm
new file mode 100644
index 0000000..9605b73
--- /dev/null
+++ b/simd/jidctflt-sse.asm
@@ -0,0 +1,572 @@
+;
+; jidctflt-sse.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+        shufps  %1,%2,0x44      ; 0x44: lanes 0,1 of %1 then lanes 0,1 of %2
+%endmacro
+
+%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+        shufps  %1,%2,0xEE      ; 0xEE: lanes 2,3 of %1 then lanes 2,3 of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+; Each PD_* value is repeated 4 times so one load fills a whole XMM register.
+PD_1_414        times 4 dd  1.414213562373095048801689  ; sqrt(2)
+PD_1_847        times 4 dd  1.847759065022573512256366  ; 2*cos(pi/8)
+PD_1_082        times 4 dd  1.082392200292393968799446  ; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613       times 4 dd -2.613125929752753055713286  ; -2*(cos(pi/8)+cos(3*pi/8))
+PD_0_125        times 4 dd  0.125       ; 1/8, the pass-2 descale factor
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE       ; 8 bytes = one MMX register
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1 transforms columns into a stack workspace; pass 2 transforms rows.
+; GLOBAL(void)
+; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+; Argument offsets from frame base b: [b]=saved ebp, [b+4]=return address.
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+; Locals are addressed from the aligned ebp that the prologue establishes.
+%define original_ebp    ebp+0           ; slot holding the unaligned frame base
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+        align   16
+        global  EXTN(jsimd_idct_float_sse)
+; Clobbers eax/ecx/edx and the MMX/XMM registers; saves ebx/esi/edi/ebp.
+EXTN(jsimd_idct_float_sse):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp (frame base)
+        sub     esp, byte 4                     ; keep the slot at [eax] intact
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; [original_ebp] = frame base
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [workspace]                ; reserve wk[] and workspace[]
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input, store into work array.
+        ; Each iteration handles 4 columns and stores the results transposed.
+;       mov     eax, [original_ebp]     ; (unneeded: eax still holds it)
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        jnz     near .columnDCT
+        ; The dword pre-test was zero; now OR the MMWORDs of rows 1-7 together.
+        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     mm1,mm0
+        packsswb mm1,mm1        ; saturating pack: a nonzero word stays nonzero
+        movd    eax,mm1
+        test    eax,eax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+        ; Only row 0 matters: dequantize it and replicate each value 8 times.
+        movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        ; Words land in the top half of each dword; psrad then sign-extends.
+        punpckhwd mm1,mm0                       ; mm1=(** 02 ** 03)
+        punpcklwd mm0,mm0                       ; mm0=(00 00 01 01)
+        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in0H=(02 03)
+        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in0L=(00 01)
+        cvtpi2ps  xmm3,mm1                      ; xmm3=(02 03 ** **)
+        cvtpi2ps  xmm0,mm0                      ; xmm0=(00 01 ** **)
+        movlhps   xmm0,xmm3                     ; xmm0=in0=(00 01 02 03)
+
+        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movaps  xmm1,xmm0
+        movaps  xmm2,xmm0
+        movaps  xmm3,xmm0
+
+        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
+        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
+        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
+        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+        jmp     near .nextcolumn
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Even part
+        ; Load rows 0,2,4,6; widen int16 -> int32 -> float; then dequantize.
+        movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+        punpckhwd mm4,mm0                       ; mm4=(** 02 ** 03)
+        punpcklwd mm0,mm0                       ; mm0=(00 00 01 01)
+        punpckhwd mm5,mm1                       ; mm5=(** 22 ** 23)
+        punpcklwd mm1,mm1                       ; mm1=(20 20 21 21)
+
+        psrad     mm4,(DWORD_BIT-WORD_BIT)      ; mm4=in0H=(02 03)
+        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in0L=(00 01)
+        cvtpi2ps  xmm4,mm4                      ; xmm4=(02 03 ** **)
+        cvtpi2ps  xmm0,mm0                      ; xmm0=(00 01 ** **)
+        psrad     mm5,(DWORD_BIT-WORD_BIT)      ; mm5=in2H=(22 23)
+        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in2L=(20 21)
+        cvtpi2ps  xmm5,mm5                      ; xmm5=(22 23 ** **)
+        cvtpi2ps  xmm1,mm1                      ; xmm1=(20 21 ** **)
+
+        punpckhwd mm6,mm2                       ; mm6=(** 42 ** 43)
+        punpcklwd mm2,mm2                       ; mm2=(40 40 41 41)
+        punpckhwd mm7,mm3                       ; mm7=(** 62 ** 63)
+        punpcklwd mm3,mm3                       ; mm3=(60 60 61 61)
+
+        psrad     mm6,(DWORD_BIT-WORD_BIT)      ; mm6=in4H=(42 43)
+        psrad     mm2,(DWORD_BIT-WORD_BIT)      ; mm2=in4L=(40 41)
+        cvtpi2ps  xmm6,mm6                      ; xmm6=(42 43 ** **)
+        cvtpi2ps  xmm2,mm2                      ; xmm2=(40 41 ** **)
+        psrad     mm7,(DWORD_BIT-WORD_BIT)      ; mm7=in6H=(62 63)
+        psrad     mm3,(DWORD_BIT-WORD_BIT)      ; mm3=in6L=(60 61)
+        cvtpi2ps  xmm7,mm7                      ; xmm7=(62 63 ** **)
+        cvtpi2ps  xmm3,mm3                      ; xmm3=(60 61 ** **)
+
+        movlhps   xmm0,xmm4                     ; xmm0=in0=(00 01 02 03)
+        movlhps   xmm1,xmm5                     ; xmm1=in2=(20 21 22 23)
+        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movlhps   xmm2,xmm6                     ; xmm2=in4=(40 41 42 43)
+        movlhps   xmm3,xmm7                     ; xmm3=in6=(60 61 62 63)
+        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        ; Butterflies on the dequantized inputs: xmm0..xmm3 = in0,in2,in4,in6.
+        movaps  xmm4,xmm0
+        movaps  xmm5,xmm1
+        subps   xmm0,xmm2               ; xmm0=tmp11
+        subps   xmm1,xmm3
+        addps   xmm4,xmm2               ; xmm4=tmp10
+        addps   xmm5,xmm3               ; xmm5=tmp13
+
+        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]     ; (in2-in6)*sqrt(2)
+        subps   xmm1,xmm5               ; xmm1=tmp12
+
+        movaps  xmm6,xmm4
+        movaps  xmm7,xmm0
+        subps   xmm4,xmm5               ; xmm4=tmp3
+        subps   xmm0,xmm1               ; xmm0=tmp2
+        addps   xmm6,xmm5               ; xmm6=tmp0
+        addps   xmm7,xmm1               ; xmm7=tmp1
+        ; Spill tmp2/tmp3; tmp0/tmp1 stay in xmm6/xmm7 through the odd part.
+        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+        ; -- Odd part
+        ; Load rows 1,3,5,7; widen int16 -> int32 -> float; then dequantize.
+        movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+        punpckhwd mm6,mm4                       ; mm6=(** 12 ** 13)
+        punpcklwd mm4,mm4                       ; mm4=(10 10 11 11)
+        punpckhwd mm2,mm0                       ; mm2=(** 32 ** 33)
+        punpcklwd mm0,mm0                       ; mm0=(30 30 31 31)
+
+        psrad     mm6,(DWORD_BIT-WORD_BIT)      ; mm6=in1H=(12 13)
+        psrad     mm4,(DWORD_BIT-WORD_BIT)      ; mm4=in1L=(10 11)
+        cvtpi2ps  xmm4,mm6                      ; xmm4=(12 13 ** **)
+        cvtpi2ps  xmm2,mm4                      ; xmm2=(10 11 ** **)
+        psrad     mm2,(DWORD_BIT-WORD_BIT)      ; mm2=in3H=(32 33)
+        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in3L=(30 31)
+        cvtpi2ps  xmm0,mm2                      ; xmm0=(32 33 ** **)
+        cvtpi2ps  xmm3,mm0                      ; xmm3=(30 31 ** **)
+
+        punpckhwd mm7,mm5                       ; mm7=(** 52 ** 53)
+        punpcklwd mm5,mm5                       ; mm5=(50 50 51 51)
+        punpckhwd mm3,mm1                       ; mm3=(** 72 ** 73)
+        punpcklwd mm1,mm1                       ; mm1=(70 70 71 71)
+
+        movlhps   xmm2,xmm4                     ; xmm2=in1=(10 11 12 13)
+        movlhps   xmm3,xmm0                     ; xmm3=in3=(30 31 32 33)
+
+        psrad     mm7,(DWORD_BIT-WORD_BIT)      ; mm7=in5H=(52 53)
+        psrad     mm5,(DWORD_BIT-WORD_BIT)      ; mm5=in5L=(50 51)
+        cvtpi2ps  xmm4,mm7                      ; xmm4=(52 53 ** **)
+        cvtpi2ps  xmm5,mm5                      ; xmm5=(50 51 ** **)
+        psrad     mm3,(DWORD_BIT-WORD_BIT)      ; mm3=in7H=(72 73)
+        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in7L=(70 71)
+        cvtpi2ps  xmm0,mm3                      ; xmm0=(72 73 ** **)
+        cvtpi2ps  xmm1,mm1                      ; xmm1=(70 71 ** **)
+
+        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movlhps   xmm5,xmm4                     ; xmm5=in5=(50 51 52 53)
+        movlhps   xmm1,xmm0                     ; xmm1=in7=(70 71 72 73)
+        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movaps  xmm4,xmm2
+        movaps  xmm0,xmm5
+        addps   xmm2,xmm1               ; xmm2=z11
+        addps   xmm5,xmm3               ; xmm5=z13
+        subps   xmm4,xmm1               ; xmm4=z12
+        subps   xmm0,xmm3               ; xmm0=z10
+
+        movaps  xmm1,xmm2
+        subps   xmm2,xmm5
+        addps   xmm1,xmm5               ; xmm1=tmp7
+
+        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
+
+        movaps  xmm3,xmm0
+        addps   xmm0,xmm4
+        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
+        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
+        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
+        addps   xmm3,xmm0               ; xmm3=tmp12
+        subps   xmm4,xmm0               ; xmm4=tmp10
+
+        ; -- Final output stage
+
+        subps   xmm3,xmm1               ; xmm3=tmp6
+        movaps  xmm5,xmm6
+        movaps  xmm0,xmm7
+        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
+        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
+        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
+        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
+        subps   xmm2,xmm3               ; xmm2=tmp5
+
+        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
+        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
+        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
+        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
+        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
+        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
+
+        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+        addps   xmm4,xmm2               ; xmm4=tmp4
+        movaps  xmm0,xmm7
+        movaps  xmm3,xmm5
+        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
+        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
+        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
+        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
+
+        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
+        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
+        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
+        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
+        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
+        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
+
+        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
+        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
+        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
+        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
+
+        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
+        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
+        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
+        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
+        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
+
+        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
+        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
+        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
+        dec     ecx                                     ; ctr
+        jnz     near .columnloop
+
+        ; -- Prefetch the next coefficient block
+        ; esi advanced DCTSIZE/4 x 4 coefficients; the -8 assumes DCTSIZE = 8.
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+        ; Each iteration turns 4 workspace rows into 4 rows of 8 output samples.
+        mov     eax, [original_ebp]     ; reload frame base (zero test reused eax)
+        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.rowloop:
+
+        ; -- Even part
+        ; The workspace is transposed: each load is one column of 4 rows.
+        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+        movaps  xmm4,xmm0
+        movaps  xmm5,xmm1
+        subps   xmm0,xmm2               ; xmm0=tmp11
+        subps   xmm1,xmm3
+        addps   xmm4,xmm2               ; xmm4=tmp10
+        addps   xmm5,xmm3               ; xmm5=tmp13
+
+        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
+        subps   xmm1,xmm5               ; xmm1=tmp12
+
+        movaps  xmm6,xmm4
+        movaps  xmm7,xmm0
+        subps   xmm4,xmm5               ; xmm4=tmp3
+        subps   xmm0,xmm1               ; xmm0=tmp2
+        addps   xmm6,xmm5               ; xmm6=tmp0
+        addps   xmm7,xmm1               ; xmm7=tmp1
+
+        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+        ; -- Odd part
+
+        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+        movaps  xmm4,xmm2
+        movaps  xmm0,xmm5
+        addps   xmm2,xmm1               ; xmm2=z11
+        addps   xmm5,xmm3               ; xmm5=z13
+        subps   xmm4,xmm1               ; xmm4=z12
+        subps   xmm0,xmm3               ; xmm0=z10
+
+        movaps  xmm1,xmm2
+        subps   xmm2,xmm5
+        addps   xmm1,xmm5               ; xmm1=tmp7
+
+        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
+
+        movaps  xmm3,xmm0
+        addps   xmm0,xmm4
+        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
+        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
+        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
+        addps   xmm3,xmm0               ; xmm3=tmp12
+        subps   xmm4,xmm0               ; xmm4=tmp10
+
+        ; -- Final output stage
+
+        subps   xmm3,xmm1               ; xmm3=tmp6
+        movaps  xmm5,xmm6
+        movaps  xmm0,xmm7
+        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
+        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
+        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
+        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
+        subps   xmm2,xmm3               ; xmm2=tmp5
+
+        movaps  xmm1,[GOTOFF(ebx,PD_0_125)]     ; xmm1=[PD_0_125]
+
+        mulps   xmm6,xmm1               ; descale(1/8)
+        mulps   xmm7,xmm1               ; descale(1/8)
+        mulps   xmm5,xmm1               ; descale(1/8)
+        mulps   xmm0,xmm1               ; descale(1/8)
+        ; cvtps2pi converts only the low two floats, so split off the high halves.
+        movhlps   xmm3,xmm6
+        movhlps   xmm1,xmm7
+        cvtps2pi  mm0,xmm6              ; round to int32, mm0=data0L=(00 10)
+        cvtps2pi  mm1,xmm7              ; round to int32, mm1=data1L=(01 11)
+        cvtps2pi  mm2,xmm3              ; round to int32, mm2=data0H=(20 30)
+        cvtps2pi  mm3,xmm1              ; round to int32, mm3=data1H=(21 31)
+        packssdw  mm0,mm2               ; mm0=data0=(00 10 20 30)
+        packssdw  mm1,mm3               ; mm1=data1=(01 11 21 31)
+
+        movhlps   xmm6,xmm5
+        movhlps   xmm7,xmm0
+        cvtps2pi  mm4,xmm5              ; round to int32, mm4=data7L=(07 17)
+        cvtps2pi  mm5,xmm0              ; round to int32, mm5=data6L=(06 16)
+        cvtps2pi  mm6,xmm6              ; round to int32, mm6=data7H=(27 37)
+        cvtps2pi  mm7,xmm7              ; round to int32, mm7=data6H=(26 36)
+        packssdw  mm4,mm6               ; mm4=data7=(07 17 27 37)
+        packssdw  mm5,mm7               ; mm5=data6=(06 16 26 36)
+        ; Saturating packs narrow int32 -> int16 (packssdw) -> int8 (packsswb).
+        packsswb  mm0,mm5               ; mm0=(00 10 20 30 06 16 26 36)
+        packsswb  mm1,mm4               ; mm1=(01 11 21 31 07 17 27 37)
+
+        movaps  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
+        movaps  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+        movaps  xmm6,[GOTOFF(ebx,PD_0_125)]     ; xmm6=[PD_0_125]
+
+        addps   xmm4,xmm2               ; xmm4=tmp4
+        movaps  xmm5,xmm3
+        movaps  xmm0,xmm1
+        addps   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
+        addps   xmm1,xmm4               ; xmm1=data4=(04 14 24 34)
+        subps   xmm5,xmm2               ; xmm5=data5=(05 15 25 35)
+        subps   xmm0,xmm4               ; xmm0=data3=(03 13 23 33)
+
+        mulps   xmm3,xmm6               ; descale(1/8)
+        mulps   xmm1,xmm6               ; descale(1/8)
+        mulps   xmm5,xmm6               ; descale(1/8)
+        mulps   xmm0,xmm6               ; descale(1/8)
+
+        movhlps   xmm7,xmm3
+        movhlps   xmm2,xmm1
+        cvtps2pi  mm2,xmm3              ; round to int32, mm2=data2L=(02 12)
+        cvtps2pi  mm3,xmm1              ; round to int32, mm3=data4L=(04 14)
+        cvtps2pi  mm6,xmm7              ; round to int32, mm6=data2H=(22 32)
+        cvtps2pi  mm7,xmm2              ; round to int32, mm7=data4H=(24 34)
+        packssdw  mm2,mm6               ; mm2=data2=(02 12 22 32)
+        packssdw  mm3,mm7               ; mm3=data4=(04 14 24 34)
+
+        movhlps   xmm4,xmm5
+        movhlps   xmm6,xmm0
+        cvtps2pi  mm5,xmm5              ; round to int32, mm5=data5L=(05 15)
+        cvtps2pi  mm4,xmm0              ; round to int32, mm4=data3L=(03 13)
+        cvtps2pi  mm6,xmm4              ; round to int32, mm6=data5H=(25 35)
+        cvtps2pi  mm7,xmm6              ; round to int32, mm7=data3H=(23 33)
+        packssdw  mm5,mm6               ; mm5=data5=(05 15 25 35)
+        packssdw  mm4,mm7               ; mm4=data3=(03 13 23 33)
+
+        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]
+
+        packsswb  mm2,mm3               ; mm2=(02 12 22 32 04 14 24 34)
+        packsswb  mm4,mm5               ; mm4=(03 13 23 33 05 15 25 35)
+        ; Re-center the signed bytes around CENTERJSAMPLE (wrapping byte add).
+        paddb     mm0,mm6
+        paddb     mm1,mm6
+        paddb     mm2,mm6
+        paddb     mm4,mm6
+
+        movq      mm7,mm0               ; transpose coefficients(phase 1)
+        punpcklbw mm0,mm1               ; mm0=(00 01 10 11 20 21 30 31)
+        punpckhbw mm7,mm1               ; mm7=(06 07 16 17 26 27 36 37)
+        movq      mm3,mm2               ; transpose coefficients(phase 1)
+        punpcklbw mm2,mm4               ; mm2=(02 03 12 13 22 23 32 33)
+        punpckhbw mm3,mm4               ; mm3=(04 05 14 15 24 25 34 35)
+
+        movq      mm5,mm0               ; transpose coefficients(phase 2)
+        punpcklwd mm0,mm2               ; mm0=(00 01 02 03 10 11 12 13)
+        punpckhwd mm5,mm2               ; mm5=(20 21 22 23 30 31 32 33)
+        movq      mm6,mm3               ; transpose coefficients(phase 2)
+        punpcklwd mm3,mm7               ; mm3=(04 05 06 07 14 15 16 17)
+        punpckhwd mm6,mm7               ; mm6=(24 25 26 27 34 35 36 37)
+
+        movq      mm1,mm0               ; transpose coefficients(phase 3)
+        punpckldq mm0,mm3               ; mm0=(00 01 02 03 04 05 06 07)
+        punpckhdq mm1,mm3               ; mm1=(10 11 12 13 14 15 16 17)
+        movq      mm4,mm5               ; transpose coefficients(phase 3)
+        punpckldq mm5,mm6               ; mm5=(20 21 22 23 24 25 26 27)
+        punpckhdq mm4,mm6               ; mm4=(30 31 32 33 34 35 36 37)
+
+        pushpic ebx                     ; save GOT address
+        ; ebx is borrowed as a second row pointer until poppic below.
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+        poppic  ebx                     ; restore GOT address
+
+        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
+        add     edi, byte 4*SIZEOF_JSAMPROW
+        dec     ecx                             ; ctr
+        jnz     near .rowloop
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctflt-sse2-64.asm b/simd/jidctflt-sse2-64.asm
new file mode 100644
index 0000000..3f53501
--- /dev/null
+++ b/simd/jidctflt-sse2-64.asm
@@ -0,0 +1,483 @@
+;
+; jidctflt-sse2-64.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2     ; dst %1=(a0 a1 a2 a3), src %2=(b0 b1 b2 b3) -> %1=(a0 a1 b0 b1)
+        shufps  %1, %2, 01000100b       ; 2-bit selectors, low first: %1[0], %1[1], %2[0], %2[1]
+%endmacro
+
+%macro  unpckhps2 2     ; dst %1=(a0 a1 a2 a3), src %2=(b0 b1 b2 b3) -> %1=(a2 a3 b2 b3)
+        shufps  %1, %2, 11101110b       ; 2-bit selectors, low first: %1[2], %1[3], %2[2], %2[3]
+%endmacro
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):           ; constant table; each float entry is replicated 4x to fill one XMMWORD
+
+PD_1_414        times 4 dd  1.414213562373095048801689  ; sqrt(2)
+PD_1_847        times 4 dd  1.847759065022573512256366  ; 2*cos(pi/8)
+PD_1_082        times 4 dd  1.082392200292393968799446  ; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613       times 4 dd -2.613125929752753055713286  ; -2*(cos(pi/8)+cos(3*pi/8))
+PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3) = 1.5*2^26; single-precision spacing at this magnitude is 8
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE       ; byte bias added to the packed samples (paddb) in pass 2
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp    rbp+0           ; slot at the aligned frame base; the prologue stores the pre-alignment rsp here
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM], located directly below rbp
+%define WK_NUM          2               ; number of XMMWORD scratch slots in wk[]
+%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2], located directly below wk(0)
+
+        align   16
+        global  EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
+        sub     rsp, byte 4                     ; NOTE(review): only 4 bytes, yet 8 are stored at [rsp] below -- presumably relies on rsp being 8-byte aligned; confirm
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align down to 128 bits
+        mov     [rsp],rax                       ; keep the pre-alignment rsp at [original_rbp]
+        mov     rbp,rsp                         ; rbp = aligned frame base
+        lea     rsp, [workspace]                ; move rsp below wk[] and workspace[]
+        collect_args                            ; macro not defined in this file; args are read from r10-r13 below
+        push    rbx                             ; rbx serves as a row pointer in pass 2
+
+        ; ---- Pass 1: process columns from input, store into work array.
+
+        mov     rdx, r10                ; quantptr
+        mov     rsi, r11                ; inptr
+        lea     rdi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     rcx, DCTSIZE/4                          ; ctr
+.columnloop:                            ; one iteration = 4 input columns
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]      ; cheap pre-check: one dword each from rows 1 and 2
+        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        jnz     near .columnDCT                 ; something nonzero: do the full IDCT
+
+        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]        ; load 64 bits from each of rows 1..7
+        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1,xmm2               ; OR rows 1..7 together into xmm1
+        por     xmm3,xmm4
+        por     xmm5,xmm6
+        por     xmm1,xmm3
+        por     xmm5,xmm7
+        por     xmm1,xmm5
+        packsswb xmm1,xmm1              ; signed-saturating pack: a nonzero word stays a nonzero byte
+        movd    eax,xmm1                ; eax = the 4 packed bytes (32-bit write zero-extends into rax)
+        test    rax,rax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
+        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
+
+        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]        ; scale by row 0 of the quantptr table
+
+        movaps  xmm1,xmm0
+        movaps  xmm2,xmm0
+        movaps  xmm3,xmm0
+
+        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
+        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
+        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
+        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0     ; workspace row k = 8 copies of column k's scaled DC term
+        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+        jmp     near .nextcolumn
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
+        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
+        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
+        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
+        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
+
+        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
+        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
+        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
+        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
+        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
+        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
+
+        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]      ; scale in0/in2/in4/in6 by the quantptr table
+        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movaps  xmm4,xmm0
+        movaps  xmm5,xmm1
+        subps   xmm0,xmm2               ; xmm0=tmp11
+        subps   xmm1,xmm3               ; xmm1=in2-in6
+        addps   xmm4,xmm2               ; xmm4=tmp10
+        addps   xmm5,xmm3               ; xmm5=tmp13
+
+        mulps   xmm1,[rel PD_1_414]     ; xmm1=(in2-in6)*1.414213562
+        subps   xmm1,xmm5               ; xmm1=tmp12
+
+        movaps  xmm6,xmm4
+        movaps  xmm7,xmm0
+        subps   xmm4,xmm5               ; xmm4=tmp3
+        subps   xmm0,xmm1               ; xmm0=tmp2
+        addps   xmm6,xmm5               ; xmm6=tmp0
+        addps   xmm7,xmm1               ; xmm7=tmp1
+
+        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+        ; -- Odd part
+
+        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
+        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
+        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
+        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
+        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
+        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
+
+        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
+        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
+        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
+        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
+        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
+        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
+
+        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]      ; scale in1/in3/in5/in7 by the quantptr table
+        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movaps  xmm4,xmm2
+        movaps  xmm0,xmm5
+        addps   xmm2,xmm1               ; xmm2=z11
+        addps   xmm5,xmm3               ; xmm5=z13
+        subps   xmm4,xmm1               ; xmm4=z12
+        subps   xmm0,xmm3               ; xmm0=z10
+
+        movaps  xmm1,xmm2
+        subps   xmm2,xmm5               ; xmm2=z11-z13
+        addps   xmm1,xmm5               ; xmm1=tmp7
+
+        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
+
+        movaps  xmm3,xmm0
+        addps   xmm0,xmm4               ; xmm0=z10+z12
+        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
+        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
+        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
+        addps   xmm3,xmm0               ; xmm3=tmp12
+        subps   xmm4,xmm0               ; xmm4=tmp10
+
+        ; -- Final output stage
+
+        subps   xmm3,xmm1               ; xmm3=tmp6
+        movaps  xmm5,xmm6
+        movaps  xmm0,xmm7
+        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
+        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
+        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
+        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
+        subps   xmm2,xmm3               ; xmm2=tmp5
+
+        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
+        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
+        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
+        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
+        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
+        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
+
+        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+        addps   xmm4,xmm2               ; xmm4=tmp4
+        movaps  xmm0,xmm7
+        movaps  xmm3,xmm5
+        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
+        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
+        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
+        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
+
+        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
+        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
+        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
+        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
+        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
+        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
+
+        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
+        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
+        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
+        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
+
+        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
+        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
+        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
+        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
+        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
+
+        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+        add     rsi, byte 4*SIZEOF_JCOEF                ; inptr: next 4 columns
+        add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr: next 4 columns
+        add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr: next 4 workspace rows
+        dec     rcx                                     ; ctr
+        jnz     near .columnloop
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]    ; rsi has advanced DCTSIZE coefs; with DCTSIZE=8 this is the first byte past the block
+        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     rax, [original_rbp]     ; NOTE: dead load -- rax is overwritten by "mov eax, r13d" below before being read
+        lea     rsi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     rdi, r12        ; output_buf (JSAMPROW *)
+        mov     eax, r13d       ; output_col (32-bit write zero-extends into rax)
+        mov     rcx, DCTSIZE/4                          ; ctr
+.rowloop:                               ; one iteration = 4 output rows
+
+        ; -- Even part
+
+        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+        movaps  xmm4,xmm0
+        movaps  xmm5,xmm1
+        subps   xmm0,xmm2               ; xmm0=tmp11
+        subps   xmm1,xmm3               ; xmm1=(block 2)-(block 6)
+        addps   xmm4,xmm2               ; xmm4=tmp10
+        addps   xmm5,xmm3               ; xmm5=tmp13
+
+        mulps   xmm1,[rel PD_1_414]     ; xmm1=((block 2)-(block 6))*1.414213562
+        subps   xmm1,xmm5               ; xmm1=tmp12
+
+        movaps  xmm6,xmm4
+        movaps  xmm7,xmm0
+        subps   xmm4,xmm5               ; xmm4=tmp3
+        subps   xmm0,xmm1               ; xmm0=tmp2
+        addps   xmm6,xmm5               ; xmm6=tmp0
+        addps   xmm7,xmm1               ; xmm7=tmp1
+
+        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+        ; -- Odd part
+
+        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+        movaps  xmm4,xmm2
+        movaps  xmm0,xmm5
+        addps   xmm2,xmm1               ; xmm2=z11
+        addps   xmm5,xmm3               ; xmm5=z13
+        subps   xmm4,xmm1               ; xmm4=z12
+        subps   xmm0,xmm3               ; xmm0=z10
+
+        movaps  xmm1,xmm2
+        subps   xmm2,xmm5               ; xmm2=z11-z13
+        addps   xmm1,xmm5               ; xmm1=tmp7
+
+        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
+
+        movaps  xmm3,xmm0
+        addps   xmm0,xmm4               ; xmm0=z10+z12
+        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
+        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
+        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
+        addps   xmm3,xmm0               ; xmm3=tmp12
+        subps   xmm4,xmm0               ; xmm4=tmp10
+
+        ; -- Final output stage
+
+        subps   xmm3,xmm1               ; xmm3=tmp6
+        movaps  xmm5,xmm6
+        movaps  xmm0,xmm7
+        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
+        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
+        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
+        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
+        subps   xmm2,xmm3               ; xmm2=tmp5
+
+        movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
+        pcmpeqd xmm3,xmm3               ; xmm3=all ones
+        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
+        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
+        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
+        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
+
+        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
+        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
+
+        addps   xmm4,xmm2               ; xmm4=tmp4
+        movaps  xmm7,xmm1
+        movaps  xmm5,xmm3
+        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
+        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
+        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
+        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
+
+        movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
+        pcmpeqd xmm4,xmm4               ; xmm4=all ones
+        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
+        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
+        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
+        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
+
+        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
+
+        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+        paddb     xmm6,xmm2     ; add CENTERJSAMPLE to every packed byte
+        paddb     xmm1,xmm2
+
+        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
+        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
+        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; rdx = row pointer 0 of this group
+        mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]   ; rbx = row pointer 2
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6       ; row 0 <- (00..07), written at output_col
+        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7       ; row 2 <- (20..27)
+        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; rdx = row pointer 1
+        mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]   ; rbx = row pointer 3
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5       ; row 1 <- (10..17)
+        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3       ; row 3 <- (30..37)
+
+        add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
+        add     rdi, byte 4*SIZEOF_JSAMPROW     ; next 4 row pointers
+        dec     rcx                             ; ctr
+        jnz     near .rowloop
+
+        pop     rbx
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- pre-alignment rsp saved at [original_rbp]
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctflt-sse2.asm b/simd/jidctflt-sse2.asm
new file mode 100644
index 0000000..be899b3
--- /dev/null
+++ b/simd/jidctflt-sse2.asm
@@ -0,0 +1,498 @@
+;
+; jidctflt-sse2.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2     ; dst %1=(a0 a1 a2 a3), src %2=(b0 b1 b2 b3) -> %1=(a0 a1 b0 b1)
+        shufps  %1, %2, 01000100b       ; 2-bit selectors, low first: %1[0], %1[1], %2[0], %2[1]
+%endmacro
+
+%macro  unpckhps2 2     ; dst %1=(a0 a1 a2 a3), src %2=(b0 b1 b2 b3) -> %1=(a2 a3 b2 b3)
+        shufps  %1, %2, 11101110b       ; 2-bit selectors, low first: %1[2], %1[3], %2[2], %2[3]
+%endmacro
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):           ; constant table; each float entry is replicated 4x to fill one XMMWORD
+
+PD_1_414        times 4 dd  1.414213562373095048801689  ; sqrt(2)
+PD_1_847        times 4 dd  1.847759065022573512256366  ; 2*cos(pi/8)
+PD_1_082        times 4 dd  1.082392200292393968799446  ; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613       times 4 dd -2.613125929752753055713286  ; -2*(cos(pi/8)+cos(3*pi/8))
+PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3) = 1.5*2^26; single-precision spacing at this magnitude is 8
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE       ; 16 copies of the CENTERJSAMPLE byte
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)    (b)+8           ; void *dct_table (b = esp right after "push ebp": +0 saved ebp, +4 return address)
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+%define original_ebp    ebp+0           ; slot at the aligned frame base; the prologue stores the pre-alignment esp here
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM], located directly below ebp
+%define WK_NUM          2               ; number of XMMWORD scratch slots in wk[]
+%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2], located directly below wk(0)
+
+        align   16
+        global  EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [workspace]
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input, store into work array.
+
+;       mov     eax, [original_ebp]
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        jnz     near .columnDCT
+
+        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     xmm1,xmm2
+        por     xmm3,xmm4
+        por     xmm5,xmm6
+        por     xmm1,xmm3
+        por     xmm5,xmm7
+        por     xmm1,xmm5
+        packsswb xmm1,xmm1
+        movd    eax,xmm1
+        test    eax,eax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
+        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
+
+        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movaps  xmm1,xmm0
+        movaps  xmm2,xmm0
+        movaps  xmm3,xmm0
+
+        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
+        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
+        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
+        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+        jmp     near .nextcolumn
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
+        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
+        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
+        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
+        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
+
+        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
+        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
+        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
+        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
+        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
+        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
+
+        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movaps  xmm4,xmm0
+        movaps  xmm5,xmm1
+        subps   xmm0,xmm2               ; xmm0=tmp11
+        subps   xmm1,xmm3
+        addps   xmm4,xmm2               ; xmm4=tmp10
+        addps   xmm5,xmm3               ; xmm5=tmp13
+
+        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
+        subps   xmm1,xmm5               ; xmm1=tmp12
+
+        movaps  xmm6,xmm4
+        movaps  xmm7,xmm0
+        subps   xmm4,xmm5               ; xmm4=tmp3
+        subps   xmm0,xmm1               ; xmm0=tmp2
+        addps   xmm6,xmm5               ; xmm6=tmp0
+        addps   xmm7,xmm1               ; xmm7=tmp1
+
+        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+        ; -- Odd part
+
+        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
+        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
+        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
+        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
+        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
+        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
+
+        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
+        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
+        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
+        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
+        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
+        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
+
+        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+        movaps  xmm4,xmm2
+        movaps  xmm0,xmm5
+        addps   xmm2,xmm1               ; xmm2=z11
+        addps   xmm5,xmm3               ; xmm5=z13
+        subps   xmm4,xmm1               ; xmm4=z12
+        subps   xmm0,xmm3               ; xmm0=z10
+
+        movaps  xmm1,xmm2
+        subps   xmm2,xmm5
+        addps   xmm1,xmm5               ; xmm1=tmp7
+
+        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
+
+        movaps  xmm3,xmm0
+        addps   xmm0,xmm4
+        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
+        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
+        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
+        addps   xmm3,xmm0               ; xmm3=tmp12
+        subps   xmm4,xmm0               ; xmm4=tmp10
+
+        ; -- Final output stage
+
+        subps   xmm3,xmm1               ; xmm3=tmp6
+        movaps  xmm5,xmm6
+        movaps  xmm0,xmm7
+        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
+        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
+        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
+        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
+        subps   xmm2,xmm3               ; xmm2=tmp5
+
+        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
+        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
+        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
+        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
+        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
+        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
+
+        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+        addps   xmm4,xmm2               ; xmm4=tmp4
+        movaps  xmm0,xmm7
+        movaps  xmm3,xmm5
+        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
+        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
+        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
+        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
+
+        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
+        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
+        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
+        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
+        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
+        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
+
+        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
+        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
+        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
+        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
+        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
+        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
+
+        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
+        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
+        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
+        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
+        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
+        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
+
+        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
+        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
+        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
+        dec     ecx                                     ; ctr
+        jnz     near .columnloop
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     eax, [original_ebp]
+        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.rowloop:
+
+        ; -- Even part
+
+        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+        movaps  xmm4,xmm0
+        movaps  xmm5,xmm1
+        subps   xmm0,xmm2               ; xmm0=tmp11
+        subps   xmm1,xmm3
+        addps   xmm4,xmm2               ; xmm4=tmp10
+        addps   xmm5,xmm3               ; xmm5=tmp13
+
+        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
+        subps   xmm1,xmm5               ; xmm1=tmp12
+
+        movaps  xmm6,xmm4
+        movaps  xmm7,xmm0
+        subps   xmm4,xmm5               ; xmm4=tmp3
+        subps   xmm0,xmm1               ; xmm0=tmp2
+        addps   xmm6,xmm5               ; xmm6=tmp0
+        addps   xmm7,xmm1               ; xmm7=tmp1
+
+        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+        ; -- Odd part
+
+        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+        movaps  xmm4,xmm2
+        movaps  xmm0,xmm5
+        addps   xmm2,xmm1               ; xmm2=z11
+        addps   xmm5,xmm3               ; xmm5=z13
+        subps   xmm4,xmm1               ; xmm4=z12
+        subps   xmm0,xmm3               ; xmm0=z10
+
+        movaps  xmm1,xmm2
+        subps   xmm2,xmm5
+        addps   xmm1,xmm5               ; xmm1=tmp7
+
+        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
+
+        movaps  xmm3,xmm0
+        addps   xmm0,xmm4
+        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
+        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
+        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
+        addps   xmm3,xmm0               ; xmm3=tmp12
+        subps   xmm4,xmm0               ; xmm4=tmp10
+
+        ; -- Final output stage
+
+        subps   xmm3,xmm1               ; xmm3=tmp6
+        movaps  xmm5,xmm6
+        movaps  xmm0,xmm7
+        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
+        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
+        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
+        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
+        subps   xmm2,xmm3               ; xmm2=tmp5
+
+        movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
+        pcmpeqd xmm3,xmm3
+        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
+        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
+        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
+        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
+
+        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
+        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
+
+        addps   xmm4,xmm2               ; xmm4=tmp4
+        movaps  xmm7,xmm1
+        movaps  xmm5,xmm3
+        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
+        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
+        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
+        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
+
+        movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
+        pcmpeqd xmm4,xmm4
+        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
+        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
+        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
+        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
+
+        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
+
+        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+        paddb     xmm6,xmm2
+        paddb     xmm1,xmm2
+
+        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
+        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
+        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+        pushpic ebx                     ; save GOT address
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+        poppic  ebx                     ; restore GOT address
+
+        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
+        add     edi, byte 4*SIZEOF_JSAMPROW
+        dec     ecx                             ; ctr
+        jnz     near .rowloop
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctfst-altivec.c b/simd/jidctfst-altivec.c
new file mode 100644
index 0000000..67cbe84
--- /dev/null
+++ b/simd/jidctfst-altivec.c
@@ -0,0 +1,257 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_1_082 277              /* FIX(1.082392200) */
+#define F_1_414 362              /* FIX(1.414213562) */
+#define F_1_847 473              /* FIX(1.847759065) */
+#define F_2_613 669              /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS 8
+#define PASS1_BITS 2
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
+
+#define DO_IDCT(in)  \
+{  /* 1-D IDCT of vectors in##0..in##7; results land in out0..out7 */  \
+  /* Even part: inputs 0, 2, 4, 6 -> tmp0..tmp3 */  \
+  \
+  tmp10 = vec_add(in##0, in##4);  \
+  tmp11 = vec_sub(in##0, in##4);  \
+  tmp13 = vec_add(in##2, in##6);  \
+  /* tmp12 = (in2 - in6) * 1.414213562 - tmp13 */  \
+  tmp12 = vec_sub(in##2, in##6);  \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
+  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero);  \
+  tmp12 = vec_sub(tmp12, tmp13);  \
+  \
+  tmp0 = vec_add(tmp10, tmp13);  \
+  tmp3 = vec_sub(tmp10, tmp13);  \
+  tmp1 = vec_add(tmp11, tmp12);  \
+  tmp2 = vec_sub(tmp11, tmp12);  \
+  \
+  /* Odd part: inputs 1, 3, 5, 7 -> tmp4..tmp7 */  \
+  /* "s" suffix = pre-shifted for vec_madds; z10 is also kept unscaled */  \
+  z13 = vec_add(in##5, in##3);  \
+  z10 = vec_sub(in##5, in##3);  \
+  z10s = vec_sl(z10, pre_multiply_scale_bits);  \
+  z11 = vec_add(in##1, in##7);  \
+  z12s = vec_sub(in##1, in##7);  \
+  z12s = vec_sl(z12s, pre_multiply_scale_bits);  \
+  /* tmp11 = (z11 - z13) * 1.414213562 */  \
+  tmp11 = vec_sub(z11, z13);  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
+  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero);  \
+  \
+  tmp7 = vec_add(z11, z13);  \
+  \
+  /* To avoid overflow...  \
+   *  \
+   * (Original)  \
+   * tmp12 = -2.613125930 * z10 + z5;  \
+   *  \
+   * (This implementation)  \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5;  \
+   *       = -1.613125930 * z10 - z10 + z5;  \
+   */  \
+  \
+  z5 = vec_add(z10s, z12s);  \
+  z5 = vec_madds(z5, pw_F1847, pw_zero);  \
+  /* z5 is the vec_madds addend for tmp12; "- z10" is the split-off -1 term */  \
+  tmp10 = vec_madds(z12s, pw_F1082, pw_zero);  \
+  tmp10 = vec_sub(tmp10, z5);  \
+  tmp12 = vec_madds(z10s, pw_MF1613, z5);  \
+  tmp12 = vec_sub(tmp12, z10);  \
+  \
+  tmp6 = vec_sub(tmp12, tmp7);  \
+  tmp5 = vec_sub(tmp11, tmp6);  \
+  tmp4 = vec_add(tmp10, tmp5);  \
+  /* Final output stage: combine even (tmp0..3) and odd (tmp4..7) halves */  \
+  out0 = vec_add(tmp0, tmp7);  \
+  out1 = vec_add(tmp1, tmp6);  \
+  out2 = vec_add(tmp2, tmp5);  \
+  out3 = vec_sub(tmp3, tmp4);  \
+  out4 = vec_add(tmp3, tmp4);  \
+  out5 = vec_sub(tmp2, tmp5);  \
+  out6 = vec_sub(tmp1, tmp6);  \
+  out7 = vec_sub(tmp0, tmp7);  \
+}
+
+
+void
+jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{  /* Dequantize one coefficient block and run the two-pass fast IDCT on it */
+  short *dct_table = (short *)dct_table_;  /* multiplier table, read as shorts */
+  int *outptr;  /* output row viewed as ints for the vec_ste stores */
+
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z5, z10, z10s, z11, z12s, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector signed char outb;
+
+  /* Constants (negate after shifting: << on a negative value is undefined) */
+  __vector short pw_zero = { __8X(0) },
+    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+    pw_MF1613 = { __8X(-(F_1_613 << CONST_SHIFT)) },
+    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+  __vector unsigned short
+    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+    pass1_bits3 = { __8X(PASS1_BITS + 3) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+  /* OR col1..col7 together so a single compare detects all-zero AC terms */
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+  /* Dequantize col0 up front; both branches below use it */
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero: row k is element k of col0, replicated */
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+    /* Dequantize col1..col7, then run the column IDCT and transpose */
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row);
+  /* Descale: arithmetic right shift by PASS1_BITS + 3 */
+  out0 = vec_sra(out0, pass1_bits3);
+  out1 = vec_sra(out1, pass1_bits3);
+  out2 = vec_sra(out2, pass1_bits3);
+  out3 = vec_sra(out3, pass1_bits3);
+  out4 = vec_sra(out4, pass1_bits3);
+  out5 = vec_sra(out5, pass1_bits3);
+  out6 = vec_sra(out6, pass1_bits3);
+  out7 = vec_sra(out7, pass1_bits3);
+
+  TRANSPOSE(out, col);
+  /* Per row: pack to bytes (saturating), recenter, store 2 x 32 bits */
+  outb = vec_packs(col0, col0);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/simd/jidctfst-mmx.asm b/simd/jidctfst-mmx.asm
new file mode 100644
index 0000000..0e3963d
--- /dev/null
+++ b/simd/jidctfst-mmx.asm
@@ -0,0 +1,500 @@
+;
+; jidctfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      8       ; 14 is also OK.
+%define PASS1_BITS      2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ     277             ; FIX(1.082392200)
+F_1_414 equ     362             ; FIX(1.414213562)
+F_1_847 equ     473             ; FIX(1.847759065)
+F_2_613 equ     669             ; FIX(2.613125930)
+F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
+F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
+F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+        alignz  16
+        global  EXTN(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414        times 4 dw  F_1_414 << CONST_SHIFT
+PW_F1847        times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613       times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082        times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2
+%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+        align   16
+        global  EXTN(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [workspace]
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input, store into work array.
+
+;       mov     eax, [original_ebp]
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+        lea     edi, [workspace]                        ; JCOEF *wsptr
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        jnz     short .columnDCT
+
+        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     mm1,mm0
+        packsswb mm1,mm1
+        movd    eax,mm1
+        test    eax,eax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
+        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
+        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
+
+        movq      mm1,mm0
+        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
+        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
+        movq      mm3,mm2
+        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
+        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+        jmp     near .nextcolumn
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movq    mm4,mm0
+        movq    mm5,mm1
+        psubw   mm0,mm2                 ; mm0=tmp11
+        psubw   mm1,mm3
+        paddw   mm4,mm2                 ; mm4=tmp10
+        paddw   mm5,mm3                 ; mm5=tmp13
+
+        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
+        psubw   mm1,mm5                 ; mm1=tmp12
+
+        movq    mm6,mm4
+        movq    mm7,mm0
+        psubw   mm4,mm5                 ; mm4=tmp3
+        psubw   mm0,mm1                 ; mm0=tmp2
+        paddw   mm6,mm5                 ; mm6=tmp0
+        paddw   mm7,mm1                 ; mm7=tmp1
+
+        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+        ; -- Odd part
+
+        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movq    mm4,mm2
+        movq    mm0,mm5
+        psubw   mm2,mm1                 ; mm2=z12
+        psubw   mm5,mm3                 ; mm5=z10
+        paddw   mm4,mm1                 ; mm4=z11
+        paddw   mm0,mm3                 ; mm0=z13
+
+        movq    mm1,mm5                 ; mm1=z10(unscaled)
+        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
+        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
+
+        movq    mm3,mm4
+        psubw   mm4,mm0
+        paddw   mm3,mm0                 ; mm3=tmp7
+
+        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movq    mm0,mm5
+        paddw   mm5,mm2
+        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
+        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
+        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
+        psubw   mm0,mm1
+        psubw   mm2,mm5                 ; mm2=tmp10
+        paddw   mm0,mm5                 ; mm0=tmp12
+
+        ; -- Final output stage
+
+        psubw   mm0,mm3                 ; mm0=tmp6
+        movq    mm1,mm6
+        movq    mm5,mm7
+        paddw   mm6,mm3                 ; mm6=data0=(00 01 02 03)
+        paddw   mm7,mm0                 ; mm7=data1=(10 11 12 13)
+        psubw   mm1,mm3                 ; mm1=data7=(70 71 72 73)
+        psubw   mm5,mm0                 ; mm5=data6=(60 61 62 63)
+        psubw   mm4,mm0                 ; mm4=tmp5
+
+        movq      mm3,mm6               ; transpose coefficients(phase 1)
+        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
+        punpckhwd mm3,mm7               ; mm3=(02 12 03 13)
+        movq      mm0,mm5               ; transpose coefficients(phase 1)
+        punpcklwd mm5,mm1               ; mm5=(60 70 61 71)
+        punpckhwd mm0,mm1               ; mm0=(62 72 63 73)
+
+        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
+        movq    mm1, MMWORD [wk(1)]     ; mm1=tmp3
+
+        movq    MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
+        movq    MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
+
+        paddw   mm2,mm4                 ; mm2=tmp4
+        movq    mm5,mm7
+        movq    mm0,mm1
+        paddw   mm7,mm4                 ; mm7=data2=(20 21 22 23)
+        paddw   mm1,mm2                 ; mm1=data4=(40 41 42 43)
+        psubw   mm5,mm4                 ; mm5=data5=(50 51 52 53)
+        psubw   mm0,mm2                 ; mm0=data3=(30 31 32 33)
+
+        movq      mm4,mm7               ; transpose coefficients(phase 1)
+        punpcklwd mm7,mm0               ; mm7=(20 30 21 31)
+        punpckhwd mm4,mm0               ; mm4=(22 32 23 33)
+        movq      mm2,mm1               ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm5               ; mm1=(40 50 41 51)
+        punpckhwd mm2,mm5               ; mm2=(42 52 43 53)
+
+        movq      mm0,mm6               ; transpose coefficients(phase 2)
+        punpckldq mm6,mm7               ; mm6=(00 10 20 30)
+        punpckhdq mm0,mm7               ; mm0=(01 11 21 31)
+        movq      mm5,mm3               ; transpose coefficients(phase 2)
+        punpckldq mm3,mm4               ; mm3=(02 12 22 32)
+        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)
+
+        movq    mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
+        movq    mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+        movq      mm6,mm1               ; transpose coefficients(phase 2)
+        punpckldq mm1,mm7               ; mm1=(40 50 60 70)
+        punpckhdq mm6,mm7               ; mm6=(41 51 61 71)
+        movq      mm0,mm2               ; transpose coefficients(phase 2)
+        punpckldq mm2,mm4               ; mm2=(42 52 62 72)
+        punpckhdq mm0,mm4               ; mm0=(43 53 63 73)
+
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
+        add     edx, byte 4*SIZEOF_IFAST_MULT_TYPE      ; quantptr
+        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
+        dec     ecx                                     ; ctr
+        jnz     near .columnloop
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     eax, [original_ebp]
+        lea     esi, [workspace]                        ; JCOEF *wsptr
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.rowloop:
+
+        ; -- Even part
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+        movq    mm4,mm0
+        movq    mm5,mm1
+        psubw   mm0,mm2                 ; mm0=tmp11
+        psubw   mm1,mm3
+        paddw   mm4,mm2                 ; mm4=tmp10
+        paddw   mm5,mm3                 ; mm5=tmp13
+
+        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
+        psubw   mm1,mm5                 ; mm1=tmp12
+
+        movq    mm6,mm4
+        movq    mm7,mm0
+        psubw   mm4,mm5                 ; mm4=tmp3
+        psubw   mm0,mm1                 ; mm0=tmp2
+        paddw   mm6,mm5                 ; mm6=tmp0
+        paddw   mm7,mm1                 ; mm7=tmp1
+
+        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+        ; -- Odd part
+
+        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+        movq    mm4,mm2
+        movq    mm0,mm5
+        psubw   mm2,mm1                 ; mm2=z12
+        psubw   mm5,mm3                 ; mm5=z10
+        paddw   mm4,mm1                 ; mm4=z11
+        paddw   mm0,mm3                 ; mm0=z13
+
+        movq    mm1,mm5                 ; mm1=z10(unscaled)
+        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
+        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
+
+        movq    mm3,mm4
+        psubw   mm4,mm0
+        paddw   mm3,mm0                 ; mm3=tmp7
+
+        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movq    mm0,mm5
+        paddw   mm5,mm2
+        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
+        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
+        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
+        psubw   mm0,mm1
+        psubw   mm2,mm5                 ; mm2=tmp10
+        paddw   mm0,mm5                 ; mm0=tmp12
+
+        ; -- Final output stage
+
+        psubw   mm0,mm3                 ; mm0=tmp6
+        movq    mm1,mm6
+        movq    mm5,mm7
+        paddw   mm6,mm3                 ; mm6=data0=(00 10 20 30)
+        paddw   mm7,mm0                 ; mm7=data1=(01 11 21 31)
+        psraw   mm6,(PASS1_BITS+3)      ; descale
+        psraw   mm7,(PASS1_BITS+3)      ; descale
+        psubw   mm1,mm3                 ; mm1=data7=(07 17 27 37)
+        psubw   mm5,mm0                 ; mm5=data6=(06 16 26 36)
+        psraw   mm1,(PASS1_BITS+3)      ; descale
+        psraw   mm5,(PASS1_BITS+3)      ; descale
+        psubw   mm4,mm0                 ; mm4=tmp5
+
+        packsswb  mm6,mm5               ; mm6=(00 10 20 30 06 16 26 36)
+        packsswb  mm7,mm1               ; mm7=(01 11 21 31 07 17 27 37)
+
+        movq    mm3, MMWORD [wk(0)]     ; mm3=tmp2
+        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp3
+
+        paddw   mm2,mm4                 ; mm2=tmp4
+        movq    mm5,mm3
+        movq    mm1,mm0
+        paddw   mm3,mm4                 ; mm3=data2=(02 12 22 32)
+        paddw   mm0,mm2                 ; mm0=data4=(04 14 24 34)
+        psraw   mm3,(PASS1_BITS+3)      ; descale
+        psraw   mm0,(PASS1_BITS+3)      ; descale
+        psubw   mm5,mm4                 ; mm5=data5=(05 15 25 35)
+        psubw   mm1,mm2                 ; mm1=data3=(03 13 23 33)
+        psraw   mm5,(PASS1_BITS+3)      ; descale
+        psraw   mm1,(PASS1_BITS+3)      ; descale
+
+        movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm4=[PB_CENTERJSAMP]
+
+        packsswb  mm3,mm0               ; mm3=(02 12 22 32 04 14 24 34)
+        packsswb  mm1,mm5               ; mm1=(03 13 23 33 05 15 25 35)
+
+        paddb     mm6,mm4
+        paddb     mm7,mm4
+        paddb     mm3,mm4
+        paddb     mm1,mm4
+
+        movq      mm2,mm6               ; transpose coefficients(phase 1)
+        punpcklbw mm6,mm7               ; mm6=(00 01 10 11 20 21 30 31)
+        punpckhbw mm2,mm7               ; mm2=(06 07 16 17 26 27 36 37)
+        movq      mm0,mm3               ; transpose coefficients(phase 1)
+        punpcklbw mm3,mm1               ; mm3=(02 03 12 13 22 23 32 33)
+        punpckhbw mm0,mm1               ; mm0=(04 05 14 15 24 25 34 35)
+
+        movq      mm5,mm6               ; transpose coefficients(phase 2)
+        punpcklwd mm6,mm3               ; mm6=(00 01 02 03 10 11 12 13)
+        punpckhwd mm5,mm3               ; mm5=(20 21 22 23 30 31 32 33)
+        movq      mm4,mm0               ; transpose coefficients(phase 2)
+        punpcklwd mm0,mm2               ; mm0=(04 05 06 07 14 15 16 17)
+        punpckhwd mm4,mm2               ; mm4=(24 25 26 27 34 35 36 37)
+
+        movq      mm7,mm6               ; transpose coefficients(phase 3)
+        punpckldq mm6,mm0               ; mm6=(00 01 02 03 04 05 06 07)
+        punpckhdq mm7,mm0               ; mm7=(10 11 12 13 14 15 16 17)
+        movq      mm1,mm5               ; transpose coefficients(phase 3)
+        punpckldq mm5,mm4               ; mm5=(20 21 22 23 24 25 26 27)
+        punpckhdq mm1,mm4               ; mm1=(30 31 32 33 34 35 36 37)
+
+        pushpic ebx                     ; save GOT address
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+        poppic  ebx                     ; restore GOT address
+
+        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
+        add     edi, byte 4*SIZEOF_JSAMPROW
+        dec     ecx                             ; ctr
+        jnz     near .rowloop
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctfst-sse2-64.asm b/simd/jidctfst-sse2-64.asm
new file mode 100644
index 0000000..da4ecf2
--- /dev/null
+++ b/simd/jidctfst-sse2-64.asm
@@ -0,0 +1,492 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      8       ; 14 is also OK.
+%define PASS1_BITS      2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ     277             ; FIX(1.082392200)
+F_1_414 equ     362             ; FIX(1.414213562)
+F_1_847 equ     473             ; FIX(1.847759065)
+F_2_613 equ     669             ; FIX(2.613125930)
+F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1), with FIX(1) == 1 << 8
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))  ; x / 2^n, rounded to nearest
+F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200), literal is value * 2^30
+F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562), literal is value * 2^30
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065), literal is value * 2^30
+F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930), literal is value * 2^30
+F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+        alignz  16
+        global  EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT     ; pmulhw multiplier, 8 identical words
+PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT     ; pmulhw multiplier, 8 identical words
+PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT     ; negated: used for the -1.613 * z10 term
+PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT     ; pmulhw multiplier, 8 identical words
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE              ; byte bias added (paddb) to the packed output
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp    rbp+0
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = original rbp (rsp right after the push)
+        sub     rsp, byte 4                     ; step below the saved rbp before aligning
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; stored at what becomes [original_rbp]
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; reserve the WK_NUM xmmword slots below rbp
+        collect_args
+
+        ; ---- Pass 1: process columns from input.
+
+        mov     rdx, r10                ; quantptr
+        mov     rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        jnz     near .columnDCT         ; cheap early out: a nonzero AC term was found
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1,xmm0               ; xmm1 = OR of blocks 1..7
+        packsswb xmm1,xmm1              ; saturating packs keep every nonzero word nonzero,
+        packsswb xmm1,xmm1              ; so the low dword now reflects all 8 words
+        movd    eax,xmm1
+        test    rax,rax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)
+
+        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
+        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
+        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
+        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
+        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
+        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
+        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
+        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+        jmp     near .column_end        ; same register/wk layout as the full pass 1 result
+%endif
+.columnDCT:
+
+        ; -- Even part (blocks 0,2,4,6 of inptr, each multiplied by quantptr)
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movdqa  xmm4,xmm0
+        movdqa  xmm5,xmm1
+        psubw   xmm0,xmm2               ; xmm0=tmp11
+        psubw   xmm1,xmm3               ; xmm1=in2-in6
+        paddw   xmm4,xmm2               ; xmm4=tmp10
+        paddw   xmm5,xmm3               ; xmm5=tmp13
+
+        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS    ; pre-scale so pmulhw nets a CONST_BITS shift
+        pmulhw  xmm1,[rel PW_F1414]     ; xmm1=(in2-in6)*1.414
+        psubw   xmm1,xmm5               ; xmm1=tmp12
+
+        movdqa  xmm6,xmm4
+        movdqa  xmm7,xmm0
+        psubw   xmm4,xmm5               ; xmm4=tmp3
+        psubw   xmm0,xmm1               ; xmm0=tmp2
+        paddw   xmm6,xmm5               ; xmm6=tmp0
+        paddw   xmm7,xmm1               ; xmm7=tmp1
+
+        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+        ; -- Odd part (blocks 1,3,5,7 of inptr, each multiplied by quantptr)
+
+        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movdqa  xmm4,xmm2
+        movdqa  xmm0,xmm5
+        psubw   xmm2,xmm1               ; xmm2=z12
+        psubw   xmm5,xmm3               ; xmm5=z10
+        paddw   xmm4,xmm1               ; xmm4=z11
+        paddw   xmm0,xmm3               ; xmm0=z13
+
+        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS    ; xmm2=z12(scaled)
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS    ; xmm5=z10(scaled)
+
+        movdqa  xmm3,xmm4
+        psubw   xmm4,xmm0               ; xmm4=z11-z13
+        paddw   xmm3,xmm0               ; xmm3=tmp7
+
+        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm4,[rel PW_F1414]     ; xmm4=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movdqa  xmm0,xmm5               ; xmm0=z10(scaled)
+        paddw   xmm5,xmm2               ; xmm5=z10+z12(scaled)
+        pmulhw  xmm5,[rel PW_F1847]     ; xmm5=z5
+        pmulhw  xmm0,[rel PW_MF1613]    ; xmm0=-1.613125930*z10
+        pmulhw  xmm2,[rel PW_F1082]     ; xmm2=1.082392200*z12
+        psubw   xmm0,xmm1               ; xmm0=-2.613125930*z10
+        psubw   xmm2,xmm5               ; xmm2=tmp10
+        paddw   xmm0,xmm5               ; xmm0=tmp12
+
+        ; -- Final output stage
+
+        psubw   xmm0,xmm3               ; xmm0=tmp6
+        movdqa  xmm1,xmm6               ; xmm1=tmp0
+        movdqa  xmm5,xmm7               ; xmm5=tmp1
+        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
+        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
+        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
+        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
+        psubw   xmm4,xmm0               ; xmm4=tmp5
+
+        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
+        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+        paddw   xmm2,xmm4               ; xmm2=tmp4
+        movdqa  xmm5,xmm7               ; xmm5=tmp2
+        movdqa  xmm0,xmm1               ; xmm0=tmp3
+        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
+        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
+        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
+        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
+        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
+        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)
+
+        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
+        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
+        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)
+
+        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
+        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)
+
+        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
+        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
+        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
+        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
+        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
+        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
+        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
+        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
+        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
+        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        ; From here on rax is used only as the output column index.
+        mov     rdi, r12        ; (JSAMPROW *)
+        mov     eax, r13d       ; rax = output_col (32-bit move zero-extends into rax)
+
+        ; -- Even part
+
+        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+        movdqa  xmm2,xmm6
+        movdqa  xmm0,xmm5
+        psubw   xmm6,xmm1               ; xmm6=tmp11
+        psubw   xmm5,xmm3               ; xmm5=col2-col6
+        paddw   xmm2,xmm1               ; xmm2=tmp10
+        paddw   xmm0,xmm3               ; xmm0=tmp13
+
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm5,[rel PW_F1414]     ; xmm5=(col2-col6)*1.414
+        psubw   xmm5,xmm0               ; xmm5=tmp12
+
+        movdqa  xmm1,xmm2
+        movdqa  xmm3,xmm6
+        psubw   xmm2,xmm0               ; xmm2=tmp3
+        psubw   xmm6,xmm5               ; xmm6=tmp2
+        paddw   xmm1,xmm0               ; xmm1=tmp0
+        paddw   xmm3,xmm5               ; xmm3=tmp1
+
+        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+        ; -- Odd part
+
+        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+        movdqa  xmm2,xmm0
+        movdqa  xmm6,xmm4
+        psubw   xmm0,xmm7               ; xmm0=z12
+        psubw   xmm4,xmm5               ; xmm4=z10
+        paddw   xmm2,xmm7               ; xmm2=z11
+        paddw   xmm6,xmm5               ; xmm6=z13
+
+        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
+        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS    ; xmm0=z12(scaled)
+        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS    ; xmm4=z10(scaled)
+
+        movdqa  xmm5,xmm2
+        psubw   xmm2,xmm6               ; xmm2=z11-z13
+        paddw   xmm5,xmm6               ; xmm5=tmp7
+
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm2,[rel PW_F1414]     ; xmm2=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movdqa  xmm6,xmm4               ; xmm6=z10(scaled)
+        paddw   xmm4,xmm0               ; xmm4=z10+z12(scaled)
+        pmulhw  xmm4,[rel PW_F1847]     ; xmm4=z5
+        pmulhw  xmm6,[rel PW_MF1613]    ; xmm6=-1.613125930*z10
+        pmulhw  xmm0,[rel PW_F1082]     ; xmm0=1.082392200*z12
+        psubw   xmm6,xmm7               ; xmm6=-2.613125930*z10
+        psubw   xmm0,xmm4               ; xmm0=tmp10
+        paddw   xmm6,xmm4               ; xmm6=tmp12
+
+        ; -- Final output stage
+
+        psubw   xmm6,xmm5               ; xmm6=tmp6
+        movdqa  xmm7,xmm1               ; xmm7=tmp0
+        movdqa  xmm4,xmm3               ; xmm4=tmp1
+        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
+        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
+        psraw   xmm1,(PASS1_BITS+3)     ; descale
+        psraw   xmm3,(PASS1_BITS+3)     ; descale
+        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
+        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
+        psraw   xmm7,(PASS1_BITS+3)     ; descale
+        psraw   xmm4,(PASS1_BITS+3)     ; descale
+        psubw   xmm2,xmm6               ; xmm2=tmp5
+
+        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+        paddw   xmm0,xmm2               ; xmm0=tmp4
+        movdqa  xmm4,xmm5               ; xmm4=tmp2
+        movdqa  xmm7,xmm6               ; xmm7=tmp3
+        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
+        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
+        psraw   xmm5,(PASS1_BITS+3)     ; descale
+        psraw   xmm6,(PASS1_BITS+3)     ; descale
+        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
+        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
+        psraw   xmm4,(PASS1_BITS+3)     ; descale
+        psraw   xmm7,(PASS1_BITS+3)     ; descale
+
+        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
+
+        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+        paddb     xmm1,xmm2     ; add the sample-centering bias to every packed byte
+        paddb     xmm3,xmm2
+        paddb     xmm5,xmm2
+        paddb     xmm7,xmm2
+
+        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
+        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
+        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
+        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
+        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
+        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
+        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+        mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+        mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+        ; -- Epilogue: undo collect_args and the aligned-frame setup
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- original rbp
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctfst-sse2.asm b/simd/jidctfst-sse2.asm
new file mode 100644
index 0000000..065842c
--- /dev/null
+++ b/simd/jidctfst-sse2.asm
@@ -0,0 +1,502 @@
+;
+; jidctfst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      8       ; fraction bits of the F_* constants; 14 is also OK.
+%define PASS1_BITS      2       ; pass 2 descales its results by (PASS1_BITS+3)
+; Build-time guard: IFAST_SCALE_BITS comes from outside this file (presumably jdct.inc).
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+; FIX(x) below denotes x expressed as an integer with CONST_BITS fraction bits.
+%if CONST_BITS == 8
+F_1_082 equ     277             ; FIX(1.082392200)
+F_1_414 equ     362             ; FIX(1.414213562)
+F_1_847 equ     473             ; FIX(1.847759065)
+F_2_613 equ     669             ; FIX(2.613125930)
+F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))         ; rounding right shift; x is given scaled by 2^30
+F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
+F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
+F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+; pmulhw keeps the high word of a product, so (x << PRE) * (F << SHIFT) yields (x * F) >> CONST_BITS.
+%define PRE_MULTIPLY_SCALE_BITS   2       ; psllw count applied to a value before pmulhw
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)     ; shift applied to each F_* word
+
+        alignz  16
+        global  EXTN(jconst_idct_ifast_sse2)
+; Constant table; the code addresses each entry through GOTOFF(ebx, <label>).
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT      ; 8 words:  1.414213562
+PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT      ; 8 words:  1.847759065
+PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT      ; 8 words: -1.613125930
+PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT      ; 8 words:  1.082392200
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE               ; 16 bytes added to the packed output in pass 2
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+; (b) is the unaligned frame address: esp as captured right after 'push ebp'.
+%define original_ebp    ebp+0           ; slot where the prologue saves that address
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2               ; number of xmmword scratch slots below ebp
+
+        align   16
+        global  EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+        push    ebp                             ; save caller's ebp
+        mov     eax,esp                         ; eax = esp after the push (arguments are at eax+8..)
+        sub     esp, byte 4                     ; room for the saved frame address
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; [original_ebp] = unaligned frame address
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve the WK_NUM xmmword scratch slots
+        pushpic ebx                             ; macro defined elsewhere; paired with poppic at exit
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+        ; eax still holds the unaligned frame address here, so no reload is needed.
+;       mov     eax, [original_ebp]
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]      ; cheap pre-test on one dword of
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]      ; blocks 1 and 2 before the full scan
+        jnz     near .columnDCT
+        ; Full scan: OR rows 1..7 together; any non-zero AC term forces the regular path.
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     xmm1,xmm0
+        packsswb xmm1,xmm1              ; signed saturation keeps non-zero values non-zero,
+        packsswb xmm1,xmm1              ; so two packs fold all 8 words into the low dword
+        movd    eax,xmm1
+        test    eax,eax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+        ; Only row 0 matters: dequantize it and broadcast each element into a whole register.
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)
+
+        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
+        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
+        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
+        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
+        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
+        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
+        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
+        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)
+        ; Same layout as the end of .columnDCT: col1/col3 go to wk(0)/wk(1), the rest stay in registers.
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+        jmp     near .column_end
+        alignx  16,7
+%endif
+.columnDCT:
+        ; Regular pass 1: dequantize all eight rows, run the 1-D IDCT, then transpose the result.
+        ; -- Even part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm0=in0
+        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm1=in2
+        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm2=in4
+        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm3=in6
+
+        movdqa  xmm4,xmm0
+        movdqa  xmm5,xmm1
+        psubw   xmm0,xmm2               ; xmm0=tmp11
+        psubw   xmm1,xmm3               ; xmm1=in2-in6
+        paddw   xmm4,xmm2               ; xmm4=tmp10
+        paddw   xmm5,xmm3               ; xmm5=tmp13
+
+        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm1,[GOTOFF(ebx,PW_F1414)]     ; xmm1=(in2-in6)*1.414213562
+        psubw   xmm1,xmm5               ; xmm1=tmp12
+
+        movdqa  xmm6,xmm4
+        movdqa  xmm7,xmm0
+        psubw   xmm4,xmm5               ; xmm4=tmp3
+        psubw   xmm0,xmm1               ; xmm0=tmp2
+        paddw   xmm6,xmm5               ; xmm6=tmp0
+        paddw   xmm7,xmm1               ; xmm7=tmp1
+
+        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+        ; -- Odd part
+
+        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm2=in1
+        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm3=in3
+        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm5=in5
+        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]        ; xmm1=in7
+
+        movdqa  xmm4,xmm2
+        movdqa  xmm0,xmm5
+        psubw   xmm2,xmm1               ; xmm2=z12
+        psubw   xmm5,xmm3               ; xmm5=z10
+        paddw   xmm4,xmm1               ; xmm4=z11
+        paddw   xmm0,xmm3               ; xmm0=z13
+
+        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS    ; pre-scale z12 for pmulhw
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS    ; pre-scale z10 for pmulhw
+
+        movdqa  xmm3,xmm4
+        psubw   xmm4,xmm0               ; xmm4=z11-z13
+        paddw   xmm3,xmm0               ; xmm3=tmp7
+
+        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm4,[GOTOFF(ebx,PW_F1414)]     ; xmm4=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movdqa  xmm0,xmm5
+        paddw   xmm5,xmm2
+        pmulhw  xmm5,[GOTOFF(ebx,PW_F1847)]     ; xmm5=z5
+        pmulhw  xmm0,[GOTOFF(ebx,PW_MF1613)]    ; xmm0=-1.613125930*z10
+        pmulhw  xmm2,[GOTOFF(ebx,PW_F1082)]     ; xmm2=1.082392200*z12
+        psubw   xmm0,xmm1               ; subtract the unscaled z10 (the "- z10" term above)
+        psubw   xmm2,xmm5               ; xmm2=tmp10
+        paddw   xmm0,xmm5               ; xmm0=tmp12
+
+        ; -- Final output stage
+
+        psubw   xmm0,xmm3               ; xmm0=tmp6
+        movdqa  xmm1,xmm6
+        movdqa  xmm5,xmm7
+        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
+        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
+        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
+        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
+        psubw   xmm4,xmm0               ; xmm4=tmp5
+
+        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
+        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
+        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+        paddw   xmm2,xmm4               ; xmm2=tmp4
+        movdqa  xmm5,xmm7
+        movdqa  xmm0,xmm1
+        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
+        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
+        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
+        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
+        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
+        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)
+
+        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
+        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
+        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)
+
+        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
+        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)
+
+        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
+        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
+        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
+        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
+        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
+        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
+        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
+        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
+        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
+        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+        ; On entry: xmm6=col0 wk(0)=col1 xmm5=col2 wk(1)=col3 xmm1=col4 xmm4=col5 xmm3=col6 xmm7=col7
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     eax, [original_ebp]                     ; eax = unaligned frame address (was clobbered)
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]       ; eax = output_col from here on
+
+        ; -- Even part
+
+        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+        movdqa  xmm2,xmm6
+        movdqa  xmm0,xmm5
+        psubw   xmm6,xmm1               ; xmm6=tmp11
+        psubw   xmm5,xmm3               ; xmm5=col2-col6
+        paddw   xmm2,xmm1               ; xmm2=tmp10
+        paddw   xmm0,xmm3               ; xmm0=tmp13
+
+        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm5,[GOTOFF(ebx,PW_F1414)]     ; xmm5=(col2-col6)*1.414213562
+        psubw   xmm5,xmm0               ; xmm5=tmp12
+
+        movdqa  xmm1,xmm2
+        movdqa  xmm3,xmm6
+        psubw   xmm2,xmm0               ; xmm2=tmp3
+        psubw   xmm6,xmm5               ; xmm6=tmp2
+        paddw   xmm1,xmm0               ; xmm1=tmp0
+        paddw   xmm3,xmm5               ; xmm3=tmp1
+
+        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+        ; -- Odd part
+
+        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+        movdqa  xmm2,xmm0
+        movdqa  xmm6,xmm4
+        psubw   xmm0,xmm7               ; xmm0=z12
+        psubw   xmm4,xmm5               ; xmm4=z10
+        paddw   xmm2,xmm7               ; xmm2=z11
+        paddw   xmm6,xmm5               ; xmm6=z13
+
+        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
+        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS    ; pre-scale z12 for pmulhw
+        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS    ; pre-scale z10 for pmulhw
+
+        movdqa  xmm5,xmm2
+        psubw   xmm2,xmm6               ; xmm2=z11-z13
+        paddw   xmm5,xmm6               ; xmm5=tmp7
+
+        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+        pmulhw  xmm2,[GOTOFF(ebx,PW_F1414)]     ; xmm2=tmp11
+
+        ; To avoid overflow...
+        ;
+        ; (Original)
+        ; tmp12 = -2.613125930 * z10 + z5;
+        ;
+        ; (This implementation)
+        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+        ;       = -1.613125930 * z10 - z10 + z5;
+
+        movdqa  xmm6,xmm4
+        paddw   xmm4,xmm0
+        pmulhw  xmm4,[GOTOFF(ebx,PW_F1847)]     ; xmm4=z5
+        pmulhw  xmm6,[GOTOFF(ebx,PW_MF1613)]    ; xmm6=-1.613125930*z10
+        pmulhw  xmm0,[GOTOFF(ebx,PW_F1082)]     ; xmm0=1.082392200*z12
+        psubw   xmm6,xmm7               ; subtract the unscaled z10 (the "- z10" term above)
+        psubw   xmm0,xmm4               ; xmm0=tmp10
+        paddw   xmm6,xmm4               ; xmm6=tmp12
+
+        ; -- Final output stage
+
+        psubw   xmm6,xmm5               ; xmm6=tmp6
+        movdqa  xmm7,xmm1
+        movdqa  xmm4,xmm3
+        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
+        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
+        psraw   xmm1,(PASS1_BITS+3)     ; descale
+        psraw   xmm3,(PASS1_BITS+3)     ; descale
+        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
+        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
+        psraw   xmm7,(PASS1_BITS+3)     ; descale
+        psraw   xmm4,(PASS1_BITS+3)     ; descale
+        psubw   xmm2,xmm6               ; xmm2=tmp5
+
+        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+        paddw   xmm0,xmm2               ; xmm0=tmp4
+        movdqa  xmm4,xmm5
+        movdqa  xmm7,xmm6
+        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
+        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
+        psraw   xmm5,(PASS1_BITS+3)     ; descale
+        psraw   xmm6,(PASS1_BITS+3)     ; descale
+        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
+        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
+        psraw   xmm4,(PASS1_BITS+3)     ; descale
+        psraw   xmm7,(PASS1_BITS+3)     ; descale
+
+        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
+
+        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+        ; Add CENTERJSAMPLE to every saturated signed byte.
+        paddb     xmm1,xmm2
+        paddb     xmm3,xmm2
+        paddb     xmm5,xmm2
+        paddb     xmm7,xmm2
+
+        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
+        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
+        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
+        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
+        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
+        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
+        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+        ; Each movq writes the low 8 bytes of its register to output_buf[row] + output_col.
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1       ; row 0
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3       ; row 2
+        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4       ; row 4
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7       ; row 6
+
+        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5       ; row 1
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0       ; row 3
+        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6       ; row 5
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2       ; row 7
+        ; Epilogue: restore registers in reverse order of the prologue pushes.
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
new file mode 100644
index 0000000..5f1a5df
--- /dev/null
+++ b/simd/jidctint-altivec.c
@@ -0,0 +1,359 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+/* DO_IDCT(in, PASS): one 1-D IDCT pass over the vectors in##0..in##7, leaving the descaled
+ * results in out0..out7.  PASS (1 or 2) selects pd_descale_p##PASS and descale_p##PASS. */
+#define DO_IDCT(in, PASS)  \
+{  \
+  /* Even part  \
+   *  \
+   * (Original)  \
+   * z1 = (z2 + z3) * 0.541196100;  \
+   * tmp2 = z1 + z3 * -1.847759065;  \
+   * tmp3 = z1 + z2 * 0.765366865;  \
+   *  \
+   * (This implementation)  \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);  \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;  \
+   */  \
+  /* Interleave in2/in6 so each 32-bit lane holds one (in2, in6) pair for vec_msums. */  \
+  in##26l = vec_mergeh(in##2, in##6);  \
+  in##26h = vec_mergel(in##2, in##6);  \
+  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero);  \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero);  \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero);  \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero);  \
+  \
+  tmp0 = vec_add(in##0, in##4);  \
+  tmp1 = vec_sub(in##0, in##4);  \
+  /* Widen to 32 bits, scale by 2^CONST_BITS and add the rounding bias for this pass. */  \
+  tmp0l = vec_unpackh(tmp0);  \
+  tmp0h = vec_unpackl(tmp0);  \
+  tmp0l = vec_sl(tmp0l, const_bits);  \
+  tmp0h = vec_sl(tmp0h, const_bits);  \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS);  \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS);  \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l);  \
+  tmp10h = vec_add(tmp0h, tmp3h);  \
+  tmp13l = vec_sub(tmp0l, tmp3l);  \
+  tmp13h = vec_sub(tmp0h, tmp3h);  \
+  \
+  tmp1l = vec_unpackh(tmp1);  \
+  tmp1h = vec_unpackl(tmp1);  \
+  tmp1l = vec_sl(tmp1l, const_bits);  \
+  tmp1h = vec_sl(tmp1h, const_bits);  \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS);  \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS);  \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l);  \
+  tmp11h = vec_add(tmp1h, tmp2h);  \
+  tmp12l = vec_sub(tmp1l, tmp2l);  \
+  tmp12h = vec_sub(tmp1h, tmp2h);  \
+  \
+  /* Odd part */  \
+  \
+  z3 = vec_add(in##3, in##7);  \
+  z4 = vec_add(in##1, in##5);  \
+  \
+  /* (Original)  \
+   * z5 = (z3 + z4) * 1.175875602;  \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
+   * z3 += z5;  z4 += z5;  \
+   *  \
+   * (This implementation)  \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
+   */  \
+  \
+  z34l = vec_mergeh(z3, z4);  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero);  \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero);  \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_zero);  \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_zero);  \
+  \
+  /* (Original)  \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;  \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;  \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;  \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4;  \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4;  \
+   *  \
+   * (This implementation)  \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;  \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;  \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);  \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);  \
+   * tmp0 += z3;  tmp1 += z4;  \
+   * tmp2 += z3;  tmp3 += z4;  \
+   */  \
+  /* The z3/z4 terms are folded in through the accumulator argument of vec_msums. */  \
+  in##71l = vec_mergeh(in##7, in##1);  \
+  in##71h = vec_mergel(in##7, in##1);  \
+  \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l);  \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h);  \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l);  \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h);  \
+  \
+  in##53l = vec_mergeh(in##5, in##3);  \
+  in##53h = vec_mergel(in##5, in##3);  \
+  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l);  \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h);  \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l);  \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h);  \
+  \
+  /* Final output stage */  \
+  /* outN / out(7-N) = even term +/- odd term, shifted right by descale_p##PASS, packed to 16 bits. */  \
+  out0l = vec_add(tmp10l, tmp3l);  \
+  out0h = vec_add(tmp10h, tmp3h);  \
+  out7l = vec_sub(tmp10l, tmp3l);  \
+  out7h = vec_sub(tmp10h, tmp3h);  \
+  \
+  out0l = vec_sra(out0l, descale_p##PASS);  \
+  out0h = vec_sra(out0h, descale_p##PASS);  \
+  out7l = vec_sra(out7l, descale_p##PASS);  \
+  out7h = vec_sra(out7h, descale_p##PASS);  \
+  \
+  out0 = vec_pack(out0l, out0h);  \
+  out7 = vec_pack(out7l, out7h);  \
+  \
+  out1l = vec_add(tmp11l, tmp2l);  \
+  out1h = vec_add(tmp11h, tmp2h);  \
+  out6l = vec_sub(tmp11l, tmp2l);  \
+  out6h = vec_sub(tmp11h, tmp2h);  \
+  \
+  out1l = vec_sra(out1l, descale_p##PASS);  \
+  out1h = vec_sra(out1h, descale_p##PASS);  \
+  out6l = vec_sra(out6l, descale_p##PASS);  \
+  out6h = vec_sra(out6h, descale_p##PASS);  \
+  \
+  out1 = vec_pack(out1l, out1h);  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  out2l = vec_add(tmp12l, tmp1l);  \
+  out2h = vec_add(tmp12h, tmp1h);  \
+  out5l = vec_sub(tmp12l, tmp1l);  \
+  out5h = vec_sub(tmp12h, tmp1h);  \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS);  \
+  out2h = vec_sra(out2h, descale_p##PASS);  \
+  out5l = vec_sra(out5l, descale_p##PASS);  \
+  out5h = vec_sra(out5h, descale_p##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  \
+  out5 = vec_pack(out5l, out5h);  \
+  \
+  out3l = vec_add(tmp13l, tmp0l);  \
+  out3h = vec_add(tmp13h, tmp0h);  \
+  out4l = vec_sub(tmp13l, tmp0l);  \
+  out4h = vec_sub(tmp13h, tmp0h);  \
+  \
+  out3l = vec_sra(out3l, descale_p##PASS);  \
+  out3h = vec_sra(out3h, descale_p##PASS);  \
+  out4l = vec_sra(out4l, descale_p##PASS);  \
+  out4h = vec_sra(out4h, descale_p##PASS);  \
+  \
+  out3 = vec_pack(out3l, out3h);  \
+  out4 = vec_pack(out4l, out4h);  \
+}
+/* Slow integer inverse DCT of one 8x8 block: multiplies coef_block by dct_table, runs DO_IDCT over
+ * columns and then rows, and writes 8 rows of 8 samples to output_buf[row] + output_col. */
+void
+jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  int *outptr;
+  /* Temporaries referenced by name inside DO_IDCT (and, presumably, TRANSPOSE). */
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+  __vector signed char outb;
+
+  /* Constants: each pw_* holds a coefficient pair that DO_IDCT feeds to vec_msums */
+  __vector short pw_zero = { __8X(0) },
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_zero = { __4X(0) },
+    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) },
+    const_bits = { __4X(CONST_BITS) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns */
+  /* vec_ld is an aligned load, so coef_block is presumably 16-byte aligned -- TODO confirm. */
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+  /* OR everything except the first vector together so one compare detects "all AC terms zero". */
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+  /* The first vector is needed on both paths, so it is dequantized up front. */
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero */
+    /* Scale by 2^PASS1_BITS and splat element k into row k; no DO_IDCT/TRANSPOSE is run here. */
+    col0 = vec_sl(col0, pass1_bits);
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+    /* General case: dequantize the remaining seven vectors, then run the column pass. */
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col, 1);
+    /* TRANSPOSE is defined outside this file; presumably it transposes out0..7 into row0..7. */
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row, 2);
+
+  TRANSPOSE(out, col);
+  /* Per row: saturate to signed bytes (both halves identical), add CENTERJSAMPLE, store two words. */
+  outb = vec_packs(col0, col0);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+  /* NOTE(review): vec_ste picks the element by address; this looks to assume 8-byte-aligned rows. */
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/simd/jidctint-mmx.asm b/simd/jidctint-mmx.asm
new file mode 100644
index 0000000..fda3b63
--- /dev/null
+++ b/simd/jidctint-mmx.asm
@@ -0,0 +1,852 @@
+;
+; jidctint-mmx.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13      ; fraction bits of the F_* fixed-point constants
+%define PASS1_BITS      2       ; extra fraction bits carried in pass-1 output
+
+%define DESCALE_P1      (CONST_BITS-PASS1_BITS)        ; right shift ending pass 1
+%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)      ; right shift ending pass 2
+; FIX(x) below is x * 2^CONST_BITS rounded to the nearest integer.
+%if CONST_BITS == 13
+F_0_298 equ      2446           ; FIX(0.298631336)
+F_0_390 equ      3196           ; FIX(0.390180644)
+F_0_541 equ      4433           ; FIX(0.541196100)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_175 equ      9633           ; FIX(1.175875602)
+F_1_501 equ     12299           ; FIX(1.501321110)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_1_961 equ     16069           ; FIX(1.961570560)
+F_2_053 equ     16819           ; FIX(2.053119869)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_072 equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))  ; value is given as x*2^30 and rounded down to CONST_BITS
+F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+; Each PW_* row is a pmaddwd multiplier pair (a,b): applied to interleaved words (x,y) it yields a*x + b*y.
+PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541     ; (z2,z3)     -> tmp3
+PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)     ; (z2,z3)     -> tmp2
+PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175     ; (z3,z4)     -> z3
+PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)     ; (z3,z4)     -> z4
+PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899     ; (tmp0,tmp3) -> tmp0
+PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)     ; (tmp0,tmp3) -> tmp3
+PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562     ; (tmp1,tmp2) -> tmp1
+PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)     ; (tmp1,tmp2) -> tmp2
+PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)            ; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)            ; rounding bias added before psrad DESCALE_P2
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE                  ; added (paddb) to every output sample in pass 2
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+%define original_ebp    ebp+0           ; slot where the prologue saves the unaligned frame base
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          12              ; wk(0)..wk(11) are used as spill slots
+%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+        align   16
+        global  EXTN(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp (esp right after the push; base for the arguments)
+        sub     esp, byte 4                     ; room for the saved frame base
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax                       ; [original_ebp] = unaligned frame base
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [workspace]                ; reserve wk[] and workspace[] below ebp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input, store into work array.
+
+;       mov     eax, [original_ebp]     ; (disabled; presumably eax is still live from the prologue)
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+        lea     edi, [workspace]                        ; JCOEF *wsptr
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]      ; cheap pre-check on part of in1 ...
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]      ; ... and in2 (clobbers eax)
+        jnz     short .columnDCT
+
+        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]     ; full test: OR together in1..in7
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     mm1,mm0                 ; mm1=in1|in2|...|in7
+        packsswb mm1,mm1                ; squeeze the 4 words into one dword (nonzero stays nonzero)
+        movd    eax,mm1
+        test    eax,eax
+        jnz     short .columnDCT        ; at least one AC term is nonzero
+
+        ; -- AC terms all zero
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize the DC terms
+
+        psllw   mm0,PASS1_BITS          ; scale up to pass-1 output precision
+
+        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
+        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
+        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
+
+        movq      mm1,mm0
+        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
+        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
+        movq      mm3,mm2
+        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
+        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0     ; each DC value fills all 8 outputs
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+        jmp     near .nextcolumn
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]     ; in0
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]     ; in2
+        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in0
+        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in2
+        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]     ; in4
+        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]     ; in6
+        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in4
+        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in6
+
+        ; (Original)
+        ; z1 = (z2 + z3) * 0.541196100;
+        ; tmp2 = z1 + z3 * -1.847759065;
+        ; tmp3 = z1 + z2 * 0.765366865;
+        ;
+        ; (This implementation)
+        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+        movq      mm4,mm1               ; mm1=in2=z2
+        movq      mm5,mm1
+        punpcklwd mm4,mm3               ; mm3=in6=z3
+        punpckhwd mm5,mm3
+        movq      mm1,mm4               ; second copy of the (z2,z3) pairs, for tmp2
+        movq      mm3,mm5
+        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=tmp3L
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]        ; mm5=tmp3H
+        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=tmp2L
+        pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]       ; mm3=tmp2H
+
+        movq      mm6,mm0
+        paddw     mm0,mm2               ; mm0=in0+in4
+        psubw     mm6,mm2               ; mm6=in0-in4
+
+        pxor      mm7,mm7               ; zero, so each word lands in the high half of a dword
+        pxor      mm2,mm2
+        punpcklwd mm7,mm0               ; mm7=tmp0L
+        punpckhwd mm2,mm0               ; mm2=tmp0H
+        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
+        psrad     mm2,(16-CONST_BITS)   ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+        movq    mm0,mm7
+        paddd   mm7,mm4                 ; mm7=tmp10L
+        psubd   mm0,mm4                 ; mm0=tmp13L
+        movq    mm4,mm2
+        paddd   mm2,mm5                 ; mm2=tmp10H
+        psubd   mm4,mm5                 ; mm4=tmp13H
+
+        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+        movq    MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+        movq    MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+        movq    MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+        pxor      mm5,mm5
+        pxor      mm7,mm7
+        punpcklwd mm5,mm6               ; mm5=tmp1L
+        punpckhwd mm7,mm6               ; mm7=tmp1H
+        psrad     mm5,(16-CONST_BITS)   ; psrad mm5,16 & pslld mm5,CONST_BITS
+        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+        movq    mm2,mm5
+        paddd   mm5,mm1                 ; mm5=tmp11L
+        psubd   mm2,mm1                 ; mm2=tmp12L
+        movq    mm0,mm7
+        paddd   mm7,mm3                 ; mm7=tmp11H
+        psubd   mm0,mm3                 ; mm0=tmp12H
+
+        movq    MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+        movq    MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+        movq    MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+        movq    MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+        ; -- Odd part
+
+        movq    mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]     ; in1 (used as tmp3)
+        movq    mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]     ; in3 (used as tmp2)
+        pmullw  mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in1
+        pmullw  mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in3
+        movq    mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]     ; in5 (used as tmp1)
+        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]     ; in7 (used as tmp0)
+        pmullw  mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in5
+        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]   ; dequantize in7
+
+        movq    mm5,mm6
+        movq    mm7,mm4
+        paddw   mm5,mm3                 ; mm5=z3
+        paddw   mm7,mm1                 ; mm7=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movq      mm2,mm5
+        movq      mm0,mm5
+        punpcklwd mm2,mm7               ; interleave (z3,z4) pairs, low half
+        punpckhwd mm0,mm7               ; interleave (z3,z4) pairs, high half
+        movq      mm5,mm2
+        movq      mm7,mm0
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]       ; mm2=z3L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]       ; mm0=z3H
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]        ; mm5=z4L
+        pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]        ; mm7=z4H
+
+        movq    MMWORD [wk(10)], mm2    ; wk(10)=z3L
+        movq    MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+        ; (Original)
+        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+        ; tmp0 += z3;  tmp1 += z4;
+        ; tmp2 += z3;  tmp3 += z4;
+
+        movq      mm2,mm3
+        movq      mm0,mm3
+        punpcklwd mm2,mm4               ; interleave (tmp0,tmp3) pairs, low half
+        punpckhwd mm0,mm4               ; interleave (tmp0,tmp3) pairs, high half
+        movq      mm3,mm2
+        movq      mm4,mm0
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm2=tmp0L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm0=tmp0H
+        pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]       ; mm3=tmp3L
+        pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]       ; mm4=tmp3H
+
+        paddd   mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+        paddd   mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+        paddd   mm3,mm5                 ; mm3=tmp3L
+        paddd   mm4,mm7                 ; mm4=tmp3H
+
+        movq    MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+        movq    MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+        movq      mm2,mm1
+        movq      mm0,mm1
+        punpcklwd mm2,mm6               ; interleave (tmp1,tmp2) pairs, low half
+        punpckhwd mm0,mm6               ; interleave (tmp1,tmp2) pairs, high half
+        movq      mm1,mm2
+        movq      mm6,mm0
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm2=tmp1L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm0=tmp1H
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]       ; mm1=tmp2L
+        pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]       ; mm6=tmp2H
+
+        paddd   mm2,mm5                 ; mm2=tmp1L
+        paddd   mm0,mm7                 ; mm0=tmp1H
+        paddd   mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+        paddd   mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+        movq    MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+        movq    MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+        ; -- Final output stage
+
+        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+        movq    mm2,mm5
+        movq    mm0,mm7
+        paddd   mm5,mm3                 ; mm5=data0L
+        paddd   mm7,mm4                 ; mm7=data0H
+        psubd   mm2,mm3                 ; mm2=data7L
+        psubd   mm0,mm4                 ; mm0=data7H
+
+        movq    mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
+
+        paddd   mm5,mm3                 ; add rounding bias ...
+        paddd   mm7,mm3
+        psrad   mm5,DESCALE_P1          ; ... then drop DESCALE_P1 fraction bits
+        psrad   mm7,DESCALE_P1
+        paddd   mm2,mm3
+        paddd   mm0,mm3
+        psrad   mm2,DESCALE_P1
+        psrad   mm0,DESCALE_P1
+
+        packssdw  mm5,mm7               ; mm5=data0=(00 01 02 03)
+        packssdw  mm2,mm0               ; mm2=data7=(70 71 72 73)
+
+        movq    mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+        movq    mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+        movq    mm7,mm4
+        movq    mm0,mm3
+        paddd   mm4,mm1                 ; mm4=data1L
+        paddd   mm3,mm6                 ; mm3=data1H
+        psubd   mm7,mm1                 ; mm7=data6L
+        psubd   mm0,mm6                 ; mm0=data6H
+
+        movq    mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
+
+        paddd   mm4,mm1
+        paddd   mm3,mm1
+        psrad   mm4,DESCALE_P1
+        psrad   mm3,DESCALE_P1
+        paddd   mm7,mm1
+        paddd   mm0,mm1
+        psrad   mm7,DESCALE_P1
+        psrad   mm0,DESCALE_P1
+
+        packssdw  mm4,mm3               ; mm4=data1=(10 11 12 13)
+        packssdw  mm7,mm0               ; mm7=data6=(60 61 62 63)
+
+        movq      mm6,mm5               ; transpose coefficients(phase 1)
+        punpcklwd mm5,mm4               ; mm5=(00 10 01 11)
+        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
+        movq      mm1,mm7               ; transpose coefficients(phase 1)
+        punpcklwd mm7,mm2               ; mm7=(60 70 61 71)
+        punpckhwd mm1,mm2               ; mm1=(62 72 63 73)
+
+        movq    mm3, MMWORD [wk(6)]     ; mm3=tmp12L
+        movq    mm0, MMWORD [wk(7)]     ; mm0=tmp12H
+        movq    mm4, MMWORD [wk(10)]    ; mm4=tmp1L
+        movq    mm2, MMWORD [wk(11)]    ; mm2=tmp1H
+
+        movq    MMWORD [wk(0)], mm5     ; wk(0)=(00 10 01 11)
+        movq    MMWORD [wk(1)], mm6     ; wk(1)=(02 12 03 13)
+        movq    MMWORD [wk(4)], mm7     ; wk(4)=(60 70 61 71)
+        movq    MMWORD [wk(5)], mm1     ; wk(5)=(62 72 63 73)
+
+        movq    mm5,mm3
+        movq    mm6,mm0
+        paddd   mm3,mm4                 ; mm3=data2L
+        paddd   mm0,mm2                 ; mm0=data2H
+        psubd   mm5,mm4                 ; mm5=data5L
+        psubd   mm6,mm2                 ; mm6=data5H
+
+        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
+
+        paddd   mm3,mm7
+        paddd   mm0,mm7
+        psrad   mm3,DESCALE_P1
+        psrad   mm0,DESCALE_P1
+        paddd   mm5,mm7
+        paddd   mm6,mm7
+        psrad   mm5,DESCALE_P1
+        psrad   mm6,DESCALE_P1
+
+        packssdw  mm3,mm0               ; mm3=data2=(20 21 22 23)
+        packssdw  mm5,mm6               ; mm5=data5=(50 51 52 53)
+
+        movq    mm1, MMWORD [wk(2)]     ; mm1=tmp13L
+        movq    mm4, MMWORD [wk(3)]     ; mm4=tmp13H
+        movq    mm2, MMWORD [wk(8)]     ; mm2=tmp0L
+        movq    mm7, MMWORD [wk(9)]     ; mm7=tmp0H
+
+        movq    mm0,mm1
+        movq    mm6,mm4
+        paddd   mm1,mm2                 ; mm1=data3L
+        paddd   mm4,mm7                 ; mm4=data3H
+        psubd   mm0,mm2                 ; mm0=data4L
+        psubd   mm6,mm7                 ; mm6=data4H
+
+        movq    mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
+
+        paddd   mm1,mm2
+        paddd   mm4,mm2
+        psrad   mm1,DESCALE_P1
+        psrad   mm4,DESCALE_P1
+        paddd   mm0,mm2
+        paddd   mm6,mm2
+        psrad   mm0,DESCALE_P1
+        psrad   mm6,DESCALE_P1
+
+        packssdw  mm1,mm4               ; mm1=data3=(30 31 32 33)
+        packssdw  mm0,mm6               ; mm0=data4=(40 41 42 43)
+
+        movq    mm7, MMWORD [wk(0)]     ; mm7=(00 10 01 11)
+        movq    mm2, MMWORD [wk(1)]     ; mm2=(02 12 03 13)
+
+        movq      mm4,mm3               ; transpose coefficients(phase 1)
+        punpcklwd mm3,mm1               ; mm3=(20 30 21 31)
+        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
+        movq      mm6,mm0               ; transpose coefficients(phase 1)
+        punpcklwd mm0,mm5               ; mm0=(40 50 41 51)
+        punpckhwd mm6,mm5               ; mm6=(42 52 43 53)
+
+        movq      mm1,mm7               ; transpose coefficients(phase 2)
+        punpckldq mm7,mm3               ; mm7=(00 10 20 30)
+        punpckhdq mm1,mm3               ; mm1=(01 11 21 31)
+        movq      mm5,mm2               ; transpose coefficients(phase 2)
+        punpckldq mm2,mm4               ; mm2=(02 12 22 32)
+        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)
+
+        movq    mm3, MMWORD [wk(4)]     ; mm3=(60 70 61 71)
+        movq    mm4, MMWORD [wk(5)]     ; mm4=(62 72 63 73)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7     ; store the transposed results
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+        movq      mm7,mm0               ; transpose coefficients(phase 2)
+        punpckldq mm0,mm3               ; mm0=(40 50 60 70)
+        punpckhdq mm7,mm3               ; mm7=(41 51 61 71)
+        movq      mm1,mm6               ; transpose coefficients(phase 2)
+        punpckldq mm6,mm4               ; mm6=(42 52 62 72)
+        punpckhdq mm1,mm4               ; mm1=(43 53 63 73)
+
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
+        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
+        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
+        dec     ecx                                     ; ctr
+        jnz     near .columnloop
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     eax, [original_ebp]                     ; reload frame base (pass 1 may have clobbered eax)
+        lea     esi, [workspace]                        ; JCOEF *wsptr
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]       ; eax = output_col from here on
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.rowloop:
+
+        ; -- Even part
+
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]     ; in0 (pass-1 result, no dequantization here)
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]     ; in2
+        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]     ; in4
+        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]     ; in6
+
+        ; (Original)
+        ; z1 = (z2 + z3) * 0.541196100;
+        ; tmp2 = z1 + z3 * -1.847759065;
+        ; tmp3 = z1 + z2 * 0.765366865;
+        ;
+        ; (This implementation)
+        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+        movq      mm4,mm1               ; mm1=in2=z2
+        movq      mm5,mm1
+        punpcklwd mm4,mm3               ; mm3=in6=z3
+        punpckhwd mm5,mm3
+        movq      mm1,mm4               ; second copy of the (z2,z3) pairs, for tmp2
+        movq      mm3,mm5
+        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=tmp3L
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]        ; mm5=tmp3H
+        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=tmp2L
+        pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]       ; mm3=tmp2H
+
+        movq      mm6,mm0
+        paddw     mm0,mm2               ; mm0=in0+in4
+        psubw     mm6,mm2               ; mm6=in0-in4
+
+        pxor      mm7,mm7               ; zero, so each word lands in the high half of a dword
+        pxor      mm2,mm2
+        punpcklwd mm7,mm0               ; mm7=tmp0L
+        punpckhwd mm2,mm0               ; mm2=tmp0H
+        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
+        psrad     mm2,(16-CONST_BITS)   ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+        movq    mm0,mm7
+        paddd   mm7,mm4                 ; mm7=tmp10L
+        psubd   mm0,mm4                 ; mm0=tmp13L
+        movq    mm4,mm2
+        paddd   mm2,mm5                 ; mm2=tmp10H
+        psubd   mm4,mm5                 ; mm4=tmp13H
+
+        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+        movq    MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+        movq    MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+        movq    MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+        pxor      mm5,mm5
+        pxor      mm7,mm7
+        punpcklwd mm5,mm6               ; mm5=tmp1L
+        punpckhwd mm7,mm6               ; mm7=tmp1H
+        psrad     mm5,(16-CONST_BITS)   ; psrad mm5,16 & pslld mm5,CONST_BITS
+        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+        movq    mm2,mm5
+        paddd   mm5,mm1                 ; mm5=tmp11L
+        psubd   mm2,mm1                 ; mm2=tmp12L
+        movq    mm0,mm7
+        paddd   mm7,mm3                 ; mm7=tmp11H
+        psubd   mm0,mm3                 ; mm0=tmp12H
+
+        movq    MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+        movq    MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+        movq    MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+        movq    MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+        ; -- Odd part
+
+        movq    mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]     ; in1 (used as tmp3)
+        movq    mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]     ; in3 (used as tmp2)
+        movq    mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]     ; in5 (used as tmp1)
+        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]     ; in7 (used as tmp0)
+
+        movq    mm5,mm6
+        movq    mm7,mm4
+        paddw   mm5,mm3                 ; mm5=z3
+        paddw   mm7,mm1                 ; mm7=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movq      mm2,mm5
+        movq      mm0,mm5
+        punpcklwd mm2,mm7               ; interleave (z3,z4) pairs, low half
+        punpckhwd mm0,mm7               ; interleave (z3,z4) pairs, high half
+        movq      mm5,mm2
+        movq      mm7,mm0
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]       ; mm2=z3L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]       ; mm0=z3H
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]        ; mm5=z4L
+        pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]        ; mm7=z4H
+
+        movq    MMWORD [wk(10)], mm2    ; wk(10)=z3L
+        movq    MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+        ; (Original)
+        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+        ; tmp0 += z3;  tmp1 += z4;
+        ; tmp2 += z3;  tmp3 += z4;
+
+        movq      mm2,mm3
+        movq      mm0,mm3
+        punpcklwd mm2,mm4               ; interleave (tmp0,tmp3) pairs, low half
+        punpckhwd mm0,mm4               ; interleave (tmp0,tmp3) pairs, high half
+        movq      mm3,mm2
+        movq      mm4,mm0
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm2=tmp0L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm0=tmp0H
+        pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]       ; mm3=tmp3L
+        pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]       ; mm4=tmp3H
+
+        paddd   mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+        paddd   mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+        paddd   mm3,mm5                 ; mm3=tmp3L
+        paddd   mm4,mm7                 ; mm4=tmp3H
+
+        movq    MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+        movq    MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+        movq      mm2,mm1
+        movq      mm0,mm1
+        punpcklwd mm2,mm6               ; interleave (tmp1,tmp2) pairs, low half
+        punpckhwd mm0,mm6               ; interleave (tmp1,tmp2) pairs, high half
+        movq      mm1,mm2
+        movq      mm6,mm0
+        pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm2=tmp1L
+        pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm0=tmp1H
+        pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]       ; mm1=tmp2L
+        pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]       ; mm6=tmp2H
+
+        paddd   mm2,mm5                 ; mm2=tmp1L
+        paddd   mm0,mm7                 ; mm0=tmp1H
+        paddd   mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+        paddd   mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+        movq    MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+        movq    MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+        ; -- Final output stage
+
+        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+        movq    mm2,mm5
+        movq    mm0,mm7
+        paddd   mm5,mm3                 ; mm5=data0L
+        paddd   mm7,mm4                 ; mm7=data0H
+        psubd   mm2,mm3                 ; mm2=data7L
+        psubd   mm0,mm4                 ; mm0=data7H
+
+        movq    mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
+
+        paddd   mm5,mm3                 ; add rounding bias ...
+        paddd   mm7,mm3
+        psrad   mm5,DESCALE_P2          ; ... then drop DESCALE_P2 fraction bits
+        psrad   mm7,DESCALE_P2
+        paddd   mm2,mm3
+        paddd   mm0,mm3
+        psrad   mm2,DESCALE_P2
+        psrad   mm0,DESCALE_P2
+
+        packssdw  mm5,mm7               ; mm5=data0=(00 10 20 30)
+        packssdw  mm2,mm0               ; mm2=data7=(07 17 27 37)
+
+        movq    mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+        movq    mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+        movq    mm7,mm4
+        movq    mm0,mm3
+        paddd   mm4,mm1                 ; mm4=data1L
+        paddd   mm3,mm6                 ; mm3=data1H
+        psubd   mm7,mm1                 ; mm7=data6L
+        psubd   mm0,mm6                 ; mm0=data6H
+
+        movq    mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
+
+        paddd   mm4,mm1
+        paddd   mm3,mm1
+        psrad   mm4,DESCALE_P2
+        psrad   mm3,DESCALE_P2
+        paddd   mm7,mm1
+        paddd   mm0,mm1
+        psrad   mm7,DESCALE_P2
+        psrad   mm0,DESCALE_P2
+
+        packssdw  mm4,mm3               ; mm4=data1=(01 11 21 31)
+        packssdw  mm7,mm0               ; mm7=data6=(06 16 26 36)
+
+        packsswb  mm5,mm7               ; mm5=(00 10 20 30 06 16 26 36)
+        packsswb  mm4,mm2               ; mm4=(01 11 21 31 07 17 27 37)
+
+        movq    mm6, MMWORD [wk(6)]     ; mm6=tmp12L
+        movq    mm1, MMWORD [wk(7)]     ; mm1=tmp12H
+        movq    mm3, MMWORD [wk(10)]    ; mm3=tmp1L
+        movq    mm0, MMWORD [wk(11)]    ; mm0=tmp1H
+
+        movq    MMWORD [wk(0)], mm5     ; wk(0)=(00 10 20 30 06 16 26 36)
+        movq    MMWORD [wk(1)], mm4     ; wk(1)=(01 11 21 31 07 17 27 37)
+
+        movq    mm7,mm6
+        movq    mm2,mm1
+        paddd   mm6,mm3                 ; mm6=data2L
+        paddd   mm1,mm0                 ; mm1=data2H
+        psubd   mm7,mm3                 ; mm7=data5L
+        psubd   mm2,mm0                 ; mm2=data5H
+
+        movq    mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
+
+        paddd   mm6,mm5
+        paddd   mm1,mm5
+        psrad   mm6,DESCALE_P2
+        psrad   mm1,DESCALE_P2
+        paddd   mm7,mm5
+        paddd   mm2,mm5
+        psrad   mm7,DESCALE_P2
+        psrad   mm2,DESCALE_P2
+
+        packssdw  mm6,mm1               ; mm6=data2=(02 12 22 32)
+        packssdw  mm7,mm2               ; mm7=data5=(05 15 25 35)
+
+        movq    mm4, MMWORD [wk(2)]     ; mm4=tmp13L
+        movq    mm3, MMWORD [wk(3)]     ; mm3=tmp13H
+        movq    mm0, MMWORD [wk(8)]     ; mm0=tmp0L
+        movq    mm5, MMWORD [wk(9)]     ; mm5=tmp0H
+
+        movq    mm1,mm4
+        movq    mm2,mm3
+        paddd   mm4,mm0                 ; mm4=data3L
+        paddd   mm3,mm5                 ; mm3=data3H
+        psubd   mm1,mm0                 ; mm1=data4L
+        psubd   mm2,mm5                 ; mm2=data4H
+
+        movq    mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
+
+        paddd   mm4,mm0
+        paddd   mm3,mm0
+        psrad   mm4,DESCALE_P2
+        psrad   mm3,DESCALE_P2
+        paddd   mm1,mm0
+        paddd   mm2,mm0
+        psrad   mm1,DESCALE_P2
+        psrad   mm2,DESCALE_P2
+
+        movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm5=[PB_CENTERJSAMP]
+
+        packssdw  mm4,mm3               ; mm4=data3=(03 13 23 33)
+        packssdw  mm1,mm2               ; mm1=data4=(04 14 24 34)
+
+        movq      mm0, MMWORD [wk(0)]   ; mm0=(00 10 20 30 06 16 26 36)
+        movq      mm3, MMWORD [wk(1)]   ; mm3=(01 11 21 31 07 17 27 37)
+
+        packsswb  mm6,mm1               ; mm6=(02 12 22 32 04 14 24 34)
+        packsswb  mm4,mm7               ; mm4=(03 13 23 33 05 15 25 35)
+
+        paddb     mm0,mm5               ; add CENTERJSAMPLE to every packed byte
+        paddb     mm3,mm5
+        paddb     mm6,mm5
+        paddb     mm4,mm5
+
+        movq      mm2,mm0               ; transpose coefficients(phase 1)
+        punpcklbw mm0,mm3               ; mm0=(00 01 10 11 20 21 30 31)
+        punpckhbw mm2,mm3               ; mm2=(06 07 16 17 26 27 36 37)
+        movq      mm1,mm6               ; transpose coefficients(phase 1)
+        punpcklbw mm6,mm4               ; mm6=(02 03 12 13 22 23 32 33)
+        punpckhbw mm1,mm4               ; mm1=(04 05 14 15 24 25 34 35)
+
+        movq      mm7,mm0               ; transpose coefficients(phase 2)
+        punpcklwd mm0,mm6               ; mm0=(00 01 02 03 10 11 12 13)
+        punpckhwd mm7,mm6               ; mm7=(20 21 22 23 30 31 32 33)
+        movq      mm5,mm1               ; transpose coefficients(phase 2)
+        punpcklwd mm1,mm2               ; mm1=(04 05 06 07 14 15 16 17)
+        punpckhwd mm5,mm2               ; mm5=(24 25 26 27 34 35 36 37)
+
+        movq      mm3,mm0               ; transpose coefficients(phase 3)
+        punpckldq mm0,mm1               ; mm0=(00 01 02 03 04 05 06 07)
+        punpckhdq mm3,mm1               ; mm3=(10 11 12 13 14 15 16 17)
+        movq      mm4,mm7               ; transpose coefficients(phase 3)
+        punpckldq mm7,mm5               ; mm7=(20 21 22 23 24 25 26 27)
+        punpckhdq mm4,mm5               ; mm4=(30 31 32 33 34 35 36 37)
+
+        pushpic ebx                     ; save GOT address
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; row pointers of this 4-row group;
+        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; ebx doubles as scratch here
+        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0    ; 8 samples at output_col
+        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+        poppic  ebx                     ; restore GOT address
+
+        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
+        add     edi, byte 4*SIZEOF_JSAMPROW     ; next 4 output row pointers
+        dec     ecx                             ; ctr
+        jnz     near .rowloop
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp             ; restore the caller's ebp pushed on entry
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctint-sse2-64.asm b/simd/jidctint-sse2-64.asm
new file mode 100644
index 0000000..bfec499
--- /dev/null
+++ b/simd/jidctint-sse2-64.asm
@@ -0,0 +1,848 @@
+;
+; jidctint-sse2-64.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13      ; F_x_xxx = FIX(x) = x * 2^CONST_BITS, rounded to an integer
+%define PASS1_BITS      2       ; extra fraction bits kept in the pass 1 results
+
+%define DESCALE_P1      (CONST_BITS-PASS1_BITS)         ; psrad count applied to pass 1 results
+%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)       ; psrad count applied to pass 2 results
+
+%if CONST_BITS == 13
+F_0_298 equ      2446           ; FIX(0.298631336)
+F_0_390 equ      3196           ; FIX(0.390180644)
+F_0_541 equ      4433           ; FIX(0.541196100)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_175 equ      9633           ; FIX(1.175875602)
+F_1_501 equ     12299           ; FIX(1.501321110)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_1_961 equ     16069           ; FIX(1.961570560)
+F_2_053 equ     16819           ; FIX(2.053119869)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_072 equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each FIX(x) below starts from x scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))         ; shift x right by n bits, rounding to nearest
+F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):          ; each row below fills one 16-byte XMM operand
+
+PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541          ; pmaddwd on (z2,z3) word pairs -> tmp3 (even part)
+PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)          ; pmaddwd on (z2,z3) word pairs -> tmp2 (even part)
+PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175          ; pmaddwd on (z3,z4) word pairs -> z3 (odd part)
+PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)          ; pmaddwd on (z3,z4) word pairs -> z4 (odd part)
+PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899          ; pmaddwd on (tmp0,tmp3) word pairs -> tmp0 (odd part)
+PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)          ; pmaddwd on (tmp0,tmp3) word pairs -> tmp3 (odd part)
+PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562          ; pmaddwd on (tmp1,tmp2) word pairs -> tmp1 (odd part)
+PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)          ; pmaddwd on (tmp1,tmp2) word pairs -> tmp2 (odd part)
+PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)                 ; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)                 ; rounding bias added before psrad DESCALE_P2
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE                       ; per-byte bias added (paddb) to the packed pass 2 results
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp    rbp+0
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          12
+
+        align   16
+        global  EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = original rbp
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]
+        collect_args
+
+        ; ---- Pass 1: process columns from input.
+
+        mov     rdx, r10                ; quantptr
+        mov     rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        jnz     near .columnDCT
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1,xmm0
+        packsswb xmm1,xmm1
+        packsswb xmm1,xmm1
+        movd    eax,xmm1
+        test    rax,rax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        movdqa  xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        psllw   xmm5,PASS1_BITS
+
+        movdqa    xmm4,xmm5             ; xmm5=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm5,xmm5             ; xmm5=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm4,xmm4             ; xmm4=(04 04 05 05 06 06 07 07)
+
+        pshufd  xmm7,xmm5,0x00          ; xmm7=col0=(00 00 00 00 00 00 00 00)
+        pshufd  xmm6,xmm5,0x55          ; xmm6=col1=(01 01 01 01 01 01 01 01)
+        pshufd  xmm1,xmm5,0xAA          ; xmm1=col2=(02 02 02 02 02 02 02 02)
+        pshufd  xmm5,xmm5,0xFF          ; xmm5=col3=(03 03 03 03 03 03 03 03)
+        pshufd  xmm0,xmm4,0x00          ; xmm0=col4=(04 04 04 04 04 04 04 04)
+        pshufd  xmm3,xmm4,0x55          ; xmm3=col5=(05 05 05 05 05 05 05 05)
+        pshufd  xmm2,xmm4,0xAA          ; xmm2=col6=(06 06 06 06 06 06 06 06)
+        pshufd  xmm4,xmm4,0xFF          ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+        movdqa  XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+        movdqa  XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+        jmp     near .column_end
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; (Original)
+        ; z1 = (z2 + z3) * 0.541196100;
+        ; tmp2 = z1 + z3 * -1.847759065;
+        ; tmp3 = z1 + z2 * 0.765366865;
+        ;
+        ; (This implementation)
+        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+        movdqa    xmm4,xmm1             ; xmm1=in2=z2
+        movdqa    xmm5,xmm1
+        punpcklwd xmm4,xmm3             ; xmm3=in6=z3
+        punpckhwd xmm5,xmm3
+        movdqa    xmm1,xmm4
+        movdqa    xmm3,xmm5
+        pmaddwd   xmm4,[rel PW_F130_F054]       ; xmm4=tmp3L
+        pmaddwd   xmm5,[rel PW_F130_F054]       ; xmm5=tmp3H
+        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=tmp2L
+        pmaddwd   xmm3,[rel PW_F054_MF130]      ; xmm3=tmp2H
+
+        movdqa    xmm6,xmm0
+        paddw     xmm0,xmm2             ; xmm0=in0+in4
+        psubw     xmm6,xmm2             ; xmm6=in0-in4
+
+        pxor      xmm7,xmm7
+        pxor      xmm2,xmm2
+        punpcklwd xmm7,xmm0             ; xmm7=tmp0L
+        punpckhwd xmm2,xmm0             ; xmm2=tmp0H
+        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+        psrad     xmm2,(16-CONST_BITS)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+        movdqa  xmm0,xmm7
+        paddd   xmm7,xmm4               ; xmm7=tmp10L
+        psubd   xmm0,xmm4               ; xmm0=tmp13L
+        movdqa  xmm4,xmm2
+        paddd   xmm2,xmm5               ; xmm2=tmp10H
+        psubd   xmm4,xmm5               ; xmm4=tmp13H
+
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+        movdqa  XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+        movdqa  XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+        pxor      xmm5,xmm5
+        pxor      xmm7,xmm7
+        punpcklwd xmm5,xmm6             ; xmm5=tmp1L
+        punpckhwd xmm7,xmm6             ; xmm7=tmp1H
+        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+        movdqa  xmm2,xmm5
+        paddd   xmm5,xmm1               ; xmm5=tmp11L
+        psubd   xmm2,xmm1               ; xmm2=tmp12L
+        movdqa  xmm0,xmm7
+        paddd   xmm7,xmm3               ; xmm7=tmp11H
+        psubd   xmm0,xmm3               ; xmm0=tmp12H
+
+        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+        movdqa  XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+        movdqa  XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+        ; -- Odd part
+
+        movdqa  xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        movdqa  xmm5,xmm6
+        movdqa  xmm7,xmm4
+        paddw   xmm5,xmm3               ; xmm5=z3
+        paddw   xmm7,xmm1               ; xmm7=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm2,xmm5
+        movdqa    xmm0,xmm5
+        punpcklwd xmm2,xmm7
+        punpckhwd xmm0,xmm7
+        movdqa    xmm5,xmm2
+        movdqa    xmm7,xmm0
+        pmaddwd   xmm2,[rel PW_MF078_F117]      ; xmm2=z3L
+        pmaddwd   xmm0,[rel PW_MF078_F117]      ; xmm0=z3H
+        pmaddwd   xmm5,[rel PW_F117_F078]       ; xmm5=z4L
+        pmaddwd   xmm7,[rel PW_F117_F078]       ; xmm7=z4H
+
+        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+        ; (Original)
+        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+        ; tmp0 += z3;  tmp1 += z4;
+        ; tmp2 += z3;  tmp3 += z4;
+
+        movdqa    xmm2,xmm3
+        movdqa    xmm0,xmm3
+        punpcklwd xmm2,xmm4
+        punpckhwd xmm0,xmm4
+        movdqa    xmm3,xmm2
+        movdqa    xmm4,xmm0
+        pmaddwd   xmm2,[rel PW_MF060_MF089]     ; xmm2=tmp0L
+        pmaddwd   xmm0,[rel PW_MF060_MF089]     ; xmm0=tmp0H
+        pmaddwd   xmm3,[rel PW_MF089_F060]      ; xmm3=tmp3L
+        pmaddwd   xmm4,[rel PW_MF089_F060]      ; xmm4=tmp3H
+
+        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+        paddd   xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+        paddd   xmm3,xmm5               ; xmm3=tmp3L
+        paddd   xmm4,xmm7               ; xmm4=tmp3H
+
+        movdqa  XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+        movdqa  XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+        movdqa    xmm2,xmm1
+        movdqa    xmm0,xmm1
+        punpcklwd xmm2,xmm6
+        punpckhwd xmm0,xmm6
+        movdqa    xmm1,xmm2
+        movdqa    xmm6,xmm0
+        pmaddwd   xmm2,[rel PW_MF050_MF256]     ; xmm2=tmp1L
+        pmaddwd   xmm0,[rel PW_MF050_MF256]     ; xmm0=tmp1H
+        pmaddwd   xmm1,[rel PW_MF256_F050]      ; xmm1=tmp2L
+        pmaddwd   xmm6,[rel PW_MF256_F050]      ; xmm6=tmp2H
+
+        paddd   xmm2,xmm5               ; xmm2=tmp1L
+        paddd   xmm0,xmm7               ; xmm0=tmp1H
+        paddd   xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+        ; -- Final output stage
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+        movdqa  xmm2,xmm5
+        movdqa  xmm0,xmm7
+        paddd   xmm5,xmm3               ; xmm5=data0L
+        paddd   xmm7,xmm4               ; xmm7=data0H
+        psubd   xmm2,xmm3               ; xmm2=data7L
+        psubd   xmm0,xmm4               ; xmm0=data7H
+
+        movdqa  xmm3,[rel PD_DESCALE_P1]        ; xmm3=[rel PD_DESCALE_P1]
+
+        paddd   xmm5,xmm3
+        paddd   xmm7,xmm3
+        psrad   xmm5,DESCALE_P1
+        psrad   xmm7,DESCALE_P1
+        paddd   xmm2,xmm3
+        paddd   xmm0,xmm3
+        psrad   xmm2,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+
+        packssdw  xmm5,xmm7             ; xmm5=data0=(00 01 02 03 04 05 06 07)
+        packssdw  xmm2,xmm0             ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+        movdqa  xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+        movdqa  xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+        movdqa  xmm7,xmm4
+        movdqa  xmm0,xmm3
+        paddd   xmm4,xmm1               ; xmm4=data1L
+        paddd   xmm3,xmm6               ; xmm3=data1H
+        psubd   xmm7,xmm1               ; xmm7=data6L
+        psubd   xmm0,xmm6               ; xmm0=data6H
+
+        movdqa  xmm1,[rel PD_DESCALE_P1]        ; xmm1=[rel PD_DESCALE_P1]
+
+        paddd   xmm4,xmm1
+        paddd   xmm3,xmm1
+        psrad   xmm4,DESCALE_P1
+        psrad   xmm3,DESCALE_P1
+        paddd   xmm7,xmm1
+        paddd   xmm0,xmm1
+        psrad   xmm7,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+
+        packssdw  xmm4,xmm3             ; xmm4=data1=(10 11 12 13 14 15 16 17)
+        packssdw  xmm7,xmm0             ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+        movdqa    xmm6,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm4             ; xmm5=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm6,xmm4             ; xmm6=(04 14 05 15 06 16 07 17)
+        movdqa    xmm1,xmm7             ; transpose coefficients(phase 1)
+        punpcklwd xmm7,xmm2             ; xmm7=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm1,xmm2             ; xmm1=(64 74 65 75 66 76 67 77)
+
+        movdqa  xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+        movdqa  xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+        movdqa  xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+        movdqa  xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+        movdqa  XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+        movdqa  xmm5,xmm3
+        movdqa  xmm6,xmm0
+        paddd   xmm3,xmm4               ; xmm3=data2L
+        paddd   xmm0,xmm2               ; xmm0=data2H
+        psubd   xmm5,xmm4               ; xmm5=data5L
+        psubd   xmm6,xmm2               ; xmm6=data5H
+
+        movdqa  xmm7,[rel PD_DESCALE_P1]        ; xmm7=[rel PD_DESCALE_P1]
+
+        paddd   xmm3,xmm7
+        paddd   xmm0,xmm7
+        psrad   xmm3,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+        paddd   xmm5,xmm7
+        paddd   xmm6,xmm7
+        psrad   xmm5,DESCALE_P1
+        psrad   xmm6,DESCALE_P1
+
+        packssdw  xmm3,xmm0             ; xmm3=data2=(20 21 22 23 24 25 26 27)
+        packssdw  xmm5,xmm6             ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+        movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+        movdqa  xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+        movdqa  xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+        movdqa  xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+        movdqa  xmm0,xmm1
+        movdqa  xmm6,xmm4
+        paddd   xmm1,xmm2               ; xmm1=data3L
+        paddd   xmm4,xmm7               ; xmm4=data3H
+        psubd   xmm0,xmm2               ; xmm0=data4L
+        psubd   xmm6,xmm7               ; xmm6=data4H
+
+        movdqa  xmm2,[rel PD_DESCALE_P1]        ; xmm2=[rel PD_DESCALE_P1]
+
+        paddd   xmm1,xmm2
+        paddd   xmm4,xmm2
+        psrad   xmm1,DESCALE_P1
+        psrad   xmm4,DESCALE_P1
+        paddd   xmm0,xmm2
+        paddd   xmm6,xmm2
+        psrad   xmm0,DESCALE_P1
+        psrad   xmm6,DESCALE_P1
+
+        packssdw  xmm1,xmm4             ; xmm1=data3=(30 31 32 33 34 35 36 37)
+        packssdw  xmm0,xmm6             ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+        movdqa  xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+        movdqa    xmm4,xmm3             ; transpose coefficients(phase 1)
+        punpcklwd xmm3,xmm1             ; xmm3=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm4,xmm1             ; xmm4=(24 34 25 35 26 36 27 37)
+        movdqa    xmm6,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm5             ; xmm0=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm6,xmm5             ; xmm6=(44 54 45 55 46 56 47 57)
+
+        movdqa    xmm1,xmm7             ; transpose coefficients(phase 2)
+        punpckldq xmm7,xmm3             ; xmm7=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm1,xmm3             ; xmm1=(02 12 22 32 03 13 23 33)
+        movdqa    xmm5,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm4             ; xmm2=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm5,xmm4             ; xmm5=(06 16 26 36 07 17 27 37)
+
+        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+        movdqa  xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+        movdqa  XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+        movdqa    xmm2,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm3             ; xmm0=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm2,xmm3             ; xmm2=(42 52 62 72 43 53 63 73)
+        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm4             ; xmm6=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm5,xmm4             ; xmm5=(46 56 66 76 47 57 67 77)
+
+        movdqa     xmm3,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm0            ; xmm7=col0=(00 10 20 30 40 50 60 70)
+        punpckhqdq xmm3,xmm0            ; xmm3=col1=(01 11 21 31 41 51 61 71)
+        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm2            ; xmm1=col2=(02 12 22 32 42 52 62 72)
+        punpckhqdq xmm4,xmm2            ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+        movdqa  xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+        movdqa  XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+        movdqa  XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm6            ; xmm0=col4=(04 14 24 34 44 54 64 74)
+        punpckhqdq xmm3,xmm6            ; xmm3=col5=(05 15 25 35 45 55 65 75)
+        movdqa     xmm4,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm5            ; xmm2=col6=(06 16 26 36 46 56 66 76)
+        punpckhqdq xmm4,xmm5            ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     rax, [original_rbp]
+        mov     rdi, r12        ; (JSAMPROW *)
+        mov     eax, r13d
+
+        ; -- Even part
+
+        ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+        ; (Original)
+        ; z1 = (z2 + z3) * 0.541196100;
+        ; tmp2 = z1 + z3 * -1.847759065;
+        ; tmp3 = z1 + z2 * 0.765366865;
+        ;
+        ; (This implementation)
+        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+        movdqa    xmm6,xmm1             ; xmm1=in2=z2
+        movdqa    xmm5,xmm1
+        punpcklwd xmm6,xmm2             ; xmm2=in6=z3
+        punpckhwd xmm5,xmm2
+        movdqa    xmm1,xmm6
+        movdqa    xmm2,xmm5
+        pmaddwd   xmm6,[rel PW_F130_F054]       ; xmm6=tmp3L
+        pmaddwd   xmm5,[rel PW_F130_F054]       ; xmm5=tmp3H
+        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=tmp2L
+        pmaddwd   xmm2,[rel PW_F054_MF130]      ; xmm2=tmp2H
+
+        movdqa    xmm3,xmm7
+        paddw     xmm7,xmm0             ; xmm7=in0+in4
+        psubw     xmm3,xmm0             ; xmm3=in0-in4
+
+        pxor      xmm4,xmm4
+        pxor      xmm0,xmm0
+        punpcklwd xmm4,xmm7             ; xmm4=tmp0L
+        punpckhwd xmm0,xmm7             ; xmm0=tmp0H
+        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+        psrad     xmm0,(16-CONST_BITS)  ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+        movdqa  xmm7,xmm4
+        paddd   xmm4,xmm6               ; xmm4=tmp10L
+        psubd   xmm7,xmm6               ; xmm7=tmp13L
+        movdqa  xmm6,xmm0
+        paddd   xmm0,xmm5               ; xmm0=tmp10H
+        psubd   xmm6,xmm5               ; xmm6=tmp13H
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+        movdqa  XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+        pxor      xmm5,xmm5
+        pxor      xmm4,xmm4
+        punpcklwd xmm5,xmm3             ; xmm5=tmp1L
+        punpckhwd xmm4,xmm3             ; xmm4=tmp1H
+        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+        movdqa  xmm0,xmm5
+        paddd   xmm5,xmm1               ; xmm5=tmp11L
+        psubd   xmm0,xmm1               ; xmm0=tmp12L
+        movdqa  xmm7,xmm4
+        paddd   xmm4,xmm2               ; xmm4=tmp11H
+        psubd   xmm7,xmm2               ; xmm7=tmp12H
+
+        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+        movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+        movdqa  XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+        ; -- Odd part
+
+        movdqa  xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+        movdqa  xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+        movdqa  xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+        movdqa  xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+        movdqa  xmm5,xmm6
+        movdqa  xmm4,xmm3
+        paddw   xmm5,xmm1               ; xmm5=z3
+        paddw   xmm4,xmm2               ; xmm4=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm0,xmm5
+        movdqa    xmm7,xmm5
+        punpcklwd xmm0,xmm4
+        punpckhwd xmm7,xmm4
+        movdqa    xmm5,xmm0
+        movdqa    xmm4,xmm7
+        pmaddwd   xmm0,[rel PW_MF078_F117]      ; xmm0=z3L
+        pmaddwd   xmm7,[rel PW_MF078_F117]      ; xmm7=z3H
+        pmaddwd   xmm5,[rel PW_F117_F078]       ; xmm5=z4L
+        pmaddwd   xmm4,[rel PW_F117_F078]       ; xmm4=z4H
+
+        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+        ; (Original)
+        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+        ; tmp0 += z3;  tmp1 += z4;
+        ; tmp2 += z3;  tmp3 += z4;
+
+        movdqa    xmm0,xmm1
+        movdqa    xmm7,xmm1
+        punpcklwd xmm0,xmm3
+        punpckhwd xmm7,xmm3
+        movdqa    xmm1,xmm0
+        movdqa    xmm3,xmm7
+        pmaddwd   xmm0,[rel PW_MF060_MF089]     ; xmm0=tmp0L
+        pmaddwd   xmm7,[rel PW_MF060_MF089]     ; xmm7=tmp0H
+        pmaddwd   xmm1,[rel PW_MF089_F060]      ; xmm1=tmp3L
+        pmaddwd   xmm3,[rel PW_MF089_F060]      ; xmm3=tmp3H
+
+        paddd   xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+        paddd   xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+        paddd   xmm1,xmm5               ; xmm1=tmp3L
+        paddd   xmm3,xmm4               ; xmm3=tmp3H
+
+        movdqa  XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+        movdqa  XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+        movdqa    xmm0,xmm2
+        movdqa    xmm7,xmm2
+        punpcklwd xmm0,xmm6
+        punpckhwd xmm7,xmm6
+        movdqa    xmm2,xmm0
+        movdqa    xmm6,xmm7
+        pmaddwd   xmm0,[rel PW_MF050_MF256]     ; xmm0=tmp1L
+        pmaddwd   xmm7,[rel PW_MF050_MF256]     ; xmm7=tmp1H
+        pmaddwd   xmm2,[rel PW_MF256_F050]      ; xmm2=tmp2L
+        pmaddwd   xmm6,[rel PW_MF256_F050]      ; xmm6=tmp2H
+
+        paddd   xmm0,xmm5               ; xmm0=tmp1L
+        paddd   xmm7,xmm4               ; xmm7=tmp1H
+        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+        ; -- Final output stage
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+        movdqa  xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+        movdqa  xmm0,xmm5
+        movdqa  xmm7,xmm4
+        paddd   xmm5,xmm1               ; xmm5=data0L
+        paddd   xmm4,xmm3               ; xmm4=data0H
+        psubd   xmm0,xmm1               ; xmm0=data7L
+        psubd   xmm7,xmm3               ; xmm7=data7H
+
+        movdqa  xmm1,[rel PD_DESCALE_P2]        ; xmm1=[rel PD_DESCALE_P2]
+
+        paddd   xmm5,xmm1
+        paddd   xmm4,xmm1
+        psrad   xmm5,DESCALE_P2
+        psrad   xmm4,DESCALE_P2
+        paddd   xmm0,xmm1
+        paddd   xmm7,xmm1
+        psrad   xmm0,DESCALE_P2
+        psrad   xmm7,DESCALE_P2
+
+        packssdw  xmm5,xmm4             ; xmm5=data0=(00 10 20 30 40 50 60 70)
+        packssdw  xmm0,xmm7             ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+        movdqa  xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+        movdqa  xmm4,xmm3
+        movdqa  xmm7,xmm1
+        paddd   xmm3,xmm2               ; xmm3=data1L
+        paddd   xmm1,xmm6               ; xmm1=data1H
+        psubd   xmm4,xmm2               ; xmm4=data6L
+        psubd   xmm7,xmm6               ; xmm7=data6H
+
+        movdqa  xmm2,[rel PD_DESCALE_P2]        ; xmm2=[rel PD_DESCALE_P2]
+
+        paddd   xmm3,xmm2
+        paddd   xmm1,xmm2
+        psrad   xmm3,DESCALE_P2
+        psrad   xmm1,DESCALE_P2
+        paddd   xmm4,xmm2
+        paddd   xmm7,xmm2
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm7,DESCALE_P2
+
+        packssdw  xmm3,xmm1             ; xmm3=data1=(01 11 21 31 41 51 61 71)
+        packssdw  xmm4,xmm7             ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+        packsswb  xmm5,xmm4             ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        packsswb  xmm3,xmm0             ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        movdqa  xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+        movdqa  xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+        movdqa  xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        movdqa  xmm4,xmm6
+        movdqa  xmm0,xmm2
+        paddd   xmm6,xmm1               ; xmm6=data2L
+        paddd   xmm2,xmm7               ; xmm2=data2H
+        psubd   xmm4,xmm1               ; xmm4=data5L
+        psubd   xmm0,xmm7               ; xmm0=data5H
+
+        movdqa  xmm5,[rel PD_DESCALE_P2]        ; xmm5=[rel PD_DESCALE_P2]
+
+        paddd   xmm6,xmm5
+        paddd   xmm2,xmm5
+        psrad   xmm6,DESCALE_P2
+        psrad   xmm2,DESCALE_P2
+        paddd   xmm4,xmm5
+        paddd   xmm0,xmm5
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm0,DESCALE_P2
+
+        packssdw  xmm6,xmm2             ; xmm6=data2=(02 12 22 32 42 52 62 72)
+        packssdw  xmm4,xmm0             ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+        movdqa  xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+        movdqa  xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+        movdqa  xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+        movdqa  xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+        movdqa  xmm2,xmm3
+        movdqa  xmm0,xmm1
+        paddd   xmm3,xmm7               ; xmm3=data3L
+        paddd   xmm1,xmm5               ; xmm1=data3H
+        psubd   xmm2,xmm7               ; xmm2=data4L
+        psubd   xmm0,xmm5               ; xmm0=data4H
+
+        movdqa  xmm7,[rel PD_DESCALE_P2]        ; xmm7=[rel PD_DESCALE_P2]
+
+        paddd   xmm3,xmm7
+        paddd   xmm1,xmm7
+        psrad   xmm3,DESCALE_P2
+        psrad   xmm1,DESCALE_P2
+        paddd   xmm2,xmm7
+        paddd   xmm0,xmm7
+        psrad   xmm2,DESCALE_P2
+        psrad   xmm0,DESCALE_P2
+
+        movdqa    xmm5,[rel PB_CENTERJSAMP]     ; xmm5=[rel PB_CENTERJSAMP]
+
+        packssdw  xmm3,xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+        packssdw  xmm2,xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+        movdqa    xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        packsswb  xmm6,xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+        packsswb  xmm3,xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+        paddb     xmm7,xmm5
+        paddb     xmm1,xmm5
+        paddb     xmm6,xmm5
+        paddb     xmm3,xmm5
+
+        movdqa    xmm0,xmm7     ; transpose coefficients(phase 1)
+        punpcklbw xmm7,xmm1     ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+        punpckhbw xmm0,xmm1     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+        movdqa    xmm2,xmm6     ; transpose coefficients(phase 1)
+        punpcklbw xmm6,xmm3     ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+        punpckhbw xmm2,xmm3     ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+        movdqa    xmm4,xmm7     ; transpose coefficients(phase 2)
+        punpcklwd xmm7,xmm6     ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+        punpckhwd xmm4,xmm6     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+        movdqa    xmm5,xmm2     ; transpose coefficients(phase 2)
+        punpcklwd xmm2,xmm0     ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+        punpckhwd xmm5,xmm0     ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+        movdqa    xmm1,xmm7     ; transpose coefficients(phase 3)
+        punpckldq xmm7,xmm2     ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+        punpckhdq xmm1,xmm2     ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+        movdqa    xmm3,xmm4     ; transpose coefficients(phase 3)
+        punpckldq xmm4,xmm5     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+        punpckhdq xmm3,xmm5     ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+        pshufd  xmm6,xmm7,0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+        pshufd  xmm0,xmm1,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+        pshufd  xmm2,xmm4,0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+        pshufd  xmm5,xmm3,0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+        mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+        mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- original rbp
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctint-sse2.asm b/simd/jidctint-sse2.asm
new file mode 100644
index 0000000..1960bcd
--- /dev/null
+++ b/simd/jidctint-sse2.asm
@@ -0,0 +1,859 @@
+;
+; jidctint-sse2.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13              ; fixed-point scale of the F_* constants: FIX(x) = x * 2^CONST_BITS
+%define PASS1_BITS      2               ; fraction bits left in pass-1 output (pass 1 shifts by CONST_BITS-PASS1_BITS only)
+
+%define DESCALE_P1      (CONST_BITS-PASS1_BITS)         ; psrad count applied to pass-1 (column) results
+%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)       ; shift count for pass-2 (row) results
+
+%if CONST_BITS == 13
+F_0_298 equ      2446           ; FIX(0.298631336)
+F_0_390 equ      3196           ; FIX(0.390180644)
+F_0_541 equ      4433           ; FIX(0.541196100)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_175 equ      9633           ; FIX(1.175875602)
+F_1_501 equ     12299           ; FIX(1.501321110)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_1_961 equ     16069           ; FIX(1.961570560)
+F_2_053 equ     16819           ; FIX(2.053119869)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_072 equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is x * 2^30, reduced to CONST_BITS fraction bits by DESCALE.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))         ; right shift by n with round-to-nearest (adds 2^(n-1) first)
+F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST               ; constant table; the code reads it through GOTOFF(ebx,...)
+
+        alignz  16                      ; entries are used as 128-bit memory operands (movdqa/pmaddwd)
+        global  EXTN(jconst_idct_islow_sse2)    ; export the table's start label
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541  ; pmaddwd on (z2,z3) pairs -> tmp3 = z2*(0.541+0.765) + z3*0.541
+PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)  ; pmaddwd on (z2,z3) pairs -> tmp2 = z2*0.541 + z3*(0.541-1.847)
+PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175  ; pmaddwd on (z3,z4) pairs -> z3 = z3*(1.175-1.961) + z4*1.175
+PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)  ; pmaddwd on (z3,z4) pairs -> z4 = z3*1.175 + z4*(1.175-0.390)
+PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899  ; pmaddwd on (tmp0,tmp3) pairs -> tmp0 = tmp0*(0.298-0.899) - tmp3*0.899
+PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)  ; pmaddwd on (tmp0,tmp3) pairs -> tmp3 = -tmp0*0.899 + tmp3*(1.501-0.899)
+PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562  ; pmaddwd on (tmp1,tmp2) pairs -> tmp1 = tmp1*(2.053-2.562) - tmp2*2.562
+PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)  ; pmaddwd on (tmp1,tmp2) pairs -> tmp2 = -tmp1*2.562 + tmp2*(3.072-2.562)
+PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)         ; rounding term added before psrad DESCALE_P1
+PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)         ; rounding term paired with the DESCALE_P2 shift
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE               ; 16 bytes of CENTERJSAMPLE (value defined outside this file section)
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32                      ; assemble the following as 32-bit code
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+%define original_ebp    ebp+0           ; slot where the prologue stores the pre-alignment stack pointer (base for the argument macros)
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          12              ; number of xmmword scratch slots, wk(0)..wk(11), below the aligned ebp
+
+        align   16
+        global  EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]
+        pushpic ebx
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+
+;       mov     eax, [original_ebp]
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        jnz     near .columnDCT
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     xmm1,xmm0
+        packsswb xmm1,xmm1
+        packsswb xmm1,xmm1
+        movd    eax,xmm1
+        test    eax,eax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        movdqa  xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        psllw   xmm5,PASS1_BITS
+
+        movdqa    xmm4,xmm5             ; xmm5=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm5,xmm5             ; xmm5=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm4,xmm4             ; xmm4=(04 04 05 05 06 06 07 07)
+
+        pshufd  xmm7,xmm5,0x00          ; xmm7=col0=(00 00 00 00 00 00 00 00)
+        pshufd  xmm6,xmm5,0x55          ; xmm6=col1=(01 01 01 01 01 01 01 01)
+        pshufd  xmm1,xmm5,0xAA          ; xmm1=col2=(02 02 02 02 02 02 02 02)
+        pshufd  xmm5,xmm5,0xFF          ; xmm5=col3=(03 03 03 03 03 03 03 03)
+        pshufd  xmm0,xmm4,0x00          ; xmm0=col4=(04 04 04 04 04 04 04 04)
+        pshufd  xmm3,xmm4,0x55          ; xmm3=col5=(05 05 05 05 05 05 05 05)
+        pshufd  xmm2,xmm4,0xAA          ; xmm2=col6=(06 06 06 06 06 06 06 06)
+        pshufd  xmm4,xmm4,0xFF          ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+        movdqa  XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+        movdqa  XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+        jmp     near .column_end
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Even part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; (Original)
+        ; z1 = (z2 + z3) * 0.541196100;
+        ; tmp2 = z1 + z3 * -1.847759065;
+        ; tmp3 = z1 + z2 * 0.765366865;
+        ;
+        ; (This implementation)
+        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+        movdqa    xmm4,xmm1             ; xmm1=in2=z2
+        movdqa    xmm5,xmm1
+        punpcklwd xmm4,xmm3             ; xmm3=in6=z3
+        punpckhwd xmm5,xmm3
+        movdqa    xmm1,xmm4
+        movdqa    xmm3,xmm5
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]       ; xmm4=tmp3L
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]       ; xmm5=tmp3H
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=tmp2L
+        pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm3=tmp2H
+
+        movdqa    xmm6,xmm0
+        paddw     xmm0,xmm2             ; xmm0=in0+in4
+        psubw     xmm6,xmm2             ; xmm6=in0-in4
+
+        pxor      xmm7,xmm7
+        pxor      xmm2,xmm2
+        punpcklwd xmm7,xmm0             ; xmm7=tmp0L
+        punpckhwd xmm2,xmm0             ; xmm2=tmp0H
+        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+        psrad     xmm2,(16-CONST_BITS)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+        movdqa  xmm0,xmm7
+        paddd   xmm7,xmm4               ; xmm7=tmp10L
+        psubd   xmm0,xmm4               ; xmm0=tmp13L
+        movdqa  xmm4,xmm2
+        paddd   xmm2,xmm5               ; xmm2=tmp10H
+        psubd   xmm4,xmm5               ; xmm4=tmp13H
+
+        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+        movdqa  XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+        movdqa  XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+        pxor      xmm5,xmm5
+        pxor      xmm7,xmm7
+        punpcklwd xmm5,xmm6             ; xmm5=tmp1L
+        punpckhwd xmm7,xmm6             ; xmm7=tmp1H
+        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+        movdqa  xmm2,xmm5
+        paddd   xmm5,xmm1               ; xmm5=tmp11L
+        psubd   xmm2,xmm1               ; xmm2=tmp12L
+        movdqa  xmm0,xmm7
+        paddd   xmm7,xmm3               ; xmm7=tmp11H
+        psubd   xmm0,xmm3               ; xmm0=tmp12H
+
+        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+        movdqa  XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+        movdqa  XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+        ; -- Odd part
+
+        movdqa  xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        movdqa  xmm5,xmm6
+        movdqa  xmm7,xmm4
+        paddw   xmm5,xmm3               ; xmm5=z3
+        paddw   xmm7,xmm1               ; xmm7=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm2,xmm5
+        movdqa    xmm0,xmm5
+        punpcklwd xmm2,xmm7
+        punpckhwd xmm0,xmm7
+        movdqa    xmm5,xmm2
+        movdqa    xmm7,xmm0
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm2=z3L
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm0=z3H
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]       ; xmm5=z4L
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]       ; xmm7=z4H
+
+        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+        ; (Original)
+        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+        ; tmp0 += z3;  tmp1 += z4;
+        ; tmp2 += z3;  tmp3 += z4;
+
+        movdqa    xmm2,xmm3
+        movdqa    xmm0,xmm3
+        punpcklwd xmm2,xmm4
+        punpckhwd xmm0,xmm4
+        movdqa    xmm3,xmm2
+        movdqa    xmm4,xmm0
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm2=tmp0L
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm0=tmp0H
+        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm3=tmp3L
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm4=tmp3H
+
+        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+        paddd   xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+        paddd   xmm3,xmm5               ; xmm3=tmp3L
+        paddd   xmm4,xmm7               ; xmm4=tmp3H
+
+        movdqa  XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+        movdqa  XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+        movdqa    xmm2,xmm1
+        movdqa    xmm0,xmm1
+        punpcklwd xmm2,xmm6
+        punpckhwd xmm0,xmm6
+        movdqa    xmm1,xmm2
+        movdqa    xmm6,xmm0
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm2=tmp1L
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm0=tmp1H
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm1=tmp2L
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm6=tmp2H
+
+        paddd   xmm2,xmm5               ; xmm2=tmp1L
+        paddd   xmm0,xmm7               ; xmm0=tmp1H
+        paddd   xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+        ; -- Final output stage
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+        movdqa  xmm2,xmm5
+        movdqa  xmm0,xmm7
+        paddd   xmm5,xmm3               ; xmm5=data0L
+        paddd   xmm7,xmm4               ; xmm7=data0H
+        psubd   xmm2,xmm3               ; xmm2=data7L
+        psubd   xmm0,xmm4               ; xmm0=data7H
+
+        movdqa  xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm3=[PD_DESCALE_P1]
+
+        paddd   xmm5,xmm3
+        paddd   xmm7,xmm3
+        psrad   xmm5,DESCALE_P1
+        psrad   xmm7,DESCALE_P1
+        paddd   xmm2,xmm3
+        paddd   xmm0,xmm3
+        psrad   xmm2,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+
+        packssdw  xmm5,xmm7             ; xmm5=data0=(00 01 02 03 04 05 06 07)
+        packssdw  xmm2,xmm0             ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+        movdqa  xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+        movdqa  xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+        movdqa  xmm7,xmm4
+        movdqa  xmm0,xmm3
+        paddd   xmm4,xmm1               ; xmm4=data1L
+        paddd   xmm3,xmm6               ; xmm3=data1H
+        psubd   xmm7,xmm1               ; xmm7=data6L
+        psubd   xmm0,xmm6               ; xmm0=data6H
+
+        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm1=[PD_DESCALE_P1]
+
+        paddd   xmm4,xmm1
+        paddd   xmm3,xmm1
+        psrad   xmm4,DESCALE_P1
+        psrad   xmm3,DESCALE_P1
+        paddd   xmm7,xmm1
+        paddd   xmm0,xmm1
+        psrad   xmm7,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+
+        packssdw  xmm4,xmm3             ; xmm4=data1=(10 11 12 13 14 15 16 17)
+        packssdw  xmm7,xmm0             ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+        movdqa    xmm6,xmm5             ; transpose coefficients(phase 1)
+        punpcklwd xmm5,xmm4             ; xmm5=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm6,xmm4             ; xmm6=(04 14 05 15 06 16 07 17)
+        movdqa    xmm1,xmm7             ; transpose coefficients(phase 1)
+        punpcklwd xmm7,xmm2             ; xmm7=(60 70 61 71 62 72 63 73)
+        punpckhwd xmm1,xmm2             ; xmm1=(64 74 65 75 66 76 67 77)
+
+        movdqa  xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+        movdqa  xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+        movdqa  xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+        movdqa  xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+        movdqa  XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+        movdqa  xmm5,xmm3
+        movdqa  xmm6,xmm0
+        paddd   xmm3,xmm4               ; xmm3=data2L
+        paddd   xmm0,xmm2               ; xmm0=data2H
+        psubd   xmm5,xmm4               ; xmm5=data5L
+        psubd   xmm6,xmm2               ; xmm6=data5H
+
+        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm7=[PD_DESCALE_P1]
+
+        paddd   xmm3,xmm7
+        paddd   xmm0,xmm7
+        psrad   xmm3,DESCALE_P1
+        psrad   xmm0,DESCALE_P1
+        paddd   xmm5,xmm7
+        paddd   xmm6,xmm7
+        psrad   xmm5,DESCALE_P1
+        psrad   xmm6,DESCALE_P1
+
+        packssdw  xmm3,xmm0             ; xmm3=data2=(20 21 22 23 24 25 26 27)
+        packssdw  xmm5,xmm6             ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+        movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+        movdqa  xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+        movdqa  xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+        movdqa  xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+        movdqa  xmm0,xmm1
+        movdqa  xmm6,xmm4
+        paddd   xmm1,xmm2               ; xmm1=data3L
+        paddd   xmm4,xmm7               ; xmm4=data3H
+        psubd   xmm0,xmm2               ; xmm0=data4L
+        psubd   xmm6,xmm7               ; xmm6=data4H
+
+        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm2=[PD_DESCALE_P1]
+
+        paddd   xmm1,xmm2
+        paddd   xmm4,xmm2
+        psrad   xmm1,DESCALE_P1
+        psrad   xmm4,DESCALE_P1
+        paddd   xmm0,xmm2
+        paddd   xmm6,xmm2
+        psrad   xmm0,DESCALE_P1
+        psrad   xmm6,DESCALE_P1
+
+        packssdw  xmm1,xmm4             ; xmm1=data3=(30 31 32 33 34 35 36 37)
+        packssdw  xmm0,xmm6             ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+        movdqa  xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+        movdqa    xmm4,xmm3             ; transpose coefficients(phase 1)
+        punpcklwd xmm3,xmm1             ; xmm3=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm4,xmm1             ; xmm4=(24 34 25 35 26 36 27 37)
+        movdqa    xmm6,xmm0             ; transpose coefficients(phase 1)
+        punpcklwd xmm0,xmm5             ; xmm0=(40 50 41 51 42 52 43 53)
+        punpckhwd xmm6,xmm5             ; xmm6=(44 54 45 55 46 56 47 57)
+
+        movdqa    xmm1,xmm7             ; transpose coefficients(phase 2)
+        punpckldq xmm7,xmm3             ; xmm7=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm1,xmm3             ; xmm1=(02 12 22 32 03 13 23 33)
+        movdqa    xmm5,xmm2             ; transpose coefficients(phase 2)
+        punpckldq xmm2,xmm4             ; xmm2=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm5,xmm4             ; xmm5=(06 16 26 36 07 17 27 37)
+
+        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+        movdqa  xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+        movdqa  XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+        movdqa    xmm2,xmm0             ; transpose coefficients(phase 2)
+        punpckldq xmm0,xmm3             ; xmm0=(40 50 60 70 41 51 61 71)
+        punpckhdq xmm2,xmm3             ; xmm2=(42 52 62 72 43 53 63 73)
+        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm4             ; xmm6=(44 54 64 74 45 55 65 75)
+        punpckhdq xmm5,xmm4             ; xmm5=(46 56 66 76 47 57 67 77)
+
+        movdqa     xmm3,xmm7            ; transpose coefficients(phase 3)
+        punpcklqdq xmm7,xmm0            ; xmm7=col0=(00 10 20 30 40 50 60 70)
+        punpckhqdq xmm3,xmm0            ; xmm3=col1=(01 11 21 31 41 51 61 71)
+        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
+        punpcklqdq xmm1,xmm2            ; xmm1=col2=(02 12 22 32 42 52 62 72)
+        punpckhqdq xmm4,xmm2            ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+        movdqa  xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+        movdqa  XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+        movdqa  XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
+        punpcklqdq xmm0,xmm6            ; xmm0=col4=(04 14 24 34 44 54 64 74)
+        punpckhqdq xmm3,xmm6            ; xmm3=col5=(05 15 25 35 45 55 65 75)
+        movdqa     xmm4,xmm2            ; transpose coefficients(phase 3)
+        punpcklqdq xmm2,xmm5            ; xmm2=col6=(06 16 26 36 46 56 66 76)
+        punpckhqdq xmm4,xmm5            ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        mov     eax, [original_ebp]
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]
+
+        ; -- Even part
+
+        ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+        ; (Original)
+        ; z1 = (z2 + z3) * 0.541196100;
+        ; tmp2 = z1 + z3 * -1.847759065;
+        ; tmp3 = z1 + z2 * 0.765366865;
+        ;
+        ; (This implementation)
+        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+        movdqa    xmm6,xmm1             ; xmm1=in2=z2
+        movdqa    xmm5,xmm1
+        punpcklwd xmm6,xmm2             ; xmm2=in6=z3
+        punpckhwd xmm5,xmm2
+        movdqa    xmm1,xmm6
+        movdqa    xmm2,xmm5
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]       ; xmm6=tmp3L
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]       ; xmm5=tmp3H
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=tmp2L
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm2=tmp2H
+
+        movdqa    xmm3,xmm7
+        paddw     xmm7,xmm0             ; xmm7=in0+in4
+        psubw     xmm3,xmm0             ; xmm3=in0-in4
+
+        pxor      xmm4,xmm4
+        pxor      xmm0,xmm0
+        punpcklwd xmm4,xmm7             ; xmm4=tmp0L
+        punpckhwd xmm0,xmm7             ; xmm0=tmp0H
+        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+        psrad     xmm0,(16-CONST_BITS)  ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+        movdqa  xmm7,xmm4
+        paddd   xmm4,xmm6               ; xmm4=tmp10L
+        psubd   xmm7,xmm6               ; xmm7=tmp13L
+        movdqa  xmm6,xmm0
+        paddd   xmm0,xmm5               ; xmm0=tmp10H
+        psubd   xmm6,xmm5               ; xmm6=tmp13H
+
+        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+        movdqa  XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+        pxor      xmm5,xmm5
+        pxor      xmm4,xmm4
+        punpcklwd xmm5,xmm3             ; xmm5=tmp1L
+        punpckhwd xmm4,xmm3             ; xmm4=tmp1H
+        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+        movdqa  xmm0,xmm5
+        paddd   xmm5,xmm1               ; xmm5=tmp11L
+        psubd   xmm0,xmm1               ; xmm0=tmp12L
+        movdqa  xmm7,xmm4
+        paddd   xmm4,xmm2               ; xmm4=tmp11H
+        psubd   xmm7,xmm2               ; xmm7=tmp12H
+
+        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+        movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+        movdqa  XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+        ; -- Odd part
+
+        movdqa  xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+        movdqa  xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+        movdqa  xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+        movdqa  xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+        movdqa  xmm5,xmm6
+        movdqa  xmm4,xmm3
+        paddw   xmm5,xmm1               ; xmm5=z3
+        paddw   xmm4,xmm2               ; xmm4=z4
+
+        ; (Original)
+        ; z5 = (z3 + z4) * 1.175875602;
+        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+        ; z3 += z5;  z4 += z5;
+        ;
+        ; (This implementation)
+        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+        movdqa    xmm0,xmm5
+        movdqa    xmm7,xmm5
+        punpcklwd xmm0,xmm4
+        punpckhwd xmm7,xmm4
+        movdqa    xmm5,xmm0
+        movdqa    xmm4,xmm7
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm0=z3L
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm7=z3H
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]       ; xmm5=z4L
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]       ; xmm4=z4H
+
+        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+        ; (Original)
+        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+        ;
+        ; (This implementation)
+        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+        ; tmp0 += z3;  tmp1 += z4;
+        ; tmp2 += z3;  tmp3 += z4;
+
+        movdqa    xmm0,xmm1
+        movdqa    xmm7,xmm1
+        punpcklwd xmm0,xmm3
+        punpckhwd xmm7,xmm3
+        movdqa    xmm1,xmm0
+        movdqa    xmm3,xmm7
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm0=tmp0L
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm7=tmp0H
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm1=tmp3L
+        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm3=tmp3H
+
+        paddd   xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+        paddd   xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+        paddd   xmm1,xmm5               ; xmm1=tmp3L
+        paddd   xmm3,xmm4               ; xmm3=tmp3H
+
+        movdqa  XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+        movdqa  XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+        movdqa    xmm0,xmm2
+        movdqa    xmm7,xmm2
+        punpcklwd xmm0,xmm6
+        punpckhwd xmm7,xmm6
+        movdqa    xmm2,xmm0
+        movdqa    xmm6,xmm7
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm0=tmp1L
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm7=tmp1H
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm2=tmp2L
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm6=tmp2H
+
+        paddd   xmm0,xmm5               ; xmm0=tmp1L
+        paddd   xmm7,xmm4               ; xmm7=tmp1H
+        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+        ; -- Final output stage
+
+        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+        movdqa  xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+        movdqa  xmm0,xmm5
+        movdqa  xmm7,xmm4
+        paddd   xmm5,xmm1               ; xmm5=data0L
+        paddd   xmm4,xmm3               ; xmm4=data0H
+        psubd   xmm0,xmm1               ; xmm0=data7L
+        psubd   xmm7,xmm3               ; xmm7=data7H
+
+        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm1=[PD_DESCALE_P2]
+
+        paddd   xmm5,xmm1
+        paddd   xmm4,xmm1
+        psrad   xmm5,DESCALE_P2
+        psrad   xmm4,DESCALE_P2
+        paddd   xmm0,xmm1
+        paddd   xmm7,xmm1
+        psrad   xmm0,DESCALE_P2
+        psrad   xmm7,DESCALE_P2
+
+        packssdw  xmm5,xmm4             ; xmm5=data0=(00 10 20 30 40 50 60 70)
+        packssdw  xmm0,xmm7             ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+        movdqa  xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+        movdqa  xmm4,xmm3
+        movdqa  xmm7,xmm1
+        paddd   xmm3,xmm2               ; xmm3=data1L
+        paddd   xmm1,xmm6               ; xmm1=data1H
+        psubd   xmm4,xmm2               ; xmm4=data6L
+        psubd   xmm7,xmm6               ; xmm7=data6H
+
+        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm2=[PD_DESCALE_P2]
+
+        paddd   xmm3,xmm2
+        paddd   xmm1,xmm2
+        psrad   xmm3,DESCALE_P2
+        psrad   xmm1,DESCALE_P2
+        paddd   xmm4,xmm2
+        paddd   xmm7,xmm2
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm7,DESCALE_P2
+
+        packssdw  xmm3,xmm1             ; xmm3=data1=(01 11 21 31 41 51 61 71)
+        packssdw  xmm4,xmm7             ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+        packsswb  xmm5,xmm4             ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        packsswb  xmm3,xmm0             ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        movdqa  xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+        movdqa  xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+        movdqa  xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        movdqa  xmm4,xmm6
+        movdqa  xmm0,xmm2
+        paddd   xmm6,xmm1               ; xmm6=data2L
+        paddd   xmm2,xmm7               ; xmm2=data2H
+        psubd   xmm4,xmm1               ; xmm4=data5L
+        psubd   xmm0,xmm7               ; xmm0=data5H
+
+        movdqa  xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm5=[PD_DESCALE_P2]
+
+        paddd   xmm6,xmm5
+        paddd   xmm2,xmm5
+        psrad   xmm6,DESCALE_P2
+        psrad   xmm2,DESCALE_P2
+        paddd   xmm4,xmm5
+        paddd   xmm0,xmm5
+        psrad   xmm4,DESCALE_P2
+        psrad   xmm0,DESCALE_P2
+
+        packssdw  xmm6,xmm2             ; xmm6=data2=(02 12 22 32 42 52 62 72)
+        packssdw  xmm4,xmm0             ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+        movdqa  xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+        movdqa  xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+        movdqa  xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+        movdqa  xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+        movdqa  xmm2,xmm3
+        movdqa  xmm0,xmm1
+        paddd   xmm3,xmm7               ; xmm3=data3L
+        paddd   xmm1,xmm5               ; xmm1=data3H
+        psubd   xmm2,xmm7               ; xmm2=data4L
+        psubd   xmm0,xmm5               ; xmm0=data4H
+
+        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm7=[PD_DESCALE_P2]
+
+        paddd   xmm3,xmm7
+        paddd   xmm1,xmm7
+        psrad   xmm3,DESCALE_P2
+        psrad   xmm1,DESCALE_P2
+        paddd   xmm2,xmm7
+        paddd   xmm0,xmm7
+        psrad   xmm2,DESCALE_P2
+        psrad   xmm0,DESCALE_P2
+
+        movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm5=[PB_CENTERJSAMP]
+
+        packssdw  xmm3,xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+        packssdw  xmm2,xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+        movdqa    xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+        packsswb  xmm6,xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+        packsswb  xmm3,xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+        paddb     xmm7,xmm5
+        paddb     xmm1,xmm5
+        paddb     xmm6,xmm5
+        paddb     xmm3,xmm5
+
+        movdqa    xmm0,xmm7     ; transpose coefficients(phase 1)
+        punpcklbw xmm7,xmm1     ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+        punpckhbw xmm0,xmm1     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+        movdqa    xmm2,xmm6     ; transpose coefficients(phase 1)
+        punpcklbw xmm6,xmm3     ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+        punpckhbw xmm2,xmm3     ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+        movdqa    xmm4,xmm7     ; transpose coefficients(phase 2)
+        punpcklwd xmm7,xmm6     ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+        punpckhwd xmm4,xmm6     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+        movdqa    xmm5,xmm2     ; transpose coefficients(phase 2)
+        punpcklwd xmm2,xmm0     ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+        punpckhwd xmm5,xmm0     ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+        movdqa    xmm1,xmm7     ; transpose coefficients(phase 3)
+        punpckldq xmm7,xmm2     ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+        punpckhdq xmm1,xmm2     ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+        movdqa    xmm3,xmm4     ; transpose coefficients(phase 3)
+        punpckldq xmm4,xmm5     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+        punpckhdq xmm3,xmm5     ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+        pshufd  xmm6,xmm7,0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+        pshufd  xmm0,xmm1,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+        pshufd  xmm2,xmm4,0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+        pshufd  xmm5,xmm3,0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctred-mmx.asm b/simd/jidctred-mmx.asm
new file mode 100644
index 0000000..21e17fc
--- /dev/null
+++ b/simd/jidctred-mmx.asm
@@ -0,0 +1,706 @@
+;
+; jidctred-mmx.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point parameters.
+;   CONST_BITS - number of fraction bits in the F_* multiplier constants
+;                below (each is the real constant times 2^CONST_BITS,
+;                rounded to an integer).
+;   PASS1_BITS - left shift applied to the pass-1 output of the all-zero-AC
+;                shortcut (psllw ...,PASS1_BITS); the DESCALE_* shift counts
+;                below are written in terms of it.
+%define CONST_BITS      13
+%define PASS1_BITS      2
+
+; Right-shift counts used when descaling 32-bit intermediate results.
+; The _4 pair is used by the 4x4 routine (P1 = pass 1, P2 = pass 2).
+; The _2 pair feeds PD_DESCALE_P1_2 / PD_DESCALE_P2_2 -- presumably for the
+; 2x2 routine; confirm against its body.
+%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
+
+; Multiplier constants.  The name encodes the leading digits of the value
+; (F_1_847 ~= 1.847...).  For the default CONST_BITS == 13 the values are
+; precomputed: F_x = round(x * 8192).
+%if CONST_BITS == 13
+F_0_211 equ      1730           ; FIX(0.211164243)
+F_0_509 equ      4176           ; FIX(0.509795579)
+F_0_601 equ      4926           ; FIX(0.601344887)
+F_0_720 equ      5906           ; FIX(0.720959822)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_850 equ      6967           ; FIX(0.850430095)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_061 equ      8697           ; FIX(1.061594337)
+F_1_272 equ     10426           ; FIX(1.272758580)
+F_1_451 equ     11893           ; FIX(1.451774981)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_2_172 equ     17799           ; FIX(2.172734803)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_624 equ     29692           ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+; Instead, each constant is given as an integer (apparently the real value
+; scaled by 2^30) and DESCALE(x,n) rounds it down by n = 30-CONST_BITS bits:
+; add half of 2^n, then shift right by n.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
+F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
+F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
+F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
+F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
+F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+; Read-only constant table for the reduced-size MMX IDCT routines.
+; Every entry is 8 bytes (one MMWORD):
+;   PW_*  - a pair of 16-bit multipliers repeated twice, used as the memory
+;           operand of pmaddwd.  In the names, Fnnn is +F_n_nn... and MFnnn
+;           is the negated constant (e.g. PW_F184_MF076 = F_1_847,-F_0_765).
+;   PD_*  - two dwords holding the rounding term 1 << (shift-1), added
+;           before the matching psrad by DESCALE_*.
+;   PB_CENTERJSAMP - eight bytes of CENTERJSAMPLE, added with paddb after
+;           the results are packed to bytes.
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076   times 2 dw  F_1_847,-F_0_765
+PW_F256_F089    times 2 dw  F_2_562, F_0_899
+PW_F106_MF217   times 2 dw  F_1_061,-F_2_172
+PW_MF060_MF050  times 2 dw -F_0_601,-F_0_509
+PW_F145_MF021   times 2 dw  F_1_451,-F_0_211
+PW_F362_MF127   times 2 dw  F_3_624,-F_1_272
+PW_F085_MF072   times 2 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+; Outline:
+;   Pass 1 walks the input four columns at a time (DCTSIZE/4 iterations),
+;   dequantizes by multiplying with dct_table, and writes four MMWORDs of
+;   transposed results per iteration into the on-stack workspace.
+;   Coefficient row 4 is never read by this routine.
+;   Pass 2 reads the workspace once (no loop), descales, packs to bytes,
+;   adds CENTERJSAMPLE and stores four bytes to each of output rows 0..3
+;   at byte offset output_col.
+;
+; Registers: esi and edi are saved and restored; ebx is saved/restored via
+; pushpic/poppic; eax, ecx, edx and mm0-mm7 are clobbered.  emms is executed
+; before returning.
+;
+
+; Argument offsets relative to the frame base b (the value of esp right
+; after "push ebp").
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+; Stack layout below the 8-byte-aligned ebp:
+;   [ebp+0]              saved frame base (see prologue)
+;   wk(0)..wk(WK_NUM-1)  MMWORD scratch slots directly below ebp
+;   workspace            DCTSIZE2 JCOEFs directly below wk(0)
+%define original_ebp    ebp+0
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM          2
+%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+        align   16
+        global  EXTN(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+        ; Prologue: build an MMWORD-aligned frame.  eax keeps the unaligned
+        ; frame base (address of the saved ebp); it is also stored at [ebp]
+        ; so that the epilogue can restore esp with "pop esp".
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4
+        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+        mov     [esp],eax
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [workspace]
+        pushpic ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input, store into work array.
+
+        ; eax still holds the frame base from the prologue, so the reload
+        ; below is left commented out.
+;       mov     eax, [original_ebp]
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+        lea     edi, [workspace]                        ; JCOEF *wsptr
+        mov     ecx, DCTSIZE/4                          ; ctr
+        alignx  16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+        ; Shortcut test: are all AC terms used by this routine (rows
+        ; 1,2,3,5,6,7) zero in the current group of columns?
+        ; First a cheap scalar check on the leading dword of rows 1 and 2.
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        jnz     short .columnDCT
+
+        ; Then OR together the full MMWORDs of rows 1,2,3,5,6,7.
+        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     mm0,mm1
+        ; Pack the four words to bytes so one 32-bit test covers them all
+        ; (signed saturation maps a non-zero word to a non-zero byte).
+        packsswb mm0,mm0
+        movd    eax,mm0
+        test    eax,eax
+        jnz     short .columnDCT
+
+        ; -- AC terms all zero
+
+        ; Dequantize the DC row only, scale by 2^PASS1_BITS, and replicate
+        ; each column's value across a whole MMWORD of the workspace.
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        psllw   mm0,PASS1_BITS
+
+        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
+        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
+        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
+
+        movq      mm1,mm0
+        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
+        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
+        movq      mm3,mm2
+        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
+        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+        jmp     near .nextcolumn
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Odd part
+
+        ; Load and dequantize rows 1,3,5,7.
+        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; Interleave rows 1 and 3 so that each pmaddwd computes
+        ; row1*c_a + row3*c_b as a 32-bit value per column.
+        movq      mm4,mm0
+        movq      mm5,mm0
+        punpcklwd mm4,mm1
+        punpckhwd mm5,mm1
+        movq      mm0,mm4
+        movq      mm1,mm5
+        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
+        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
+        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
+
+        ; Same for rows 5 and 7; the parenthesised names are partial sums.
+        movq      mm6,mm2
+        movq      mm7,mm2
+        punpcklwd mm6,mm3
+        punpckhwd mm7,mm3
+        movq      mm2,mm6
+        movq      mm3,mm7
+        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
+        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
+        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
+        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
+
+        paddd   mm6,mm4                 ; mm6=tmp2L
+        paddd   mm7,mm5                 ; mm7=tmp2H
+        paddd   mm2,mm0                 ; mm2=tmp0L
+        paddd   mm3,mm1                 ; mm3=tmp0H
+
+        ; Spill tmp0 to free registers for the even part; tmp2 stays in
+        ; mm6/mm7.
+        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+        ; -- Even part
+
+        ; Load and dequantize rows 0,2,6.
+        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; Unpacking into zeroed registers places each word of row 0 in the
+        ; upper half of a dword; the arithmetic right shift then yields the
+        ; sign-extended value scaled by 2^(CONST_BITS+1).
+        pxor      mm1,mm1
+        pxor      mm2,mm2
+        punpcklwd mm1,mm4               ; mm1=tmp0L
+        punpckhwd mm2,mm4               ; mm2=tmp0H
+        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+        movq      mm3,mm5               ; mm5=in2=z2
+        punpcklwd mm5,mm0               ; mm0=in6=z3
+        punpckhwd mm3,mm0
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
+        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
+
+        movq    mm4,mm1
+        movq    mm0,mm2
+        paddd   mm1,mm5                 ; mm1=tmp10L
+        paddd   mm2,mm3                 ; mm2=tmp10H
+        psubd   mm4,mm5                 ; mm4=tmp12L
+        psubd   mm0,mm3                 ; mm0=tmp12H
+
+        ; -- Final output stage
+
+        ; data0/data3 = tmp10 +/- odd tmp2 (mm6/mm7), rounded and descaled.
+        movq    mm5,mm1
+        movq    mm3,mm2
+        paddd   mm1,mm6                 ; mm1=data0L
+        paddd   mm2,mm7                 ; mm2=data0H
+        psubd   mm5,mm6                 ; mm5=data3L
+        psubd   mm3,mm7                 ; mm3=data3H
+
+        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm6=[PD_DESCALE_P1_4]
+
+        paddd   mm1,mm6
+        paddd   mm2,mm6
+        psrad   mm1,DESCALE_P1_4
+        psrad   mm2,DESCALE_P1_4
+        paddd   mm5,mm6
+        paddd   mm3,mm6
+        psrad   mm5,DESCALE_P1_4
+        psrad   mm3,DESCALE_P1_4
+
+        packssdw  mm1,mm2               ; mm1=data0=(00 01 02 03)
+        packssdw  mm5,mm3               ; mm5=data3=(30 31 32 33)
+
+        ; data1/data2 = tmp12 +/- odd tmp0 (reloaded from wk).
+        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+        movq    mm2,mm4
+        movq    mm3,mm0
+        paddd   mm4,mm7                 ; mm4=data1L
+        paddd   mm0,mm6                 ; mm0=data1H
+        psubd   mm2,mm7                 ; mm2=data2L
+        psubd   mm3,mm6                 ; mm3=data2H
+
+        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm7=[PD_DESCALE_P1_4]
+
+        paddd   mm4,mm7
+        paddd   mm0,mm7
+        psrad   mm4,DESCALE_P1_4
+        psrad   mm0,DESCALE_P1_4
+        paddd   mm2,mm7
+        paddd   mm3,mm7
+        psrad   mm2,DESCALE_P1_4
+        psrad   mm3,DESCALE_P1_4
+
+        packssdw  mm4,mm0               ; mm4=data1=(10 11 12 13)
+        packssdw  mm2,mm3               ; mm2=data2=(20 21 22 23)
+
+        ; 4x4 word transpose so each stored MMWORD holds the four results
+        ; of one input column.
+        movq      mm6,mm1               ; transpose coefficients(phase 1)
+        punpcklwd mm1,mm4               ; mm1=(00 10 01 11)
+        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
+        movq      mm7,mm2               ; transpose coefficients(phase 1)
+        punpcklwd mm2,mm5               ; mm2=(20 30 21 31)
+        punpckhwd mm7,mm5               ; mm7=(22 32 23 33)
+
+        movq      mm0,mm1               ; transpose coefficients(phase 2)
+        punpckldq mm1,mm2               ; mm1=(00 10 20 30)
+        punpckhdq mm0,mm2               ; mm0=(01 11 21 31)
+        movq      mm3,mm6               ; transpose coefficients(phase 2)
+        punpckldq mm6,mm7               ; mm6=(02 12 22 32)
+        punpckhdq mm3,mm7               ; mm3=(03 13 23 33)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+        ; Advance input/quant pointers by four columns and the workspace
+        ; pointer by four workspace rows.
+        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
+        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
+        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
+        dec     ecx                                     ; ctr
+        jnz     near .columnloop
+
+        ; ---- Pass 2: process rows from work array, store into output array.
+
+        ; eax was clobbered by the zero-column test, so reload the frame
+        ; base before fetching the remaining arguments.
+        mov     eax, [original_ebp]
+        lea     esi, [workspace]                        ; JCOEF *wsptr
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]
+
+        ; -- Odd part
+
+        ; Same arithmetic as pass 1, but on workspace data (no
+        ; dequantization) and for all four output rows at once.
+        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+        movq      mm4,mm0
+        movq      mm5,mm0
+        punpcklwd mm4,mm1
+        punpckhwd mm5,mm1
+        movq      mm0,mm4
+        movq      mm1,mm5
+        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
+        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
+        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
+
+        movq      mm6,mm2
+        movq      mm7,mm2
+        punpcklwd mm6,mm3
+        punpckhwd mm7,mm3
+        movq      mm2,mm6
+        movq      mm3,mm7
+        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
+        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
+        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
+        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
+
+        paddd   mm6,mm4                 ; mm6=tmp2L
+        paddd   mm7,mm5                 ; mm7=tmp2H
+        paddd   mm2,mm0                 ; mm2=tmp0L
+        paddd   mm3,mm1                 ; mm3=tmp0H
+
+        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+        ; -- Even part
+
+        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+        pxor      mm1,mm1
+        pxor      mm2,mm2
+        punpcklwd mm1,mm4               ; mm1=tmp0L
+        punpckhwd mm2,mm4               ; mm2=tmp0H
+        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+        movq      mm3,mm5               ; mm5=in2=z2
+        punpcklwd mm5,mm0               ; mm0=in6=z3
+        punpckhwd mm3,mm0
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
+        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
+
+        movq    mm4,mm1
+        movq    mm0,mm2
+        paddd   mm1,mm5                 ; mm1=tmp10L
+        paddd   mm2,mm3                 ; mm2=tmp10H
+        psubd   mm4,mm5                 ; mm4=tmp12L
+        psubd   mm0,mm3                 ; mm0=tmp12H
+
+        ; -- Final output stage
+
+        ; Pass 2 descales by DESCALE_P2_4 instead of DESCALE_P1_4.
+        movq    mm5,mm1
+        movq    mm3,mm2
+        paddd   mm1,mm6                 ; mm1=data0L
+        paddd   mm2,mm7                 ; mm2=data0H
+        psubd   mm5,mm6                 ; mm5=data3L
+        psubd   mm3,mm7                 ; mm3=data3H
+
+        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm6=[PD_DESCALE_P2_4]
+
+        paddd   mm1,mm6
+        paddd   mm2,mm6
+        psrad   mm1,DESCALE_P2_4
+        psrad   mm2,DESCALE_P2_4
+        paddd   mm5,mm6
+        paddd   mm3,mm6
+        psrad   mm5,DESCALE_P2_4
+        psrad   mm3,DESCALE_P2_4
+
+        packssdw  mm1,mm2               ; mm1=data0=(00 10 20 30)
+        packssdw  mm5,mm3               ; mm5=data3=(03 13 23 33)
+
+        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+        movq    mm2,mm4
+        movq    mm3,mm0
+        paddd   mm4,mm7                 ; mm4=data1L
+        paddd   mm0,mm6                 ; mm0=data1H
+        psubd   mm2,mm7                 ; mm2=data2L
+        psubd   mm3,mm6                 ; mm3=data2H
+
+        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm7=[PD_DESCALE_P2_4]
+
+        paddd   mm4,mm7
+        paddd   mm0,mm7
+        psrad   mm4,DESCALE_P2_4
+        psrad   mm0,DESCALE_P2_4
+        paddd   mm2,mm7
+        paddd   mm3,mm7
+        psrad   mm2,DESCALE_P2_4
+        psrad   mm3,DESCALE_P2_4
+
+        packssdw  mm4,mm0               ; mm4=data1=(01 11 21 31)
+        packssdw  mm2,mm3               ; mm2=data2=(02 12 22 32)
+
+        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]
+
+        ; Saturate to signed bytes, then add CENTERJSAMPLE to every byte.
+        packsswb  mm1,mm2               ; mm1=(00 10 20 30 02 12 22 32)
+        packsswb  mm4,mm5               ; mm4=(01 11 21 31 03 13 23 33)
+        paddb     mm1,mm6
+        paddb     mm4,mm6
+
+        ; Byte transpose so each dword holds one output row.
+        movq      mm7,mm1               ; transpose coefficients(phase 1)
+        punpcklbw mm1,mm4               ; mm1=(00 01 10 11 20 21 30 31)
+        punpckhbw mm7,mm4               ; mm7=(02 03 12 13 22 23 32 33)
+
+        movq      mm0,mm1               ; transpose coefficients(phase 2)
+        punpcklwd mm1,mm7               ; mm1=(00 01 02 03 10 11 12 13)
+        punpckhwd mm0,mm7               ; mm0=(20 21 22 23 30 31 32 33)
+
+        ; Low dwords go to output rows 0 and 2 ...
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+        ; ... then the high dwords are shifted down and go to rows 1 and 3.
+        psrlq   mm1,4*BYTE_BIT
+        psrlq   mm0,4*BYTE_BIT
+
+        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+        emms            ; empty MMX state
+
+        ; Epilogue: undo the aligned frame ([ebp] holds the frame base
+        ; saved by the prologue).
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+        align   16
+        global  EXTN(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):               ; 2x2 reduced-size IDCT using the MMX registers
+        push    ebp
+        mov     ebp,esp                 ; ebp = frame base used by the argument macros
+        push    ebx                     ; ebx is used for the GOT address below
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+
+        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr
+
+        ; | input:                  | result:        |
+        ; | 00 01 ** 03 ** 05 ** 07 |                |
+        ; | 10 11 ** 13 ** 15 ** 17 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+        ; | 50 51 ** 53 ** 55 ** 57 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+        ; -- Odd part
+
+        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+        ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+        pcmpeqd   mm7,mm7               ; mm7=all ones
+        pslld     mm7,WORD_BIT          ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+        movq      mm4,mm0               ; mm4=(10 11 ** 13)
+        movq      mm5,mm2               ; mm5=(50 51 ** 53)
+        punpcklwd mm4,mm1               ; mm4=(10 30 11 31)
+        punpcklwd mm5,mm3               ; mm5=(50 70 51 71)
+        pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+        psrld   mm0,WORD_BIT            ; mm0=(11 -- 13 --)
+        pand    mm1,mm7                 ; mm1=(-- 31 -- 33)
+        psrld   mm2,WORD_BIT            ; mm2=(51 -- 53 --)
+        pand    mm3,mm7                 ; mm3=(-- 71 -- 73)
+        por     mm0,mm1                 ; mm0=(11 31 13 33)
+        por     mm2,mm3                 ; mm2=(51 71 53 73)
+        pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
+        pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+        paddd   mm4,mm5                 ; mm4=tmp0[col0 col1]
+
+        movq    mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+        movq    mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+        pmullw  mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movq    mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+        movq    mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+        pmullw  mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+        ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+        psrld   mm6,WORD_BIT            ; mm6=(15 -- 17 --)
+        pand    mm1,mm7                 ; mm1=(-- 35 -- 37)
+        psrld   mm3,WORD_BIT            ; mm3=(55 -- 57 --)
+        pand    mm5,mm7                 ; mm5=(-- 75 -- 77)
+        por     mm6,mm1                 ; mm6=(15 35 17 37)
+        por     mm3,mm5                 ; mm3=(55 75 57 77)
+        pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
+        pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
+
+        paddd   mm0,mm2                 ; mm0=tmp0[col1 col3]
+        paddd   mm6,mm3                 ; mm6=tmp0[col5 col7]
+
+        ; -- Even part
+
+        movq    mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movq    mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+        pmullw  mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+        movq    mm2,mm1                         ; mm2=(00 01 ** 03)
+        pslld   mm1,WORD_BIT                    ; mm1=(-- 00 -- **)
+        psrad   mm1,(WORD_BIT-CONST_BITS-2)     ; mm1=tmp10[col0 ****]
+
+        pand    mm2,mm7                         ; mm2=(-- 01 -- 03)
+        pand    mm5,mm7                         ; mm5=(-- 05 -- 07)
+        psrad   mm2,(WORD_BIT-CONST_BITS-2)     ; mm2=tmp10[col1 col3]
+        psrad   mm5,(WORD_BIT-CONST_BITS-2)     ; mm5=tmp10[col5 col7]
+
+        ; -- Final output stage
+
+        movq      mm3,mm1
+        paddd     mm1,mm4               ; mm1=data0[col0 ****]=(A0 **)
+        psubd     mm3,mm4               ; mm3=data1[col0 ****]=(B0 **)
+        punpckldq mm1,mm3               ; mm1=(A0 B0)
+
+        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]       ; mm7=[PD_DESCALE_P1_2]
+
+        movq    mm4,mm2
+        movq    mm3,mm5
+        paddd   mm2,mm0                 ; mm2=data0[col1 col3]=(A1 A3)
+        paddd   mm5,mm6                 ; mm5=data0[col5 col7]=(A5 A7)
+        psubd   mm4,mm0                 ; mm4=data1[col1 col3]=(B1 B3)
+        psubd   mm3,mm6                 ; mm3=data1[col5 col7]=(B5 B7)
+
+        paddd   mm1,mm7                 ; add [PD_DESCALE_P1_2] to (A0 B0) ...
+        psrad   mm1,DESCALE_P1_2        ; ... then shift right by DESCALE_P1_2
+
+        paddd   mm2,mm7
+        paddd   mm5,mm7
+        psrad   mm2,DESCALE_P1_2
+        psrad   mm5,DESCALE_P1_2
+        paddd   mm4,mm7
+        paddd   mm3,mm7
+        psrad   mm4,DESCALE_P1_2
+        psrad   mm3,DESCALE_P1_2
+
+        ; ---- Pass 2: process rows, store into output array.
+
+        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(ebp)]       ; eax=output_col
+
+        ; | input:| result:|
+        ; | A0 B0 |        |
+        ; | A1 B1 | C0 C1  |
+        ; | A3 B3 | D0 D1  |
+        ; | A5 B5 |        |
+        ; | A7 B7 |        |
+
+        ; -- Odd part
+
+        packssdw  mm2,mm4               ; mm2=(A1 A3 B1 B3)
+        packssdw  mm5,mm3               ; mm5=(A5 A7 B5 B7)
+        pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
+        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+        paddd     mm2,mm5               ; mm2=tmp0[row0 row1]
+
+        ; -- Even part
+
+        pslld     mm1,(CONST_BITS+2)    ; mm1=tmp10[row0 row1]
+
+        ; -- Final output stage
+
+        movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]     ; mm0=[PD_DESCALE_P2_2]
+
+        movq      mm6,mm1
+        paddd     mm1,mm2               ; mm1=data0[row0 row1]=(C0 C1)
+        psubd     mm6,mm2               ; mm6=data1[row0 row1]=(D0 D1)
+
+        paddd     mm1,mm0
+        paddd     mm6,mm0
+        psrad     mm1,DESCALE_P2_2
+        psrad     mm6,DESCALE_P2_2
+
+        movq      mm7,mm1               ; transpose coefficients
+        punpckldq mm1,mm6               ; mm1=(C0 D0)
+        punpckhdq mm7,mm6               ; mm7=(C1 D1)
+
+        packssdw  mm1,mm7               ; mm1=(C0 D0 C1 D1)
+        packsswb  mm1,mm1               ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+        paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; last use of ebx as the GOT address
+
+        movd    ecx,mm1                 ; ecx=(C0 D0 C1 D1)
+        movd    ebx,mm1                 ; ebx=(C0 D0 C1 D1); overwrites the GOT address
+        shr     ecx,2*BYTE_BIT          ; ecx=(C1 D1 -- --)
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; edx=output_buf[0]
+        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; esi=output_buf[1]
+        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx       ; row 0 <- (C0 D0)
+        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx       ; row 1 <- (C1 D1)
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctred-sse2-64.asm b/simd/jidctred-sse2-64.asm
new file mode 100644
index 0000000..d1b1874
--- /dev/null
+++ b/simd/jidctred-sse2-64.asm
@@ -0,0 +1,576 @@
+;
+; jidctred.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13              ; FIX(x) below is x * 2^CONST_BITS, rounded
+%define PASS1_BITS      2               ; pass-1 scaling shift (see psllw xmm0,PASS1_BITS)
+
+%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)       ; psrad count, pass 1 of the 4x4 IDCT
+%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)     ; psrad count, pass 2 of the 4x4 IDCT
+%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)       ; psrad count, pass 1 of the 2x2 IDCT
+%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)     ; psrad count, pass 2 of the 2x2 IDCT
+
+%if CONST_BITS == 13
+F_0_211 equ      1730           ; FIX(0.211164243)
+F_0_509 equ      4176           ; FIX(0.509795579)
+F_0_601 equ      4926           ; FIX(0.601344887)
+F_0_720 equ      5906           ; FIX(0.720959822)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_850 equ      6967           ; FIX(0.850430095)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_061 equ      8697           ; FIX(1.061594337)
+F_1_272 equ     10426           ; FIX(1.272758580)
+F_1_451 equ     11893           ; FIX(1.451774981)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_2_172 equ     17799           ; FIX(2.172734803)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_624 equ     29692           ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))         ; right shift by n with rounding
+F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
+F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
+F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
+F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
+F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
+F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4 dw  F_1_847,-F_0_765            ; PW_*: word pairs used as pmaddwd operands
+PW_F256_F089    times 4 dw  F_2_562, F_0_899
+PW_F106_MF217   times 4 dw  F_1_061,-F_2_172
+PW_MF060_MF050  times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021   times 4 dw  F_1_451,-F_0_211
+PW_F362_MF127   times 4 dw  F_3_624,-F_1_272
+PW_F085_MF072   times 4 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4 times 4 dd  1 << (DESCALE_P1_4-1)       ; PD_DESCALE_*: half of 2^n, added before psrad by n
+PD_DESCALE_P2_4 times 4 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2 times 4 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2 times 4 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE               ; added (paddb) to the packed output bytes
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp    rbp+0
+%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2
+
+        align   16
+        global  EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+        push    rbp
+        mov     rax,rsp                         ; rax = rsp after the push (points at the saved rbp)
+        sub     rsp, byte 4
+        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [rsp],rax                       ; [original_rbp] <- rax (rbp = rsp on the next line)
+        mov     rbp,rsp                         ; rbp = aligned rbp
+        lea     rsp, [wk(0)]                    ; rsp = &wk[0], WK_NUM xmmwords below rbp
+        collect_args                            ; macro defined outside this file; presumably sets up r10-r13 as listed above (confirm)
+
+        ; ---- Pass 1: process columns from input.
+
+        mov     rdx, r10                ; quantptr
+        mov     rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        jnz     short .columnDCT        ; nonzero AC term found -> full column IDCT
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        por     xmm0,xmm1               ; xmm0=OR of blocks 1,2,3,5,6,7 (block 4 is not read)
+        packsswb xmm0,xmm0
+        packsswb xmm0,xmm0
+        movd    eax,xmm0
+        test    rax,rax
+        jnz     short .columnDCT        ; nonzero AC term found -> full column IDCT
+
+        ; -- AC terms all zero
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        psllw   xmm0,PASS1_BITS
+
+        movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
+
+        pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+        pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+        pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+        pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+        jmp     near .column_end        ; skip the full column IDCT
+%endif
+.columnDCT:
+
+        ; -- Odd part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        movdqa    xmm4,xmm0
+        movdqa    xmm5,xmm0
+        punpcklwd xmm4,xmm1
+        punpckhwd xmm5,xmm1
+        movdqa    xmm0,xmm4
+        movdqa    xmm1,xmm5
+        pmaddwd   xmm4,[rel PW_F256_F089]       ; xmm4=(tmp2L)
+        pmaddwd   xmm5,[rel PW_F256_F089]       ; xmm5=(tmp2H)
+        pmaddwd   xmm0,[rel PW_F106_MF217]      ; xmm0=(tmp0L)
+        pmaddwd   xmm1,[rel PW_F106_MF217]      ; xmm1=(tmp0H)
+
+        movdqa    xmm6,xmm2
+        movdqa    xmm7,xmm2
+        punpcklwd xmm6,xmm3
+        punpckhwd xmm7,xmm3
+        movdqa    xmm2,xmm6
+        movdqa    xmm3,xmm7
+        pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2L)
+        pmaddwd   xmm7,[rel PW_MF060_MF050]     ; xmm7=(tmp2H)
+        pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0L)
+        pmaddwd   xmm3,[rel PW_F145_MF021]      ; xmm3=(tmp0H)
+
+        paddd   xmm6,xmm4               ; xmm6=tmp2L
+        paddd   xmm7,xmm5               ; xmm7=tmp2H
+        paddd   xmm2,xmm0               ; xmm2=tmp0L
+        paddd   xmm3,xmm1               ; xmm3=tmp0H
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+        ; -- Even part
+
+        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        pxor      xmm1,xmm1
+        pxor      xmm2,xmm2
+        punpcklwd xmm1,xmm4             ; xmm1=tmp0L
+        punpckhwd xmm2,xmm4             ; xmm2=tmp0H
+        psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+        psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+        movdqa    xmm3,xmm5             ; xmm5=in2=z2
+        punpcklwd xmm5,xmm0             ; xmm0=in6=z3
+        punpckhwd xmm3,xmm0
+        pmaddwd   xmm5,[rel PW_F184_MF076]      ; xmm5=tmp2L
+        pmaddwd   xmm3,[rel PW_F184_MF076]      ; xmm3=tmp2H
+
+        movdqa  xmm4,xmm1
+        movdqa  xmm0,xmm2
+        paddd   xmm1,xmm5               ; xmm1=tmp10L
+        paddd   xmm2,xmm3               ; xmm2=tmp10H
+        psubd   xmm4,xmm5               ; xmm4=tmp12L
+        psubd   xmm0,xmm3               ; xmm0=tmp12H
+
+        ; -- Final output stage
+
+        movdqa  xmm5,xmm1
+        movdqa  xmm3,xmm2
+        paddd   xmm1,xmm6               ; xmm1=data0L
+        paddd   xmm2,xmm7               ; xmm2=data0H
+        psubd   xmm5,xmm6               ; xmm5=data3L
+        psubd   xmm3,xmm7               ; xmm3=data3H
+
+        movdqa  xmm6,[rel PD_DESCALE_P1_4]      ; xmm6=[rel PD_DESCALE_P1_4]
+
+        paddd   xmm1,xmm6
+        paddd   xmm2,xmm6
+        psrad   xmm1,DESCALE_P1_4
+        psrad   xmm2,DESCALE_P1_4
+        paddd   xmm5,xmm6
+        paddd   xmm3,xmm6
+        psrad   xmm5,DESCALE_P1_4
+        psrad   xmm3,DESCALE_P1_4
+
+        packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
+        packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+        movdqa  xmm2,xmm4
+        movdqa  xmm3,xmm0
+        paddd   xmm4,xmm7               ; xmm4=data1L
+        paddd   xmm0,xmm6               ; xmm0=data1H
+        psubd   xmm2,xmm7               ; xmm2=data2L
+        psubd   xmm3,xmm6               ; xmm3=data2H
+
+        movdqa  xmm7,[rel PD_DESCALE_P1_4]      ; xmm7=[rel PD_DESCALE_P1_4]
+
+        paddd   xmm4,xmm7
+        paddd   xmm0,xmm7
+        psrad   xmm4,DESCALE_P1_4
+        psrad   xmm0,DESCALE_P1_4
+        paddd   xmm2,xmm7
+        paddd   xmm3,xmm7
+        psrad   xmm2,DESCALE_P1_4
+        psrad   xmm3,DESCALE_P1_4
+
+        packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
+        packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+        movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
+        movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
+
+        movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+        movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows, store into output array.
+
+        mov     rax, [original_rbp]     ; NOTE(review): rax is overwritten by "mov eax, r13d" two lines below, so this load appears to have no effect
+        mov     rdi, r12        ; (JSAMPROW *)
+        mov     eax, r13d               ; eax = output_col (a 32-bit write also clears the upper half of rax)
+
+        ; -- Even part
+
+        pxor      xmm4,xmm4
+        punpcklwd xmm4,xmm1             ; xmm4=tmp0
+        psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+        ; -- Odd part
+
+        punpckhwd xmm1,xmm0
+        punpckhwd xmm6,xmm3
+        movdqa    xmm5,xmm1
+        movdqa    xmm2,xmm6
+        pmaddwd   xmm1,[rel PW_F256_F089]       ; xmm1=(tmp2)
+        pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2)
+        pmaddwd   xmm5,[rel PW_F106_MF217]      ; xmm5=(tmp0)
+        pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0)
+
+        paddd     xmm6,xmm1             ; xmm6=tmp2
+        paddd     xmm2,xmm5             ; xmm2=tmp0
+
+        ; -- Even part
+
+        punpcklwd xmm0,xmm3
+        pmaddwd   xmm0,[rel PW_F184_MF076]      ; xmm0=tmp2
+
+        movdqa    xmm7,xmm4
+        paddd     xmm4,xmm0             ; xmm4=tmp10
+        psubd     xmm7,xmm0             ; xmm7=tmp12
+
+        ; -- Final output stage
+
+        movdqa  xmm1,[rel PD_DESCALE_P2_4]      ; xmm1=[rel PD_DESCALE_P2_4]
+
+        movdqa  xmm5,xmm4
+        movdqa  xmm3,xmm7
+        paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
+        paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
+        psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
+        psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
+
+        paddd   xmm4,xmm1
+        paddd   xmm7,xmm1
+        psrad   xmm4,DESCALE_P2_4
+        psrad   xmm7,DESCALE_P2_4
+        paddd   xmm5,xmm1
+        paddd   xmm3,xmm1
+        psrad   xmm5,DESCALE_P2_4
+        psrad   xmm3,DESCALE_P2_4
+
+        packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
+        packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
+
+        movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
+        punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
+        punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
+
+        movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
+        punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
+
+        packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+        paddb     xmm4,[rel PB_CENTERJSAMP]
+
+        pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+        pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+        pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+        movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4        ; row 0 <- (00 01 02 03)
+        movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2        ; row 1 <- (10 11 12 13)
+        mov     rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+        movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1        ; row 2 <- (20 21 22 23)
+        movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3        ; row 3 <- (30 31 32 33)
+
+        uncollect_args
+        mov     rsp,rbp         ; rsp <- aligned rbp
+        pop     rsp             ; rsp <- original rbp
+        pop     rbp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+        align   16
+        global  EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+        push    rbp
+        mov     rax,rsp                 ; rax = rsp after the push; presumably read by collect_args (confirm)
+        mov     rbp,rsp
+        collect_args
+        push    rbx                     ; ebx is written by pextrw before the stores below
+
+        ; ---- Pass 1: process columns from input.
+
+        mov     rdx, r10                ; quantptr
+        mov     rsi, r11                ; inptr
+
+        ; | input:                  | result:        |
+        ; | 00 01 ** 03 ** 05 ** 07 |                |
+        ; | 10 11 ** 13 ** 15 ** 17 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+        ; | 50 51 ** 53 ** 55 ** 57 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+        ; -- Odd part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+        ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+        pcmpeqd   xmm7,xmm7             ; xmm7=all ones
+        pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+        movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
+        movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
+        punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
+        punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
+        pmaddwd   xmm4,[rel PW_F362_MF127]
+        pmaddwd   xmm5,[rel PW_F085_MF072]
+
+        psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
+        pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+        psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
+        pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+        por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
+        por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
+        pmaddwd xmm0,[rel PW_F362_MF127]
+        pmaddwd xmm2,[rel PW_F085_MF072]
+
+        paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
+        paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
+
+        ; -- Even part
+
+        movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+        pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+        movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
+        pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
+        pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+        psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+        psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+        ; -- Final output stage
+
+        movdqa  xmm3,xmm6
+        movdqa  xmm5,xmm1
+        paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+        paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+        psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+        psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+        movdqa  xmm2,[rel PD_DESCALE_P1_2]      ; xmm2=[rel PD_DESCALE_P1_2]
+
+        punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
+
+        movdqa     xmm7,xmm1
+        punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
+        punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
+
+        paddd   xmm6,xmm2
+        psrad   xmm6,DESCALE_P1_2
+
+        paddd   xmm1,xmm2
+        paddd   xmm7,xmm2
+        psrad   xmm1,DESCALE_P1_2
+        psrad   xmm7,DESCALE_P1_2
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows, store into output array.
+
+        mov     rdi, r12        ; (JSAMPROW *)
+        mov     eax, r13d               ; eax = output_col (a 32-bit write also clears the upper half of rax)
+
+        ; | input:| result:|
+        ; | A0 B0 |        |
+        ; | A1 B1 | C0 C1  |
+        ; | A3 B3 | D0 D1  |
+        ; | A5 B5 |        |
+        ; | A7 B7 |        |
+
+        ; -- Odd part
+
+        packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+        packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+        pmaddwd   xmm1,[rel PW_F362_MF127]
+        pmaddwd   xmm7,[rel PW_F085_MF072]
+
+        paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
+
+        ; -- Even part
+
+        pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
+
+        ; -- Final output stage
+
+        movdqa    xmm4,xmm6
+        paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+        psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+        punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
+
+        paddd     xmm6,[rel PD_DESCALE_P2_2]
+        psrad     xmm6,DESCALE_P2_2
+
+        packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+        packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+        paddb     xmm6,[rel PB_CENTERJSAMP]
+
+        pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
+        pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
+
+        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+        mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+        mov     WORD [rdx+rax*SIZEOF_JSAMPLE], bx       ; row 0 <- (C0 D0)
+        mov     WORD [rsi+rax*SIZEOF_JSAMPLE], cx       ; row 1 <- (C1 D1)
+
+        pop     rbx
+        uncollect_args
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jidctred-sse2.asm b/simd/jidctred-sse2.asm
new file mode 100644
index 0000000..e48c0c5
--- /dev/null
+++ b/simd/jidctred-sse2.asm
@@ -0,0 +1,594 @@
+;
+; jidctred.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS      13              ; fraction bits of the F_x_xxx constants
+%define PASS1_BITS      2               ; extra precision kept after pass 1
+
+%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)       ; 4x4 pass 1 shift
+%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)     ; 4x4 pass 2 shift
+%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)       ; 2x2 pass 1 shift
+%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)     ; 2x2 pass 2 shift
+
+%if CONST_BITS == 13
+F_0_211 equ      1730           ; FIX(0.211164243)
+F_0_509 equ      4176           ; FIX(0.509795579)
+F_0_601 equ      4926           ; FIX(0.601344887)
+F_0_720 equ      5906           ; FIX(0.720959822)
+F_0_765 equ      6270           ; FIX(0.765366865)
+F_0_850 equ      6967           ; FIX(0.850430095)
+F_0_899 equ      7373           ; FIX(0.899976223)
+F_1_061 equ      8697           ; FIX(1.061594337)
+F_1_272 equ     10426           ; FIX(1.272758580)
+F_1_451 equ     11893           ; FIX(1.451774981)
+F_1_847 equ     15137           ; FIX(1.847759065)
+F_2_172 equ     17799           ; FIX(2.172734803)
+F_2_562 equ     20995           ; FIX(2.562915447)
+F_3_624 equ     29692           ; FIX(3.624509785)
+%else                           ; derive the constants from 2^30-scaled integers
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n)) ; rounded right shift by n
+F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
+F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
+F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
+F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
+F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
+F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
+F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
+F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
+F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
+F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
+%endif
+; NOTE(review): with CONST_BITS > 13, F_2_172 and up exceed a signed word (pmaddwd).
+; --------------------------------------------------------------------------
+        SECTION SEG_CONST
+
+        alignz  16
+        global  EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+; Each PW_* entry is one word pair repeated 4 times, used as a pmaddwd operand.
+PW_F184_MF076   times 4 dw  F_1_847,-F_0_765    ; (in2,in6): 4x4 even-part tmp2
+PW_F256_F089    times 4 dw  F_2_562, F_0_899    ; (in1,in3): 4x4 odd-part tmp2
+PW_F106_MF217   times 4 dw  F_1_061,-F_2_172    ; (in1,in3): 4x4 odd-part tmp0
+PW_MF060_MF050  times 4 dw -F_0_601,-F_0_509    ; (in5,in7): 4x4 odd-part tmp2
+PW_F145_MF021   times 4 dw  F_1_451,-F_0_211    ; (in5,in7): 4x4 odd-part tmp0
+PW_F362_MF127   times 4 dw  F_3_624,-F_1_272    ; (in1,in3): 2x2 odd-part tmp0
+PW_F085_MF072   times 4 dw  F_0_850,-F_0_720    ; (in5,in7): 2x2 odd-part tmp0
+PD_DESCALE_P1_4 times 4 dd  1 << (DESCALE_P1_4-1)       ; rounding term, 4x4 pass 1
+PD_DESCALE_P2_4 times 4 dd  1 << (DESCALE_P2_4-1)       ; rounding term, 4x4 pass 2
+PD_DESCALE_P1_2 times 4 dd  1 << (DESCALE_P1_2-1)       ; rounding term, 2x2 pass 1
+PD_DESCALE_P2_2 times 4 dd  1 << (DESCALE_P2_2-1)       ; rounding term, 2x2 pass 2
+PB_CENTERJSAMP  times 16 db CENTERJSAMPLE       ; added to every packed output byte
+
+        alignz  16
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; Stores 4 bytes at output_buf[r] + output_col*SIZEOF_JSAMPLE for r = 0..3.
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+; coef_block/dct_table are read via movdqa/pmullw: expected 16-byte aligned.
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+%define original_ebp    ebp+0           ; [ebp] holds the unaligned frame pointer
+%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM          2               ; wk(0)/wk(1) park tmp0L/tmp0H in pass 1
+
+        align   16
+        global  EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+        push    ebp
+        mov     eax,esp                         ; eax = original ebp
+        sub     esp, byte 4                     ; make room for the saved pointer
+        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+        mov     [esp],eax                       ; [original_ebp] = unaligned frame ptr
+        mov     ebp,esp                         ; ebp = aligned ebp
+        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below ebp
+        pushpic ebx                             ; macro from jsimdext.inc (not shown here)
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+
+;       mov     eax, [original_ebp]
+        mov     edx, POINTER [dct_table(eax)]           ; quantptr (eax = frame ptr from prologue)
+        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; quick scalar test on the
+        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] ; first dword of rows 1 and 2
+        jnz     short .columnDCT                        ; nonzero => full IDCT
+        ; OR together rows 1,2,3,5,6,7 (row 4 is never read by this routine).
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        por     xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        por     xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        por     xmm0,xmm1
+        packsswb xmm0,xmm0              ; saturating packs keep nonzero values nonzero
+        packsswb xmm0,xmm0              ; low dword now summarizes all 8 columns
+        movd    eax,xmm0
+        test    eax,eax
+        jnz     short .columnDCT        ; any AC term set => full IDCT
+
+        ; -- AC terms all zero
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0
+
+        psllw   xmm0,PASS1_BITS         ; same net scaling as the full pass-1 path
+
+        movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
+        punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
+        punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
+
+        pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+        pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+        pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+        pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+        jmp     near .column_end
+        alignx  16,7
+%endif
+.columnDCT:
+
+        ; -- Odd part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm0=in1
+        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm1=in3
+        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm2=in5
+        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; xmm3=in7
+
+        movdqa    xmm4,xmm0
+        movdqa    xmm5,xmm0
+        punpcklwd xmm4,xmm1             ; (in1,in3) word pairs, columns 0-3
+        punpckhwd xmm5,xmm1             ; (in1,in3) word pairs, columns 4-7
+        movdqa    xmm0,xmm4
+        movdqa    xmm1,xmm5
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]       ; xmm4=(tmp2L)
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]       ; xmm5=(tmp2H)
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm0=(tmp0L)
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm1=(tmp0H)
+
+        movdqa    xmm6,xmm2
+        movdqa    xmm7,xmm2
+        punpcklwd xmm6,xmm3             ; (in5,in7) word pairs, columns 0-3
+        punpckhwd xmm7,xmm3             ; (in5,in7) word pairs, columns 4-7
+        movdqa    xmm2,xmm6
+        movdqa    xmm3,xmm7
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm6=(tmp2L)
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm7=(tmp2H)
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm2=(tmp0L)
+        pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm3=(tmp0H)
+
+        paddd   xmm6,xmm4               ; xmm6=tmp2L
+        paddd   xmm7,xmm5               ; xmm7=tmp2H
+        paddd   xmm2,xmm0               ; xmm2=tmp0L
+        paddd   xmm3,xmm1               ; xmm3=tmp0H
+
+        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+        ; -- Even part
+
+        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        pxor      xmm1,xmm1             ; zeros supply the low word of each dword
+        pxor      xmm2,xmm2
+        punpcklwd xmm1,xmm4             ; xmm1=tmp0L
+        punpckhwd xmm2,xmm4             ; xmm2=tmp0H
+        psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+        psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+        movdqa    xmm3,xmm5             ; xmm5=in2=z2
+        punpcklwd xmm5,xmm0             ; xmm0=in6=z3
+        punpckhwd xmm3,xmm0
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm5=tmp2L
+        pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm3=tmp2H
+
+        movdqa  xmm4,xmm1
+        movdqa  xmm0,xmm2
+        paddd   xmm1,xmm5               ; xmm1=tmp10L
+        paddd   xmm2,xmm3               ; xmm2=tmp10H
+        psubd   xmm4,xmm5               ; xmm4=tmp12L
+        psubd   xmm0,xmm3               ; xmm0=tmp12H
+
+        ; -- Final output stage
+
+        movdqa  xmm5,xmm1
+        movdqa  xmm3,xmm2
+        paddd   xmm1,xmm6               ; xmm1=data0L
+        paddd   xmm2,xmm7               ; xmm2=data0H
+        psubd   xmm5,xmm6               ; xmm5=data3L
+        psubd   xmm3,xmm7               ; xmm3=data3H
+
+        movdqa  xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]      ; xmm6=[PD_DESCALE_P1_4]
+
+        paddd   xmm1,xmm6
+        paddd   xmm2,xmm6
+        psrad   xmm1,DESCALE_P1_4
+        psrad   xmm2,DESCALE_P1_4
+        paddd   xmm5,xmm6
+        paddd   xmm3,xmm6
+        psrad   xmm5,DESCALE_P1_4
+        psrad   xmm3,DESCALE_P1_4
+
+        packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
+        packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+        movdqa  xmm2,xmm4
+        movdqa  xmm3,xmm0
+        paddd   xmm4,xmm7               ; xmm4=data1L
+        paddd   xmm0,xmm6               ; xmm0=data1H
+        psubd   xmm2,xmm7               ; xmm2=data2L
+        psubd   xmm3,xmm6               ; xmm3=data2H
+
+        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]      ; xmm7=[PD_DESCALE_P1_4]
+
+        paddd   xmm4,xmm7
+        paddd   xmm0,xmm7
+        psrad   xmm4,DESCALE_P1_4
+        psrad   xmm0,DESCALE_P1_4
+        paddd   xmm2,xmm7
+        paddd   xmm3,xmm7
+        psrad   xmm2,DESCALE_P1_4
+        psrad   xmm3,DESCALE_P1_4
+
+        packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
+        packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+        movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
+        punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
+        punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
+        movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
+        punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
+        punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
+
+        movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
+        punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+        punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+        movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
+        punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+        punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows, store into output array.
+
+        mov     eax, [original_ebp]             ; eax = frame pointer saved in prologue
+        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(eax)]       ; eax = output_col
+
+        ; -- Even part
+
+        pxor      xmm4,xmm4
+        punpcklwd xmm4,xmm1             ; xmm4=tmp0
+        psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+        ; -- Odd part
+
+        punpckhwd xmm1,xmm0             ; xmm1=(col1,col3) word pairs, one per row
+        punpckhwd xmm6,xmm3             ; xmm6=(col5,col7) word pairs, one per row
+        movdqa    xmm5,xmm1
+        movdqa    xmm2,xmm6
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]       ; xmm1=(tmp2)
+        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm6=(tmp2)
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm5=(tmp0)
+        pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm2=(tmp0)
+
+        paddd     xmm6,xmm1             ; xmm6=tmp2
+        paddd     xmm2,xmm5             ; xmm2=tmp0
+
+        ; -- Even part
+
+        punpcklwd xmm0,xmm3             ; xmm0=(col2,col6) word pairs, one per row
+        pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm0=tmp2
+
+        movdqa    xmm7,xmm4
+        paddd     xmm4,xmm0             ; xmm4=tmp10
+        psubd     xmm7,xmm0             ; xmm7=tmp12
+
+        ; -- Final output stage
+
+        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]      ; xmm1=[PD_DESCALE_P2_4]
+
+        movdqa  xmm5,xmm4
+        movdqa  xmm3,xmm7
+        paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
+        paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
+        psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
+        psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
+
+        paddd   xmm4,xmm1
+        paddd   xmm7,xmm1
+        psrad   xmm4,DESCALE_P2_4
+        psrad   xmm7,DESCALE_P2_4
+        paddd   xmm5,xmm1
+        paddd   xmm3,xmm1
+        psrad   xmm5,DESCALE_P2_4
+        psrad   xmm3,DESCALE_P2_4
+
+        packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
+        packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
+
+        movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
+        punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
+        punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
+
+        movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
+        punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
+        punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
+
+        packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+        paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; += CENTERJSAMPLE per byte
+
+        pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+        pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+        pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        movd    XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4        ; row 0 <- (00 01 02 03)
+        movd    XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2        ; row 1 <- (10 11 12 13)
+        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+        movd    XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1        ; row 2 <- (20 21 22 23)
+        movd    XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3        ; row 3 <- (30 31 32 33)
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+        poppic  ebx
+        mov     esp,ebp         ; esp <- aligned ebp
+        pop     esp             ; esp <- original ebp
+        pop     ebp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Stores 2 bytes at output_buf[r] + output_col*SIZEOF_JSAMPLE for r = 0..1.
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+; Only coefficient rows 0,1,3,5,7 are loaded; no stack workspace is used.
+
+%define dct_table(b)    (b)+8           ; void *dct_table
+%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
+%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
+%define output_col(b)   (b)+20          ; JDIMENSION output_col
+
+        align   16
+        global  EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+        push    ebp
+        mov     ebp,esp         ; plain frame: nothing is spilled, so no realignment
+        push    ebx             ; always saved: ebx is reused as a data register below
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        get_GOT ebx             ; get GOT address
+
+        ; ---- Pass 1: process columns from input.
+
+        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
+        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr
+
+        ; | input:                  | result:        |
+        ; | 00 01 ** 03 ** 05 ** 07 |                |
+        ; | 10 11 ** 13 ** 15 ** 17 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+        ; | 50 51 ** 53 ** 55 ** 57 |                |
+        ; | ** ** ** ** ** ** ** ** |                |
+        ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+        ; -- Odd part
+
+        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+        ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+        pcmpeqd   xmm7,xmm7
+        pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+        movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
+        movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
+        punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
+        punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
+        pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]      ; row1*F_3_624 - row3*F_1_272
+        pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]      ; row5*F_0_850 - row7*F_0_720
+
+        psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
+        pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+        psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
+        pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+        por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
+        por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
+        pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]        ; same products, odd columns
+        pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]        ; same products, odd columns
+
+        paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
+        paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
+
+        ; -- Even part
+
+        movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+        pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+        ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+        movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
+        pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
+        pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+        psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+        psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+        ; -- Final output stage
+
+        movdqa  xmm3,xmm6
+        movdqa  xmm5,xmm1
+        paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+        paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+        psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+        psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]      ; xmm2=[PD_DESCALE_P1_2]
+
+        punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
+
+        movdqa     xmm7,xmm1
+        punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
+        punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
+
+        paddd   xmm6,xmm2
+        psrad   xmm6,DESCALE_P1_2
+
+        paddd   xmm1,xmm2
+        paddd   xmm7,xmm2
+        psrad   xmm1,DESCALE_P1_2
+        psrad   xmm7,DESCALE_P1_2
+
+        ; -- Prefetch the next coefficient block
+
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+        ; ---- Pass 2: process rows, store into output array.
+
+        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
+        mov     eax, JDIMENSION [output_col(ebp)]
+
+        ; | input:| result:|
+        ; | A0 B0 |        |
+        ; | A1 B1 | C0 C1  |
+        ; | A3 B3 | D0 D1  |
+        ; | A5 B5 |        |
+        ; | A7 B7 |        |
+
+        ; -- Odd part
+
+        packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+        packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+        pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]
+        pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]
+
+        paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
+
+        ; -- Even part
+
+        pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
+
+        ; -- Final output stage
+
+        movdqa    xmm4,xmm6
+        paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+        psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+        punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
+
+        paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
+        psrad     xmm6,DESCALE_P2_2
+
+        packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+        packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+        paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
+        ; ebx (GOT base) may be overwritten from here on: no GOTOFF operands follow.
+        pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
+        pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
+
+        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx       ; output row 0 <- bytes (C0 D0)
+        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx       ; output row 1 <- bytes (C1 D1)
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jimmxfst.asm b/simd/jimmxfst.asm
deleted file mode 100644
index 1b535e1..0000000
--- a/simd/jimmxfst.asm
+++ /dev/null
@@ -1,500 +0,0 @@
-;
-; jimmxfst.asm - fast integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-%define PASS1_BITS	2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082	equ	277		; FIX(1.082392200)
-F_1_414	equ	362		; FIX(1.414213562)
-F_1_847	equ	473		; FIX(1.847759065)
-F_2_613	equ	669		; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
-F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_idct_ifast_mmx) PRIVATE
-
-EXTN(jconst_idct_ifast_mmx):
-
-PW_F1414	times 4 dw  F_1_414 << CONST_SHIFT
-PW_F1847	times 4 dw  F_1_847 << CONST_SHIFT
-PW_MF1613	times 4 dw -F_1_613 << CONST_SHIFT
-PW_F1082	times 4 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
-					; JCOEF workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_ifast_mmx) PRIVATE
-
-EXTN(jsimd_idct_ifast_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; JCOEF * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm1,mm0
-	packsswb mm1,mm1
-	movd	eax,mm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
-	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
-	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0		; mm0=(00 00 00 00)
-	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
-	movq      mm3,mm2
-	punpckldq mm2,mm2		; mm2=(02 02 02 02)
-	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	psubw	mm0,mm2			; mm0=tmp11
-	psubw	mm1,mm3
-	paddw	mm4,mm2			; mm4=tmp10
-	paddw	mm5,mm3			; mm5=tmp13
-
-	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
-	psubw	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	psubw	mm4,mm5			; mm4=tmp3
-	psubw	mm0,mm1			; mm0=tmp2
-	paddw	mm6,mm5			; mm6=tmp0
-	paddw	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
-	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	psubw	mm2,mm1			; mm2=z12
-	psubw	mm5,mm3			; mm5=z10
-	paddw	mm4,mm1			; mm4=z11
-	paddw	mm0,mm3			; mm0=z13
-
-	movq	mm1,mm5			; mm1=z10(unscaled)
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
-
-	movq	mm3,mm4
-	psubw	mm4,mm0
-	paddw	mm3,mm0			; mm3=tmp7
-
-	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movq	mm0,mm5
-	paddw	mm5,mm2
-	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
-	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
-	psubw	mm0,mm1
-	psubw	mm2,mm5			; mm2=tmp10
-	paddw	mm0,mm5			; mm0=tmp12
-
-	; -- Final output stage
-
-	psubw	mm0,mm3			; mm0=tmp6
-	movq	mm1,mm6
-	movq	mm5,mm7
-	paddw	mm6,mm3			; mm6=data0=(00 01 02 03)
-	paddw	mm7,mm0			; mm7=data1=(10 11 12 13)
-	psubw	mm1,mm3			; mm1=data7=(70 71 72 73)
-	psubw	mm5,mm0			; mm5=data6=(60 61 62 63)
-	psubw	mm4,mm0			; mm4=tmp5
-
-	movq      mm3,mm6		; transpose coefficients(phase 1)
-	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
-	punpckhwd mm3,mm7		; mm3=(02 12 03 13)
-	movq      mm0,mm5		; transpose coefficients(phase 1)
-	punpcklwd mm5,mm1		; mm5=(60 70 61 71)
-	punpckhwd mm0,mm1		; mm0=(62 72 63 73)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
-	movq	mm1, MMWORD [wk(1)]	; mm1=tmp3
-
-	movq	MMWORD [wk(0)], mm5	; wk(0)=(60 70 61 71)
-	movq	MMWORD [wk(1)], mm0	; wk(1)=(62 72 63 73)
-
-	paddw	mm2,mm4			; mm2=tmp4
-	movq	mm5,mm7
-	movq	mm0,mm1
-	paddw	mm7,mm4			; mm7=data2=(20 21 22 23)
-	paddw	mm1,mm2			; mm1=data4=(40 41 42 43)
-	psubw	mm5,mm4			; mm5=data5=(50 51 52 53)
-	psubw	mm0,mm2			; mm0=data3=(30 31 32 33)
-
-	movq      mm4,mm7		; transpose coefficients(phase 1)
-	punpcklwd mm7,mm0		; mm7=(20 30 21 31)
-	punpckhwd mm4,mm0		; mm4=(22 32 23 33)
-	movq      mm2,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm5		; mm1=(40 50 41 51)
-	punpckhwd mm2,mm5		; mm2=(42 52 43 53)
-
-	movq      mm0,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm7		; mm6=(00 10 20 30)
-	punpckhdq mm0,mm7		; mm0=(01 11 21 31)
-	movq      mm5,mm3		; transpose coefficients(phase 2)
-	punpckldq mm3,mm4		; mm3=(02 12 22 32)
-	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=(60 70 61 71)
-	movq	mm4, MMWORD [wk(1)]	; mm4=(62 72 63 73)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-	movq      mm6,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm7		; mm1=(40 50 60 70)
-	punpckhdq mm6,mm7		; mm6=(41 51 61 71)
-	movq      mm0,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm4		; mm2=(42 52 62 72)
-	punpckhdq mm0,mm4		; mm0=(43 53 63 73)
-
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_IFAST_MULT_TYPE	; quantptr
-	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; JCOEF * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	movq	mm4,mm0
-	movq	mm5,mm1
-	psubw	mm0,mm2			; mm0=tmp11
-	psubw	mm1,mm3
-	paddw	mm4,mm2			; mm4=tmp10
-	paddw	mm5,mm3			; mm5=tmp13
-
-	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
-	psubw	mm1,mm5			; mm1=tmp12
-
-	movq	mm6,mm4
-	movq	mm7,mm0
-	psubw	mm4,mm5			; mm4=tmp3
-	psubw	mm0,mm1			; mm0=tmp2
-	paddw	mm6,mm5			; mm6=tmp0
-	paddw	mm7,mm1			; mm7=tmp1
-
-	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
-	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	movq	mm4,mm2
-	movq	mm0,mm5
-	psubw	mm2,mm1			; mm2=z12
-	psubw	mm5,mm3			; mm5=z10
-	paddw	mm4,mm1			; mm4=z11
-	paddw	mm0,mm3			; mm0=z13
-
-	movq	mm1,mm5			; mm1=z10(unscaled)
-	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
-
-	movq	mm3,mm4
-	psubw	mm4,mm0
-	paddw	mm3,mm0			; mm3=tmp7
-
-	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movq	mm0,mm5
-	paddw	mm5,mm2
-	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
-	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
-	psubw	mm0,mm1
-	psubw	mm2,mm5			; mm2=tmp10
-	paddw	mm0,mm5			; mm0=tmp12
-
-	; -- Final output stage
-
-	psubw	mm0,mm3			; mm0=tmp6
-	movq	mm1,mm6
-	movq	mm5,mm7
-	paddw	mm6,mm3			; mm6=data0=(00 10 20 30)
-	paddw	mm7,mm0			; mm7=data1=(01 11 21 31)
-	psraw	mm6,(PASS1_BITS+3)	; descale
-	psraw	mm7,(PASS1_BITS+3)	; descale
-	psubw	mm1,mm3			; mm1=data7=(07 17 27 37)
-	psubw	mm5,mm0			; mm5=data6=(06 16 26 36)
-	psraw	mm1,(PASS1_BITS+3)	; descale
-	psraw	mm5,(PASS1_BITS+3)	; descale
-	psubw	mm4,mm0			; mm4=tmp5
-
-	packsswb  mm6,mm5		; mm6=(00 10 20 30 06 16 26 36)
-	packsswb  mm7,mm1		; mm7=(01 11 21 31 07 17 27 37)
-
-	movq	mm3, MMWORD [wk(0)]	; mm3=tmp2
-	movq	mm0, MMWORD [wk(1)]	; mm0=tmp3
-
-	paddw	mm2,mm4			; mm2=tmp4
-	movq	mm5,mm3
-	movq	mm1,mm0
-	paddw	mm3,mm4			; mm3=data2=(02 12 22 32)
-	paddw	mm0,mm2			; mm0=data4=(04 14 24 34)
-	psraw	mm3,(PASS1_BITS+3)	; descale
-	psraw	mm0,(PASS1_BITS+3)	; descale
-	psubw	mm5,mm4			; mm5=data5=(05 15 25 35)
-	psubw	mm1,mm2			; mm1=data3=(03 13 23 33)
-	psraw	mm5,(PASS1_BITS+3)	; descale
-	psraw	mm1,(PASS1_BITS+3)	; descale
-
-	movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm4=[PB_CENTERJSAMP]
-
-	packsswb  mm3,mm0		; mm3=(02 12 22 32 04 14 24 34)
-	packsswb  mm1,mm5		; mm1=(03 13 23 33 05 15 25 35)
-
-	paddb     mm6,mm4
-	paddb     mm7,mm4
-	paddb     mm3,mm4
-	paddb     mm1,mm4
-
-	movq      mm2,mm6		; transpose coefficients(phase 1)
-	punpcklbw mm6,mm7		; mm6=(00 01 10 11 20 21 30 31)
-	punpckhbw mm2,mm7		; mm2=(06 07 16 17 26 27 36 37)
-	movq      mm0,mm3		; transpose coefficients(phase 1)
-	punpcklbw mm3,mm1		; mm3=(02 03 12 13 22 23 32 33)
-	punpckhbw mm0,mm1		; mm0=(04 05 14 15 24 25 34 35)
-
-	movq      mm5,mm6		; transpose coefficients(phase 2)
-	punpcklwd mm6,mm3		; mm6=(00 01 02 03 10 11 12 13)
-	punpckhwd mm5,mm3		; mm5=(20 21 22 23 30 31 32 33)
-	movq      mm4,mm0		; transpose coefficients(phase 2)
-	punpcklwd mm0,mm2		; mm0=(04 05 06 07 14 15 16 17)
-	punpckhwd mm4,mm2		; mm4=(24 25 26 27 34 35 36 37)
-
-	movq      mm7,mm6		; transpose coefficients(phase 3)
-	punpckldq mm6,mm0		; mm6=(00 01 02 03 04 05 06 07)
-	punpckhdq mm7,mm0		; mm7=(10 11 12 13 14 15 16 17)
-	movq      mm1,mm5		; transpose coefficients(phase 3)
-	punpckldq mm5,mm4		; mm5=(20 21 22 23 24 25 26 27)
-	punpckhdq mm1,mm4		; mm1=(30 31 32 33 34 35 36 37)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jimmxint.asm b/simd/jimmxint.asm
deleted file mode 100644
index 2b84f62..0000000
--- a/simd/jimmxint.asm
+++ /dev/null
@@ -1,852 +0,0 @@
-;
-; jimmxint.asm - accurate integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_islow_mmx) PRIVATE
-
-EXTN(jconst_idct_islow_mmx):
-
-PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		12
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
-					; JCOEF workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_islow_mmx) PRIVATE
-
-EXTN(jsimd_idct_islow_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; JCOEF * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm1,mm0
-	packsswb mm1,mm1
-	movd	eax,mm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	mm0,PASS1_BITS
-
-	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
-	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
-	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0		; mm0=(00 00 00 00)
-	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
-	movq      mm3,mm2
-	punpckldq mm2,mm2		; mm2=(02 02 02 02)
-	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movq      mm4,mm1		; mm1=in2=z2
-	movq      mm5,mm1
-	punpcklwd mm4,mm3		; mm3=in6=z3
-	punpckhwd mm5,mm3
-	movq      mm1,mm4
-	movq      mm3,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
-
-	movq      mm6,mm0
-	paddw     mm0,mm2		; mm0=in0+in4
-	psubw     mm6,mm2		; mm6=in0-in4
-
-	pxor      mm7,mm7
-	pxor      mm2,mm2
-	punpcklwd mm7,mm0		; mm7=tmp0L
-	punpckhwd mm2,mm0		; mm2=tmp0H
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
-
-	movq	mm0,mm7
-	paddd	mm7,mm4			; mm7=tmp10L
-	psubd	mm0,mm4			; mm0=tmp13L
-	movq	mm4,mm2
-	paddd	mm2,mm5			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp13H
-
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
-	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
-	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
-	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
-
-	pxor      mm5,mm5
-	pxor      mm7,mm7
-	punpcklwd mm5,mm6		; mm5=tmp1L
-	punpckhwd mm7,mm6		; mm7=tmp1H
-	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-
-	movq	mm2,mm5
-	paddd	mm5,mm1			; mm5=tmp11L
-	psubd	mm2,mm1			; mm2=tmp12L
-	movq	mm0,mm7
-	paddd	mm7,mm3			; mm7=tmp11H
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
-	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
-	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
-	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movq	mm5,mm6
-	movq	mm7,mm4
-	paddw	mm5,mm3			; mm5=z3
-	paddw	mm7,mm1			; mm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm2,mm5
-	movq      mm0,mm5
-	punpcklwd mm2,mm7
-	punpckhwd mm0,mm7
-	movq      mm5,mm2
-	movq      mm7,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movq      mm2,mm3
-	movq      mm0,mm3
-	punpcklwd mm2,mm4
-	punpckhwd mm0,mm4
-	movq      mm3,mm2
-	movq      mm4,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
-
-	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
-	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
-	paddd	mm3,mm5			; mm3=tmp3L
-	paddd	mm4,mm7			; mm4=tmp3H
-
-	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
-	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
-
-	movq      mm2,mm1
-	movq      mm0,mm1
-	punpcklwd mm2,mm6
-	punpckhwd mm0,mm6
-	movq      mm1,mm2
-	movq      mm6,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
-
-	paddd	mm2,mm5			; mm2=tmp1L
-	paddd	mm0,mm7			; mm0=tmp1H
-	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
-	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
-
-	movq	mm2,mm5
-	movq	mm0,mm7
-	paddd	mm5,mm3			; mm5=data0L
-	paddd	mm7,mm4			; mm7=data0H
-	psubd	mm2,mm3			; mm2=data7L
-	psubd	mm0,mm4			; mm0=data7H
-
-	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm3=[PD_DESCALE_P1]
-
-	paddd	mm5,mm3
-	paddd	mm7,mm3
-	psrad	mm5,DESCALE_P1
-	psrad	mm7,DESCALE_P1
-	paddd	mm2,mm3
-	paddd	mm0,mm3
-	psrad	mm2,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-
-	packssdw  mm5,mm7		; mm5=data0=(00 01 02 03)
-	packssdw  mm2,mm0		; mm2=data7=(70 71 72 73)
-
-	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
-	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
-
-	movq	mm7,mm4
-	movq	mm0,mm3
-	paddd	mm4,mm1			; mm4=data1L
-	paddd	mm3,mm6			; mm3=data1H
-	psubd	mm7,mm1			; mm7=data6L
-	psubd	mm0,mm6			; mm0=data6H
-
-	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm1=[PD_DESCALE_P1]
-
-	paddd	mm4,mm1
-	paddd	mm3,mm1
-	psrad	mm4,DESCALE_P1
-	psrad	mm3,DESCALE_P1
-	paddd	mm7,mm1
-	paddd	mm0,mm1
-	psrad	mm7,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-
-	packssdw  mm4,mm3		; mm4=data1=(10 11 12 13)
-	packssdw  mm7,mm0		; mm7=data6=(60 61 62 63)
-
-	movq      mm6,mm5		; transpose coefficients(phase 1)
-	punpcklwd mm5,mm4		; mm5=(00 10 01 11)
-	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
-	movq      mm1,mm7		; transpose coefficients(phase 1)
-	punpcklwd mm7,mm2		; mm7=(60 70 61 71)
-	punpckhwd mm1,mm2		; mm1=(62 72 63 73)
-
-	movq	mm3, MMWORD [wk(6)]	; mm3=tmp12L
-	movq	mm0, MMWORD [wk(7)]	; mm0=tmp12H
-	movq	mm4, MMWORD [wk(10)]	; mm4=tmp1L
-	movq	mm2, MMWORD [wk(11)]	; mm2=tmp1H
-
-	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 01 11)
-	movq	MMWORD [wk(1)], mm6	; wk(1)=(02 12 03 13)
-	movq	MMWORD [wk(4)], mm7	; wk(4)=(60 70 61 71)
-	movq	MMWORD [wk(5)], mm1	; wk(5)=(62 72 63 73)
-
-	movq	mm5,mm3
-	movq	mm6,mm0
-	paddd	mm3,mm4			; mm3=data2L
-	paddd	mm0,mm2			; mm0=data2H
-	psubd	mm5,mm4			; mm5=data5L
-	psubd	mm6,mm2			; mm6=data5H
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm7=[PD_DESCALE_P1]
-
-	paddd	mm3,mm7
-	paddd	mm0,mm7
-	psrad	mm3,DESCALE_P1
-	psrad	mm0,DESCALE_P1
-	paddd	mm5,mm7
-	paddd	mm6,mm7
-	psrad	mm5,DESCALE_P1
-	psrad	mm6,DESCALE_P1
-
-	packssdw  mm3,mm0		; mm3=data2=(20 21 22 23)
-	packssdw  mm5,mm6		; mm5=data5=(50 51 52 53)
-
-	movq	mm1, MMWORD [wk(2)]	; mm1=tmp13L
-	movq	mm4, MMWORD [wk(3)]	; mm4=tmp13H
-	movq	mm2, MMWORD [wk(8)]	; mm2=tmp0L
-	movq	mm7, MMWORD [wk(9)]	; mm7=tmp0H
-
-	movq	mm0,mm1
-	movq	mm6,mm4
-	paddd	mm1,mm2			; mm1=data3L
-	paddd	mm4,mm7			; mm4=data3H
-	psubd	mm0,mm2			; mm0=data4L
-	psubd	mm6,mm7			; mm6=data4H
-
-	movq	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm2=[PD_DESCALE_P1]
-
-	paddd	mm1,mm2
-	paddd	mm4,mm2
-	psrad	mm1,DESCALE_P1
-	psrad	mm4,DESCALE_P1
-	paddd	mm0,mm2
-	paddd	mm6,mm2
-	psrad	mm0,DESCALE_P1
-	psrad	mm6,DESCALE_P1
-
-	packssdw  mm1,mm4		; mm1=data3=(30 31 32 33)
-	packssdw  mm0,mm6		; mm0=data4=(40 41 42 43)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=(00 10 01 11)
-	movq	mm2, MMWORD [wk(1)]	; mm2=(02 12 03 13)
-
-	movq      mm4,mm3		; transpose coefficients(phase 1)
-	punpcklwd mm3,mm1		; mm3=(20 30 21 31)
-	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
-	movq      mm6,mm0		; transpose coefficients(phase 1)
-	punpcklwd mm0,mm5		; mm0=(40 50 41 51)
-	punpckhwd mm6,mm5		; mm6=(42 52 43 53)
-
-	movq      mm1,mm7		; transpose coefficients(phase 2)
-	punpckldq mm7,mm3		; mm7=(00 10 20 30)
-	punpckhdq mm1,mm3		; mm1=(01 11 21 31)
-	movq      mm5,mm2		; transpose coefficients(phase 2)
-	punpckldq mm2,mm4		; mm2=(02 12 22 32)
-	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
-
-	movq	mm3, MMWORD [wk(4)]	; mm3=(60 70 61 71)
-	movq	mm4, MMWORD [wk(5)]	; mm4=(62 72 63 73)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-	movq      mm7,mm0		; transpose coefficients(phase 2)
-	punpckldq mm0,mm3		; mm0=(40 50 60 70)
-	punpckhdq mm7,mm3		; mm7=(41 51 61 71)
-	movq      mm1,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm4		; mm6=(42 52 62 72)
-	punpckhdq mm1,mm4		; mm1=(43 53 63 73)
-
-	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
-	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
-	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; JCOEF * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movq      mm4,mm1		; mm1=in2=z2
-	movq      mm5,mm1
-	punpcklwd mm4,mm3		; mm3=in6=z3
-	punpckhwd mm5,mm3
-	movq      mm1,mm4
-	movq      mm3,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
-
-	movq      mm6,mm0
-	paddw     mm0,mm2		; mm0=in0+in4
-	psubw     mm6,mm2		; mm6=in0-in4
-
-	pxor      mm7,mm7
-	pxor      mm2,mm2
-	punpcklwd mm7,mm0		; mm7=tmp0L
-	punpckhwd mm2,mm0		; mm2=tmp0H
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
-
-	movq	mm0,mm7
-	paddd	mm7,mm4			; mm7=tmp10L
-	psubd	mm0,mm4			; mm0=tmp13L
-	movq	mm4,mm2
-	paddd	mm2,mm5			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp13H
-
-	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
-	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
-	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
-	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
-
-	pxor      mm5,mm5
-	pxor      mm7,mm7
-	punpcklwd mm5,mm6		; mm5=tmp1L
-	punpckhwd mm7,mm6		; mm7=tmp1H
-	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
-	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
-
-	movq	mm2,mm5
-	paddd	mm5,mm1			; mm5=tmp11L
-	psubd	mm2,mm1			; mm2=tmp12L
-	movq	mm0,mm7
-	paddd	mm7,mm3			; mm7=tmp11H
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
-	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
-	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
-	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	movq	mm5,mm6
-	movq	mm7,mm4
-	paddw	mm5,mm3			; mm5=z3
-	paddw	mm7,mm1			; mm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movq      mm2,mm5
-	movq      mm0,mm5
-	punpcklwd mm2,mm7
-	punpckhwd mm0,mm7
-	movq      mm5,mm2
-	movq      mm7,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
-	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movq      mm2,mm3
-	movq      mm0,mm3
-	punpcklwd mm2,mm4
-	punpckhwd mm0,mm4
-	movq      mm3,mm2
-	movq      mm4,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
-	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
-	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
-
-	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
-	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
-	paddd	mm3,mm5			; mm3=tmp3L
-	paddd	mm4,mm7			; mm4=tmp3H
-
-	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
-	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
-
-	movq      mm2,mm1
-	movq      mm0,mm1
-	punpcklwd mm2,mm6
-	punpckhwd mm0,mm6
-	movq      mm1,mm2
-	movq      mm6,mm0
-	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
-	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
-	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
-
-	paddd	mm2,mm5			; mm2=tmp1L
-	paddd	mm0,mm7			; mm0=tmp1H
-	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
-	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
-
-	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
-	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
-	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
-
-	movq	mm2,mm5
-	movq	mm0,mm7
-	paddd	mm5,mm3			; mm5=data0L
-	paddd	mm7,mm4			; mm7=data0H
-	psubd	mm2,mm3			; mm2=data7L
-	psubd	mm0,mm4			; mm0=data7H
-
-	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm3=[PD_DESCALE_P2]
-
-	paddd	mm5,mm3
-	paddd	mm7,mm3
-	psrad	mm5,DESCALE_P2
-	psrad	mm7,DESCALE_P2
-	paddd	mm2,mm3
-	paddd	mm0,mm3
-	psrad	mm2,DESCALE_P2
-	psrad	mm0,DESCALE_P2
-
-	packssdw  mm5,mm7		; mm5=data0=(00 10 20 30)
-	packssdw  mm2,mm0		; mm2=data7=(07 17 27 37)
-
-	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
-	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
-
-	movq	mm7,mm4
-	movq	mm0,mm3
-	paddd	mm4,mm1			; mm4=data1L
-	paddd	mm3,mm6			; mm3=data1H
-	psubd	mm7,mm1			; mm7=data6L
-	psubd	mm0,mm6			; mm0=data6H
-
-	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm1=[PD_DESCALE_P2]
-
-	paddd	mm4,mm1
-	paddd	mm3,mm1
-	psrad	mm4,DESCALE_P2
-	psrad	mm3,DESCALE_P2
-	paddd	mm7,mm1
-	paddd	mm0,mm1
-	psrad	mm7,DESCALE_P2
-	psrad	mm0,DESCALE_P2
-
-	packssdw  mm4,mm3		; mm4=data1=(01 11 21 31)
-	packssdw  mm7,mm0		; mm7=data6=(06 16 26 36)
-
-	packsswb  mm5,mm7		; mm5=(00 10 20 30 06 16 26 36)
-	packsswb  mm4,mm2		; mm4=(01 11 21 31 07 17 27 37)
-
-	movq	mm6, MMWORD [wk(6)]	; mm6=tmp12L
-	movq	mm1, MMWORD [wk(7)]	; mm1=tmp12H
-	movq	mm3, MMWORD [wk(10)]	; mm3=tmp1L
-	movq	mm0, MMWORD [wk(11)]	; mm0=tmp1H
-
-	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 20 30 06 16 26 36)
-	movq	MMWORD [wk(1)], mm4	; wk(1)=(01 11 21 31 07 17 27 37)
-
-	movq	mm7,mm6
-	movq	mm2,mm1
-	paddd	mm6,mm3			; mm6=data2L
-	paddd	mm1,mm0			; mm1=data2H
-	psubd	mm7,mm3			; mm7=data5L
-	psubd	mm2,mm0			; mm2=data5H
-
-	movq	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm5=[PD_DESCALE_P2]
-
-	paddd	mm6,mm5
-	paddd	mm1,mm5
-	psrad	mm6,DESCALE_P2
-	psrad	mm1,DESCALE_P2
-	paddd	mm7,mm5
-	paddd	mm2,mm5
-	psrad	mm7,DESCALE_P2
-	psrad	mm2,DESCALE_P2
-
-	packssdw  mm6,mm1		; mm6=data2=(02 12 22 32)
-	packssdw  mm7,mm2		; mm7=data5=(05 15 25 35)
-
-	movq	mm4, MMWORD [wk(2)]	; mm4=tmp13L
-	movq	mm3, MMWORD [wk(3)]	; mm3=tmp13H
-	movq	mm0, MMWORD [wk(8)]	; mm0=tmp0L
-	movq	mm5, MMWORD [wk(9)]	; mm5=tmp0H
-
-	movq	mm1,mm4
-	movq	mm2,mm3
-	paddd	mm4,mm0			; mm4=data3L
-	paddd	mm3,mm5			; mm3=data3H
-	psubd	mm1,mm0			; mm1=data4L
-	psubd	mm2,mm5			; mm2=data4H
-
-	movq	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm0=[PD_DESCALE_P2]
-
-	paddd	mm4,mm0
-	paddd	mm3,mm0
-	psrad	mm4,DESCALE_P2
-	psrad	mm3,DESCALE_P2
-	paddd	mm1,mm0
-	paddd	mm2,mm0
-	psrad	mm1,DESCALE_P2
-	psrad	mm2,DESCALE_P2
-
-	movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm5=[PB_CENTERJSAMP]
-
-	packssdw  mm4,mm3		; mm4=data3=(03 13 23 33)
-	packssdw  mm1,mm2		; mm1=data4=(04 14 24 34)
-
-	movq      mm0, MMWORD [wk(0)]	; mm0=(00 10 20 30 06 16 26 36)
-	movq      mm3, MMWORD [wk(1)]	; mm3=(01 11 21 31 07 17 27 37)
-
-	packsswb  mm6,mm1		; mm6=(02 12 22 32 04 14 24 34)
-	packsswb  mm4,mm7		; mm4=(03 13 23 33 05 15 25 35)
-
-	paddb     mm0,mm5
-	paddb     mm3,mm5
-	paddb     mm6,mm5
-	paddb     mm4,mm5
-
-	movq      mm2,mm0		; transpose coefficients(phase 1)
-	punpcklbw mm0,mm3		; mm0=(00 01 10 11 20 21 30 31)
-	punpckhbw mm2,mm3		; mm2=(06 07 16 17 26 27 36 37)
-	movq      mm1,mm6		; transpose coefficients(phase 1)
-	punpcklbw mm6,mm4		; mm6=(02 03 12 13 22 23 32 33)
-	punpckhbw mm1,mm4		; mm1=(04 05 14 15 24 25 34 35)
-
-	movq      mm7,mm0		; transpose coefficients(phase 2)
-	punpcklwd mm0,mm6		; mm0=(00 01 02 03 10 11 12 13)
-	punpckhwd mm7,mm6		; mm7=(20 21 22 23 30 31 32 33)
-	movq      mm5,mm1		; transpose coefficients(phase 2)
-	punpcklwd mm1,mm2		; mm1=(04 05 06 07 14 15 16 17)
-	punpckhwd mm5,mm2		; mm5=(24 25 26 27 34 35 36 37)
-
-	movq      mm3,mm0		; transpose coefficients(phase 3)
-	punpckldq mm0,mm1		; mm0=(00 01 02 03 04 05 06 07)
-	punpckhdq mm3,mm1		; mm3=(10 11 12 13 14 15 16 17)
-	movq      mm4,mm7		; transpose coefficients(phase 3)
-	punpckldq mm7,mm5		; mm7=(20 21 22 23 24 25 26 27)
-	punpckhdq mm4,mm5		; mm4=(30 31 32 33 34 35 36 37)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jimmxred.asm b/simd/jimmxred.asm
deleted file mode 100644
index f8e61d1..0000000
--- a/simd/jimmxred.asm
+++ /dev/null
@@ -1,706 +0,0 @@
-;
-; jimmxred.asm - reduced-size IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211	equ	 1730		; FIX(0.211164243)
-F_0_509	equ	 4176		; FIX(0.509795579)
-F_0_601	equ	 4926		; FIX(0.601344887)
-F_0_720	equ	 5906		; FIX(0.720959822)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_850	equ	 6967		; FIX(0.850430095)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_061	equ	 8697		; FIX(1.061594337)
-F_1_272	equ	10426		; FIX(1.272758580)
-F_1_451	equ	11893		; FIX(1.451774981)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_2_172	equ	17799		; FIX(2.172734803)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_624	equ	29692		; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
-F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
-F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
-F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
-F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
-F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_red_mmx) PRIVATE
-
-EXTN(jconst_idct_red_mmx):
-
-PW_F184_MF076	times 2 dw  F_1_847,-F_0_765
-PW_F256_F089	times 2 dw  F_2_562, F_0_899
-PW_F106_MF217	times 2 dw  F_1_061,-F_2_172
-PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509
-PW_F145_MF021	times 2 dw  F_1_451,-F_0_211
-PW_F362_MF127	times 2 dw  F_3_624,-F_1_272
-PW_F085_MF072	times 2 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
-					; JCOEF workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_4x4_mmx) PRIVATE
-
-EXTN(jsimd_idct_4x4_mmx):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	pushpic	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; JCOEF * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm0,mm1
-	packsswb mm0,mm0
-	movd	eax,mm0
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	mm0,PASS1_BITS
-
-	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
-	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
-	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
-
-	movq      mm1,mm0
-	punpckldq mm0,mm0		; mm0=(00 00 00 00)
-	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
-	movq      mm3,mm2
-	punpckldq mm2,mm2		; mm2=(02 02 02 02)
-	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movq      mm4,mm0
-	movq      mm5,mm0
-	punpcklwd mm4,mm1
-	punpckhwd mm5,mm1
-	movq      mm0,mm4
-	movq      mm1,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
-
-	movq      mm6,mm2
-	movq      mm7,mm2
-	punpcklwd mm6,mm3
-	punpckhwd mm7,mm3
-	movq      mm2,mm6
-	movq      mm3,mm7
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
-	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
-
-	paddd	mm6,mm4			; mm6=tmp2L
-	paddd	mm7,mm5			; mm7=tmp2H
-	paddd	mm2,mm0			; mm2=tmp0L
-	paddd	mm3,mm1			; mm3=tmp0H
-
-	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
-	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	pxor      mm1,mm1
-	pxor      mm2,mm2
-	punpcklwd mm1,mm4		; mm1=tmp0L
-	punpckhwd mm2,mm4		; mm2=tmp0H
-	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
-	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-	movq      mm3,mm5		; mm5=in2=z2
-	punpcklwd mm5,mm0		; mm0=in6=z3
-	punpckhwd mm3,mm0
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
-
-	movq	mm4,mm1
-	movq	mm0,mm2
-	paddd	mm1,mm5			; mm1=tmp10L
-	paddd	mm2,mm3			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp12L
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	; -- Final output stage
-
-	movq	mm5,mm1
-	movq	mm3,mm2
-	paddd	mm1,mm6			; mm1=data0L
-	paddd	mm2,mm7			; mm2=data0H
-	psubd	mm5,mm6			; mm5=data3L
-	psubd	mm3,mm7			; mm3=data3H
-
-	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
-
-	paddd	mm1,mm6
-	paddd	mm2,mm6
-	psrad	mm1,DESCALE_P1_4
-	psrad	mm2,DESCALE_P1_4
-	paddd	mm5,mm6
-	paddd	mm3,mm6
-	psrad	mm5,DESCALE_P1_4
-	psrad	mm3,DESCALE_P1_4
-
-	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
-	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
-
-	movq	mm2,mm4
-	movq	mm3,mm0
-	paddd	mm4,mm7			; mm4=data1L
-	paddd	mm0,mm6			; mm0=data1H
-	psubd	mm2,mm7			; mm2=data2L
-	psubd	mm3,mm6			; mm3=data2H
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
-
-	paddd	mm4,mm7
-	paddd	mm0,mm7
-	psrad	mm4,DESCALE_P1_4
-	psrad	mm0,DESCALE_P1_4
-	paddd	mm2,mm7
-	paddd	mm3,mm7
-	psrad	mm2,DESCALE_P1_4
-	psrad	mm3,DESCALE_P1_4
-
-	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
-	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
-
-	movq      mm6,mm1		; transpose coefficients(phase 1)
-	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
-	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
-	movq      mm7,mm2		; transpose coefficients(phase 1)
-	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
-	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
-
-	movq      mm0,mm1		; transpose coefficients(phase 2)
-	punpckldq mm1,mm2		; mm1=(00 10 20 30)
-	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
-	movq      mm3,mm6		; transpose coefficients(phase 2)
-	punpckldq mm6,mm7		; mm6=(02 12 22 32)
-	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
-
-	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
-	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
-	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
-	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; JCOEF * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	movq      mm4,mm0
-	movq      mm5,mm0
-	punpcklwd mm4,mm1
-	punpckhwd mm5,mm1
-	movq      mm0,mm4
-	movq      mm1,mm5
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
-	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
-	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
-
-	movq      mm6,mm2
-	movq      mm7,mm2
-	punpcklwd mm6,mm3
-	punpckhwd mm7,mm3
-	movq      mm2,mm6
-	movq      mm3,mm7
-	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
-	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
-	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
-
-	paddd	mm6,mm4			; mm6=tmp2L
-	paddd	mm7,mm5			; mm7=tmp2H
-	paddd	mm2,mm0			; mm2=tmp0L
-	paddd	mm3,mm1			; mm3=tmp0H
-
-	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
-	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	pxor      mm1,mm1
-	pxor      mm2,mm2
-	punpcklwd mm1,mm4		; mm1=tmp0L
-	punpckhwd mm2,mm4		; mm2=tmp0H
-	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
-	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-	movq      mm3,mm5		; mm5=in2=z2
-	punpcklwd mm5,mm0		; mm0=in6=z3
-	punpckhwd mm3,mm0
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
-	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
-
-	movq	mm4,mm1
-	movq	mm0,mm2
-	paddd	mm1,mm5			; mm1=tmp10L
-	paddd	mm2,mm3			; mm2=tmp10H
-	psubd	mm4,mm5			; mm4=tmp12L
-	psubd	mm0,mm3			; mm0=tmp12H
-
-	; -- Final output stage
-
-	movq	mm5,mm1
-	movq	mm3,mm2
-	paddd	mm1,mm6			; mm1=data0L
-	paddd	mm2,mm7			; mm2=data0H
-	psubd	mm5,mm6			; mm5=data3L
-	psubd	mm3,mm7			; mm3=data3H
-
-	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
-
-	paddd	mm1,mm6
-	paddd	mm2,mm6
-	psrad	mm1,DESCALE_P2_4
-	psrad	mm2,DESCALE_P2_4
-	paddd	mm5,mm6
-	paddd	mm3,mm6
-	psrad	mm5,DESCALE_P2_4
-	psrad	mm3,DESCALE_P2_4
-
-	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
-	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
-
-	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
-	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
-
-	movq	mm2,mm4
-	movq	mm3,mm0
-	paddd	mm4,mm7			; mm4=data1L
-	paddd	mm0,mm6			; mm0=data1H
-	psubd	mm2,mm7			; mm2=data2L
-	psubd	mm3,mm6			; mm3=data2H
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
-
-	paddd	mm4,mm7
-	paddd	mm0,mm7
-	psrad	mm4,DESCALE_P2_4
-	psrad	mm0,DESCALE_P2_4
-	paddd	mm2,mm7
-	paddd	mm3,mm7
-	psrad	mm2,DESCALE_P2_4
-	psrad	mm3,DESCALE_P2_4
-
-	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
-	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
-
-	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
-
-	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
-	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
-	paddb     mm1,mm6
-	paddb     mm4,mm6
-
-	movq      mm7,mm1		; transpose coefficients(phase 1)
-	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
-	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
-
-	movq      mm0,mm1		; transpose coefficients(phase 2)
-	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
-	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
-	psrlq	mm1,4*BYTE_BIT
-	psrlq	mm0,4*BYTE_BIT
-
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-	align	16
-	global	EXTN(jsimd_idct_2x2_mmx) PRIVATE
-
-EXTN(jsimd_idct_2x2_mmx):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-	mov	edx, POINTER [dct_table(ebp)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
-
-	; | input:                  | result:        |
-	; | 00 01 ** 03 ** 05 ** 07 |                |
-	; | 10 11 ** 13 ** 15 ** 17 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-	; | 50 51 ** 53 ** 55 ** 57 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 70 71 ** 73 ** 75 ** 77 |                |
-
-	; -- Odd part
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
-	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
-
-	pcmpeqd   mm7,mm7
-	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
-
-	movq      mm4,mm0		; mm4=(10 11 ** 13)
-	movq      mm5,mm2		; mm5=(50 51 ** 53)
-	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
-	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
-	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
-	pand	mm1,mm7			; mm1=(-- 31 -- 33)
-	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
-	pand	mm3,mm7			; mm3=(-- 71 -- 73)
-	por	mm0,mm1			; mm0=(11 31 13 33)
-	por	mm2,mm3			; mm2=(51 71 53 73)
-	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
-
-	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
-	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
-	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
-	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
-
-	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
-	pand	mm1,mm7			; mm1=(-- 35 -- 37)
-	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
-	pand	mm5,mm7			; mm5=(-- 75 -- 77)
-	por	mm6,mm1			; mm6=(15 35 17 37)
-	por	mm3,mm5			; mm3=(55 75 57 77)
-	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
-	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
-
-	; -- Even part
-
-	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
-	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
-
-	movq	mm2,mm1				; mm2=(00 01 ** 03)
-	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
-	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
-
-	pand	mm2,mm7				; mm2=(-- 01 -- 03)
-	pand	mm5,mm7				; mm5=(-- 05 -- 07)
-	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
-	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
-
-	; -- Final output stage
-
-	movq      mm3,mm1
-	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
-	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
-	punpckldq mm1,mm3		; mm1=(A0 B0)
-
-	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
-
-	movq	mm4,mm2
-	movq	mm3,mm5
-	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
-	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
-	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
-	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
-
-	paddd	mm1,mm7
-	psrad	mm1,DESCALE_P1_2
-
-	paddd	mm2,mm7
-	paddd	mm5,mm7
-	psrad	mm2,DESCALE_P1_2
-	psrad	mm5,DESCALE_P1_2
-	paddd	mm4,mm7
-	paddd	mm3,mm7
-	psrad	mm4,DESCALE_P1_2
-	psrad	mm3,DESCALE_P1_2
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(ebp)]
-
-	; | input:| result:|
-	; | A0 B0 |        |
-	; | A1 B1 | C0 C1  |
-	; | A3 B3 | D0 D1  |
-	; | A5 B5 |        |
-	; | A7 B7 |        |
-
-	; -- Odd part
-
-	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
-	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
-	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
-
-	; -- Even part
-
-	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
-
-	; -- Final output stage
-
-	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
-
-	movq      mm6,mm1
-	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
-	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
-
-	paddd     mm1,mm0
-	paddd     mm6,mm0
-	psrad     mm1,DESCALE_P2_2
-	psrad     mm6,DESCALE_P2_2
-
-	movq      mm7,mm1		; transpose coefficients
-	punpckldq mm1,mm6		; mm1=(C0 D0)
-	punpckhdq mm7,mm6		; mm7=(C1 D1)
-
-	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
-	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
-	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-	movd	ecx,mm1
-	movd	ebx,mm1			; ebx=(C0 D0 C1 D1)
-	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
-	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2flt-64.asm b/simd/jiss2flt-64.asm
deleted file mode 100644
index 5acfcf7..0000000
--- a/simd/jiss2flt-64.asm
+++ /dev/null
@@ -1,483 +0,0 @@
-;
-; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_sse2) PRIVATE
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414	times 4 dd  1.414213562373095048801689
-PD_1_847	times 4 dd  1.847759065022573512256366
-PD_1_082	times 4 dd  1.082392200292393968799446
-PD_M2_613	times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void * dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_sse2) PRIVATE
-
-EXTN(jsimd_idct_float_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [workspace]
-	collect_args
-	push	rbx
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-	lea	rdi, [workspace]			; FAST_FLOAT * wsptr
-	mov	rcx, DCTSIZE/4				; ctr
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1,xmm2
-	por	xmm3,xmm4
-	por	xmm5,xmm6
-	por	xmm1,xmm3
-	por	xmm5,xmm7
-	por	xmm1,xmm5
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm1,xmm0
-	movaps	xmm2,xmm0
-	movaps	xmm3,xmm0
-
-	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
-	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
-	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
-	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	jmp	near .nextcolumn
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
-
-	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
-	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
-
-	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[rel PD_1_414]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
-	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
-
-	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
-	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
-	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
-	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
-
-	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
-	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
-	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
-	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
-	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
-	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
-	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
-	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
-	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
-
-	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
-	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm0,xmm7
-	movaps	xmm3,xmm5
-	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
-	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
-	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
-	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
-
-	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
-	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
-	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
-	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
-	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
-	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
-
-	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
-	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
-	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
-	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
-	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-
-	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
-	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
-	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
-	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
-	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
-	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-	add	rsi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	rcx					; ctr
-	jnz	near .columnloop
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	rax, [original_rbp]
-	lea	rsi, [workspace]			; FAST_FLOAT * wsptr
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	eax, r13d
-	mov	rcx, DCTSIZE/4				; ctr
-.rowloop:
-
-	; -- Even part
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[rel PD_1_414]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
-	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
-	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
-	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
-	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps	xmm1,[rel PD_RNDINT_MAGIC]	; xmm1=[rel PD_RNDINT_MAGIC]
-	pcmpeqd	xmm3,xmm3
-	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
-	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
-	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
-	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
-
-	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm7,xmm1
-	movaps	xmm5,xmm3
-	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
-	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
-	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
-	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
-
-	movaps	xmm2,[rel PD_RNDINT_MAGIC]	; xmm2=[rel PD_RNDINT_MAGIC]
-	pcmpeqd	xmm4,xmm4
-	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
-	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
-	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
-	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
-
-	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-	paddb     xmm6,xmm2
-	paddb     xmm1,xmm2
-
-	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
-	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
-	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
-
-	add	rsi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
-	add	rdi, byte 4*SIZEOF_JSAMPROW
-	dec	rcx				; ctr
-	jnz	near .rowloop
-
-	pop	rbx
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2flt.asm b/simd/jiss2flt.asm
deleted file mode 100644
index 6eebe88..0000000
--- a/simd/jiss2flt.asm
+++ /dev/null
@@ -1,498 +0,0 @@
-;
-; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_sse2) PRIVATE
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414	times 4 dd  1.414213562373095048801689
-PD_1_847	times 4 dd  1.847759065022573512256366
-PD_1_082	times 4 dd  1.082392200292393968799446
-PD_M2_613	times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_sse2) PRIVATE
-
-EXTN(jsimd_idct_float_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; FAST_FLOAT * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm1,xmm2
-	por	xmm3,xmm4
-	por	xmm5,xmm6
-	por	xmm1,xmm3
-	por	xmm5,xmm7
-	por	xmm1,xmm5
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm1,xmm0
-	movaps	xmm2,xmm0
-	movaps	xmm3,xmm0
-
-	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
-	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
-	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
-	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
-	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
-	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
-
-	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
-	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
-
-	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
-	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
-	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
-	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
-	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
-	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
-
-	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
-	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
-	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
-	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
-	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
-	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
-
-	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
-	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
-	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
-	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
-	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
-	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
-	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
-	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
-
-	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
-	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm0,xmm7
-	movaps	xmm3,xmm5
-	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
-	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
-	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
-	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
-
-	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
-	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
-	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
-	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
-	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
-	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
-
-	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
-	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
-	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
-	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
-	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
-	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
-	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
-	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
-	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; FAST_FLOAT * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
-	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
-	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
-	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
-	pcmpeqd	xmm3,xmm3
-	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
-	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
-	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
-	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
-
-	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
-	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm7,xmm1
-	movaps	xmm5,xmm3
-	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
-	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
-	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
-	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
-
-	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
-	pcmpeqd	xmm4,xmm4
-	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
-	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
-	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
-	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
-	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
-	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
-
-	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-	paddb     xmm6,xmm2
-	paddb     xmm1,xmm2
-
-	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
-	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
-	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2fst-64.asm b/simd/jiss2fst-64.asm
deleted file mode 100644
index a620461..0000000
--- a/simd/jiss2fst-64.asm
+++ /dev/null
@@ -1,492 +0,0 @@
-;
-; jiss2fst-64.asm - fast integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/projecpt/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-%define PASS1_BITS	2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082	equ	277		; FIX(1.082392200)
-F_1_414	equ	362		; FIX(1.414213562)
-F_1_847	equ	473		; FIX(1.847759065)
-F_2_613	equ	669		; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
-F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT
-PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT
-PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info * compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_idct_ifast_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
-	jmp	near .column_end
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	psubw	xmm0,xmm2		; xmm0=tmp11
-	psubw	xmm1,xmm3
-	paddw	xmm4,xmm2		; xmm4=tmp10
-	paddw	xmm5,xmm3		; xmm5=tmp13
-
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm1,[rel PW_F1414]
-	psubw	xmm1,xmm5		; xmm1=tmp12
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm7,xmm0
-	psubw	xmm4,xmm5		; xmm4=tmp3
-	psubw	xmm0,xmm1		; xmm0=tmp2
-	paddw	xmm6,xmm5		; xmm6=tmp0
-	paddw	xmm7,xmm1		; xmm7=tmp1
-
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm0,xmm5
-	psubw	xmm2,xmm1		; xmm2=z12
-	psubw	xmm5,xmm3		; xmm5=z10
-	paddw	xmm4,xmm1		; xmm4=z11
-	paddw	xmm0,xmm3		; xmm0=z13
-
-	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm3,xmm4
-	psubw	xmm4,xmm0
-	paddw	xmm3,xmm0		; xmm3=tmp7
-
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm4,[rel PW_F1414]	; xmm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm0,xmm5
-	paddw	xmm5,xmm2
-	pmulhw	xmm5,[rel PW_F1847]	; xmm5=z5
-	pmulhw	xmm0,[rel PW_MF1613]
-	pmulhw	xmm2,[rel PW_F1082]
-	psubw	xmm0,xmm1
-	psubw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm0,xmm5		; xmm0=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm0,xmm3		; xmm0=tmp6
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm7
-	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
-	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
-	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
-	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
-	psubw	xmm4,xmm0		; xmm4=tmp5
-
-	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
-	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
-
-	paddw	xmm2,xmm4		; xmm2=tmp4
-	movdqa	xmm5,xmm7
-	movdqa	xmm0,xmm1
-	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
-	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
-	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
-	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
-	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
-
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
-	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
-
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
-	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	rax, [original_rbp]
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	eax, r13d
-
-	; -- Even part
-
-	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm0,xmm5
-	psubw	xmm6,xmm1		; xmm6=tmp11
-	psubw	xmm5,xmm3
-	paddw	xmm2,xmm1		; xmm2=tmp10
-	paddw	xmm0,xmm3		; xmm0=tmp13
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[rel PW_F1414]
-	psubw	xmm5,xmm0		; xmm5=tmp12
-
-	movdqa	xmm1,xmm2
-	movdqa	xmm3,xmm6
-	psubw	xmm2,xmm0		; xmm2=tmp3
-	psubw	xmm6,xmm5		; xmm6=tmp2
-	paddw	xmm1,xmm0		; xmm1=tmp0
-	paddw	xmm3,xmm5		; xmm3=tmp1
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
-
-	; -- Odd part
-
-	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-	movdqa	xmm2,xmm0
-	movdqa	xmm6,xmm4
-	psubw	xmm0,xmm7		; xmm0=z12
-	psubw	xmm4,xmm5		; xmm4=z10
-	paddw	xmm2,xmm7		; xmm2=z11
-	paddw	xmm6,xmm5		; xmm6=z13
-
-	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm5,xmm2
-	psubw	xmm2,xmm6
-	paddw	xmm5,xmm6		; xmm5=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm2,[rel PW_F1414]	; xmm2=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm6,xmm4
-	paddw	xmm4,xmm0
-	pmulhw	xmm4,[rel PW_F1847]	; xmm4=z5
-	pmulhw	xmm6,[rel PW_MF1613]
-	pmulhw	xmm0,[rel PW_F1082]
-	psubw	xmm6,xmm7
-	psubw	xmm0,xmm4		; xmm0=tmp10
-	paddw	xmm6,xmm4		; xmm6=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm6,xmm5		; xmm6=tmp6
-	movdqa	xmm7,xmm1
-	movdqa	xmm4,xmm3
-	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
-	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	psraw	xmm1,(PASS1_BITS+3)	; descale
-	psraw	xmm3,(PASS1_BITS+3)	; descale
-	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
-	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psubw	xmm2,xmm6		; xmm2=tmp5
-
-	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
-
-	paddw	xmm0,xmm2		; xmm0=tmp4
-	movdqa	xmm4,xmm5
-	movdqa	xmm7,xmm6
-	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
-	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
-	psraw	xmm5,(PASS1_BITS+3)	; descale
-	psraw	xmm6,(PASS1_BITS+3)	; descale
-	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
-	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-
-	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
-
-	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm1,xmm2
-	paddb     xmm3,xmm2
-	paddb     xmm5,xmm2
-	paddb     xmm7,xmm2
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
-	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
-	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
-	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
-	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
-
-	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2fst.asm b/simd/jiss2fst.asm
deleted file mode 100644
index 84b54b9..0000000
--- a/simd/jiss2fst.asm
+++ /dev/null
@@ -1,502 +0,0 @@
-;
-; jiss2fst.asm - fast integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	8	; 14 is also OK.
-%define PASS1_BITS	2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082	equ	277		; FIX(1.082392200)
-F_1_414	equ	362		; FIX(1.414213562)
-F_1_847	equ	473		; FIX(1.847759065)
-F_2_613	equ	669		; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
-F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
-F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-	alignz	16
-	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT
-PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT
-PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
-
-EXTN(jsimd_idct_ifast_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
-	jmp	near .column_end
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm0
-	movdqa	xmm5,xmm1
-	psubw	xmm0,xmm2		; xmm0=tmp11
-	psubw	xmm1,xmm3
-	paddw	xmm4,xmm2		; xmm4=tmp10
-	paddw	xmm5,xmm3		; xmm5=tmp13
-
-	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm1,[GOTOFF(ebx,PW_F1414)]
-	psubw	xmm1,xmm5		; xmm1=tmp12
-
-	movdqa	xmm6,xmm4
-	movdqa	xmm7,xmm0
-	psubw	xmm4,xmm5		; xmm4=tmp3
-	psubw	xmm0,xmm1		; xmm0=tmp2
-	paddw	xmm6,xmm5		; xmm6=tmp0
-	paddw	xmm7,xmm1		; xmm7=tmp1
-
-	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
-	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
-
-	; -- Odd part
-
-	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-	movdqa	xmm4,xmm2
-	movdqa	xmm0,xmm5
-	psubw	xmm2,xmm1		; xmm2=z12
-	psubw	xmm5,xmm3		; xmm5=z10
-	paddw	xmm4,xmm1		; xmm4=z11
-	paddw	xmm0,xmm3		; xmm0=z13
-
-	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm3,xmm4
-	psubw	xmm4,xmm0
-	paddw	xmm3,xmm0		; xmm3=tmp7
-
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F1414)]	; xmm4=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm0,xmm5
-	paddw	xmm5,xmm2
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F1847)]	; xmm5=z5
-	pmulhw	xmm0,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	xmm2,[GOTOFF(ebx,PW_F1082)]
-	psubw	xmm0,xmm1
-	psubw	xmm2,xmm5		; xmm2=tmp10
-	paddw	xmm0,xmm5		; xmm0=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm0,xmm3		; xmm0=tmp6
-	movdqa	xmm1,xmm6
-	movdqa	xmm5,xmm7
-	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
-	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
-	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
-	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
-	psubw	xmm4,xmm0		; xmm4=tmp5
-
-	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
-	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
-	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
-
-	paddw	xmm2,xmm4		; xmm2=tmp4
-	movdqa	xmm5,xmm7
-	movdqa	xmm0,xmm1
-	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
-	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
-	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
-	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
-	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
-
-	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
-	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
-	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
-	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
-	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
-	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
-
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
-	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-
-	; -- Even part
-
-	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-	movdqa	xmm2,xmm6
-	movdqa	xmm0,xmm5
-	psubw	xmm6,xmm1		; xmm6=tmp11
-	psubw	xmm5,xmm3
-	paddw	xmm2,xmm1		; xmm2=tmp10
-	paddw	xmm0,xmm3		; xmm0=tmp13
-
-	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm5,[GOTOFF(ebx,PW_F1414)]
-	psubw	xmm5,xmm0		; xmm5=tmp12
-
-	movdqa	xmm1,xmm2
-	movdqa	xmm3,xmm6
-	psubw	xmm2,xmm0		; xmm2=tmp3
-	psubw	xmm6,xmm5		; xmm6=tmp2
-	paddw	xmm1,xmm0		; xmm1=tmp0
-	paddw	xmm3,xmm5		; xmm3=tmp1
-
-	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
-
-	; -- Odd part
-
-	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-	movdqa	xmm2,xmm0
-	movdqa	xmm6,xmm4
-	psubw	xmm0,xmm7		; xmm0=z12
-	psubw	xmm4,xmm5		; xmm4=z10
-	paddw	xmm2,xmm7		; xmm2=z11
-	paddw	xmm6,xmm5		; xmm6=z13
-
-	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
-	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
-	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
-
-	movdqa	xmm5,xmm2
-	psubw	xmm2,xmm6
-	paddw	xmm5,xmm6		; xmm5=tmp7
-
-	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
-	pmulhw	xmm2,[GOTOFF(ebx,PW_F1414)]	; xmm2=tmp11
-
-	; To avoid overflow...
-	;
-	; (Original)
-	; tmp12 = -2.613125930 * z10 + z5;
-	;
-	; (This implementation)
-	; tmp12 = (-1.613125930 - 1) * z10 + z5;
-	;       = -1.613125930 * z10 - z10 + z5;
-
-	movdqa	xmm6,xmm4
-	paddw	xmm4,xmm0
-	pmulhw	xmm4,[GOTOFF(ebx,PW_F1847)]	; xmm4=z5
-	pmulhw	xmm6,[GOTOFF(ebx,PW_MF1613)]
-	pmulhw	xmm0,[GOTOFF(ebx,PW_F1082)]
-	psubw	xmm6,xmm7
-	psubw	xmm0,xmm4		; xmm0=tmp10
-	paddw	xmm6,xmm4		; xmm6=tmp12
-
-	; -- Final output stage
-
-	psubw	xmm6,xmm5		; xmm6=tmp6
-	movdqa	xmm7,xmm1
-	movdqa	xmm4,xmm3
-	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
-	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	psraw	xmm1,(PASS1_BITS+3)	; descale
-	psraw	xmm3,(PASS1_BITS+3)	; descale
-	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
-	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psubw	xmm2,xmm6		; xmm2=tmp5
-
-	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
-	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
-
-	paddw	xmm0,xmm2		; xmm0=tmp4
-	movdqa	xmm4,xmm5
-	movdqa	xmm7,xmm6
-	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
-	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
-	psraw	xmm5,(PASS1_BITS+3)	; descale
-	psraw	xmm6,(PASS1_BITS+3)	; descale
-	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
-	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
-	psraw	xmm4,(PASS1_BITS+3)	; descale
-	psraw	xmm7,(PASS1_BITS+3)	; descale
-
-	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
-
-	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm1,xmm2
-	paddb     xmm3,xmm2
-	paddb     xmm5,xmm2
-	paddb     xmm7,xmm2
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
-	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
-	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
-	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
-	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
-	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
-
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2int-64.asm b/simd/jiss2int-64.asm
deleted file mode 100644
index 0730818..0000000
--- a/simd/jiss2int-64.asm
+++ /dev/null
@@ -1,848 +0,0 @@
-;
-; jiss2int-64.asm - accurate integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_islow_sse2) PRIVATE
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info * compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		12
-
-	align	16
-	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
-
-EXTN(jsimd_idct_islow_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm5,PASS1_BITS
-
-	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-	jmp	near .column_end
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm4,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm4,xmm3		; xmm3=in6=z3
-	punpckhwd xmm5,xmm3
-	movdqa    xmm1,xmm4
-	movdqa    xmm3,xmm5
-	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=tmp3L
-	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
-	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
-	pmaddwd   xmm3,[rel PW_F054_MF130]	; xmm3=tmp2H
-
-	movdqa    xmm6,xmm0
-	paddw     xmm0,xmm2		; xmm0=in0+in4
-	psubw     xmm6,xmm2		; xmm6=in0-in4
-
-	pxor      xmm7,xmm7
-	pxor      xmm2,xmm2
-	punpcklwd xmm7,xmm0		; xmm7=tmp0L
-	punpckhwd xmm2,xmm0		; xmm2=tmp0H
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm4		; xmm7=tmp10L
-	psubd	xmm0,xmm4		; xmm0=tmp13L
-	movdqa	xmm4,xmm2
-	paddd	xmm2,xmm5		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm7,xmm7
-	punpcklwd xmm5,xmm6		; xmm5=tmp1L
-	punpckhwd xmm7,xmm6		; xmm7=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-	movdqa	xmm2,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm2,xmm1		; xmm2=tmp12L
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm3		; xmm7=tmp11H
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm7,xmm4
-	paddw	xmm5,xmm3		; xmm5=z3
-	paddw	xmm7,xmm1		; xmm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm2,xmm5
-	movdqa    xmm0,xmm5
-	punpcklwd xmm2,xmm7
-	punpckhwd xmm0,xmm7
-	movdqa    xmm5,xmm2
-	movdqa    xmm7,xmm0
-	pmaddwd   xmm2,[rel PW_MF078_F117]	; xmm2=z3L
-	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3H
-	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
-	pmaddwd   xmm7,[rel PW_F117_F078]	; xmm7=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm2,xmm3
-	movdqa    xmm0,xmm3
-	punpcklwd xmm2,xmm4
-	punpckhwd xmm0,xmm4
-	movdqa    xmm3,xmm2
-	movdqa    xmm4,xmm0
-	pmaddwd   xmm2,[rel PW_MF060_MF089]	; xmm2=tmp0L
-	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0H
-	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3L
-	pmaddwd   xmm4,[rel PW_MF089_F060]	; xmm4=tmp3H
-
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
-	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
-	paddd	xmm3,xmm5		; xmm3=tmp3L
-	paddd	xmm4,xmm7		; xmm4=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
-
-	movdqa    xmm2,xmm1
-	movdqa    xmm0,xmm1
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm0,xmm6
-	movdqa    xmm1,xmm2
-	movdqa    xmm6,xmm0
-	pmaddwd   xmm2,[rel PW_MF050_MF256]	; xmm2=tmp1L
-	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1H
-	pmaddwd   xmm1,[rel PW_MF256_F050]	; xmm1=tmp2L
-	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
-
-	paddd	xmm2,xmm5		; xmm2=tmp1L
-	paddd	xmm0,xmm7		; xmm0=tmp1H
-	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm0,xmm7
-	paddd	xmm5,xmm3		; xmm5=data0L
-	paddd	xmm7,xmm4		; xmm7=data0H
-	psubd	xmm2,xmm3		; xmm2=data7L
-	psubd	xmm0,xmm4		; xmm0=data7H
-
-	movdqa	xmm3,[rel PD_DESCALE_P1]	; xmm3=[rel PD_DESCALE_P1]
-
-	paddd	xmm5,xmm3
-	paddd	xmm7,xmm3
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm7,DESCALE_P1
-	paddd	xmm2,xmm3
-	paddd	xmm0,xmm3
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
-	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
-
-	movdqa	xmm7,xmm4
-	movdqa	xmm0,xmm3
-	paddd	xmm4,xmm1		; xmm4=data1L
-	paddd	xmm3,xmm6		; xmm3=data1H
-	psubd	xmm7,xmm1		; xmm7=data6L
-	psubd	xmm0,xmm6		; xmm0=data6H
-
-	movdqa	xmm1,[rel PD_DESCALE_P1]	; xmm1=[rel PD_DESCALE_P1]
-
-	paddd	xmm4,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-	paddd	xmm7,xmm1
-	paddd	xmm0,xmm1
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
-	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
-	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
-	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm5,xmm3
-	movdqa	xmm6,xmm0
-	paddd	xmm3,xmm4		; xmm3=data2L
-	paddd	xmm0,xmm2		; xmm0=data2H
-	psubd	xmm5,xmm4		; xmm5=data5L
-	psubd	xmm6,xmm2		; xmm6=data5H
-
-	movdqa	xmm7,[rel PD_DESCALE_P1]	; xmm7=[rel PD_DESCALE_P1]
-
-	paddd	xmm3,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm3,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-	paddd	xmm5,xmm7
-	paddd	xmm6,xmm7
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
-	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
-	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
-	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
-	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
-
-	movdqa	xmm0,xmm1
-	movdqa	xmm6,xmm4
-	paddd	xmm1,xmm2		; xmm1=data3L
-	paddd	xmm4,xmm7		; xmm4=data3H
-	psubd	xmm0,xmm2		; xmm0=data4L
-	psubd	xmm6,xmm7		; xmm6=data4H
-
-	movdqa	xmm2,[rel PD_DESCALE_P1]	; xmm2=[rel PD_DESCALE_P1]
-
-	paddd	xmm1,xmm2
-	paddd	xmm4,xmm2
-	psrad	xmm1,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm0,xmm2
-	paddd	xmm6,xmm2
-	psrad	xmm0,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
-	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
-	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
-
-	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
-	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
-	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
-
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	rax, [original_rbp]
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	eax, r13d
-
-	; -- Even part
-
-	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm6,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm6,xmm2		; xmm2=in6=z3
-	punpckhwd xmm5,xmm2
-	movdqa    xmm1,xmm6
-	movdqa    xmm2,xmm5
-	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=tmp3L
-	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
-	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
-	pmaddwd   xmm2,[rel PW_F054_MF130]	; xmm2=tmp2H
-
-	movdqa    xmm3,xmm7
-	paddw     xmm7,xmm0		; xmm7=in0+in4
-	psubw     xmm3,xmm0		; xmm3=in0-in4
-
-	pxor      xmm4,xmm4
-	pxor      xmm0,xmm0
-	punpcklwd xmm4,xmm7		; xmm4=tmp0L
-	punpckhwd xmm0,xmm7		; xmm0=tmp0H
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm6		; xmm4=tmp10L
-	psubd	xmm7,xmm6		; xmm7=tmp13L
-	movdqa	xmm6,xmm0
-	paddd	xmm0,xmm5		; xmm0=tmp10H
-	psubd	xmm6,xmm5		; xmm6=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm4,xmm4
-	punpcklwd xmm5,xmm3		; xmm5=tmp1L
-	punpckhwd xmm4,xmm3		; xmm4=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-	movdqa	xmm0,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm0,xmm1		; xmm0=tmp12L
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm2		; xmm4=tmp11H
-	psubd	xmm7,xmm2		; xmm7=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
-	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
-	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
-	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm4,xmm3
-	paddw	xmm5,xmm1		; xmm5=z3
-	paddw	xmm4,xmm2		; xmm4=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm0,xmm5
-	movdqa    xmm7,xmm5
-	punpcklwd xmm0,xmm4
-	punpckhwd xmm7,xmm4
-	movdqa    xmm5,xmm0
-	movdqa    xmm4,xmm7
-	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3L
-	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3H
-	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
-	pmaddwd   xmm4,[rel PW_F117_F078]	; xmm4=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm0,xmm1
-	movdqa    xmm7,xmm1
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm1,xmm0
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0L
-	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp0H
-	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp3L
-	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3H
-
-	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
-	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
-	paddd	xmm1,xmm5		; xmm1=tmp3L
-	paddd	xmm3,xmm4		; xmm3=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
-
-	movdqa    xmm0,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm0,xmm6
-	punpckhwd xmm7,xmm6
-	movdqa    xmm2,xmm0
-	movdqa    xmm6,xmm7
-	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1L
-	pmaddwd   xmm7,[rel PW_MF050_MF256]	; xmm7=tmp1H
-	pmaddwd   xmm2,[rel PW_MF256_F050]	; xmm2=tmp2L
-	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
-
-	paddd	xmm0,xmm5		; xmm0=tmp1L
-	paddd	xmm7,xmm4		; xmm7=tmp1H
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm7,xmm4
-	paddd	xmm5,xmm1		; xmm5=data0L
-	paddd	xmm4,xmm3		; xmm4=data0H
-	psubd	xmm0,xmm1		; xmm0=data7L
-	psubd	xmm7,xmm3		; xmm7=data7H
-
-	movdqa	xmm1,[rel PD_DESCALE_P2]	; xmm1=[rel PD_DESCALE_P2]
-
-	paddd	xmm5,xmm1
-	paddd	xmm4,xmm1
-	psrad	xmm5,DESCALE_P2
-	psrad	xmm4,DESCALE_P2
-	paddd	xmm0,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
-	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
-	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm7,xmm1
-	paddd	xmm3,xmm2		; xmm3=data1L
-	paddd	xmm1,xmm6		; xmm1=data1H
-	psubd	xmm4,xmm2		; xmm4=data6L
-	psubd	xmm7,xmm6		; xmm7=data6H
-
-	movdqa	xmm2,[rel PD_DESCALE_P2]	; xmm2=[rel PD_DESCALE_P2]
-
-	paddd	xmm3,xmm2
-	paddd	xmm1,xmm2
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm4,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
-	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
-	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm4,xmm6
-	movdqa	xmm0,xmm2
-	paddd	xmm6,xmm1		; xmm6=data2L
-	paddd	xmm2,xmm7		; xmm2=data2H
-	psubd	xmm4,xmm1		; xmm4=data5L
-	psubd	xmm0,xmm7		; xmm0=data5H
-
-	movdqa	xmm5,[rel PD_DESCALE_P2]	; xmm5=[rel PD_DESCALE_P2]
-
-	paddd	xmm6,xmm5
-	paddd	xmm2,xmm5
-	psrad	xmm6,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm4,xmm5
-	paddd	xmm0,xmm5
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
-	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
-	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
-	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
-	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
-
-	movdqa	xmm2,xmm3
-	movdqa	xmm0,xmm1
-	paddd	xmm3,xmm7		; xmm3=data3L
-	paddd	xmm1,xmm5		; xmm1=data3H
-	psubd	xmm2,xmm7		; xmm2=data4L
-	psubd	xmm0,xmm5		; xmm0=data4H
-
-	movdqa	xmm7,[rel PD_DESCALE_P2]	; xmm7=[rel PD_DESCALE_P2]
-
-	paddd	xmm3,xmm7
-	paddd	xmm1,xmm7
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm2,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm2,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	movdqa    xmm5,[rel PB_CENTERJSAMP]	; xmm5=[rel PB_CENTERJSAMP]
-
-	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
-	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm7,xmm5
-	paddb     xmm1,xmm5
-	paddb     xmm6,xmm5
-	paddb     xmm3,xmm5
-
-	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
-	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
-	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
-	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
-	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
-	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
-	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
-	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2int.asm b/simd/jiss2int.asm
deleted file mode 100644
index 17a23f3..0000000
--- a/simd/jiss2int.asm
+++ /dev/null
@@ -1,859 +0,0 @@
-;
-; jiss2int.asm - accurate integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
-%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298	equ	 2446		; FIX(0.298631336)
-F_0_390	equ	 3196		; FIX(0.390180644)
-F_0_541	equ	 4433		; FIX(0.541196100)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_175	equ	 9633		; FIX(1.175875602)
-F_1_501	equ	12299		; FIX(1.501321110)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_1_961	equ	16069		; FIX(1.961570560)
-F_2_053	equ	16819		; FIX(2.053119869)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_072	equ	25172		; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
-F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
-F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
-F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
-F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_islow_sse2) PRIVATE
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		12
-
-	align	16
-	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
-
-EXTN(jsimd_idct_islow_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm1,xmm0
-	packsswb xmm1,xmm1
-	packsswb xmm1,xmm1
-	movd	eax,xmm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm5,PASS1_BITS
-
-	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
-	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
-	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
-	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
-	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
-	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
-	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
-	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-	jmp	near .column_end
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm4,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm4,xmm3		; xmm3=in6=z3
-	punpckhwd xmm5,xmm3
-	movdqa    xmm1,xmm4
-	movdqa    xmm3,xmm5
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=tmp3L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]	; xmm3=tmp2H
-
-	movdqa    xmm6,xmm0
-	paddw     xmm0,xmm2		; xmm0=in0+in4
-	psubw     xmm6,xmm2		; xmm6=in0-in4
-
-	pxor      xmm7,xmm7
-	pxor      xmm2,xmm2
-	punpcklwd xmm7,xmm0		; xmm7=tmp0L
-	punpckhwd xmm2,xmm0		; xmm2=tmp0H
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm4		; xmm7=tmp10L
-	psubd	xmm0,xmm4		; xmm0=tmp13L
-	movdqa	xmm4,xmm2
-	paddd	xmm2,xmm5		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm7,xmm7
-	punpcklwd xmm5,xmm6		; xmm5=tmp1L
-	punpckhwd xmm7,xmm6		; xmm7=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-	movdqa	xmm2,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm2,xmm1		; xmm2=tmp12L
-	movdqa	xmm0,xmm7
-	paddd	xmm7,xmm3		; xmm7=tmp11H
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm7,xmm4
-	paddw	xmm5,xmm3		; xmm5=z3
-	paddw	xmm7,xmm1		; xmm7=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm2,xmm5
-	movdqa    xmm0,xmm5
-	punpcklwd xmm2,xmm7
-	punpckhwd xmm0,xmm7
-	movdqa    xmm5,xmm2
-	movdqa    xmm7,xmm0
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]	; xmm2=z3L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3H
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]	; xmm7=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm2,xmm3
-	movdqa    xmm0,xmm3
-	punpcklwd xmm2,xmm4
-	punpckhwd xmm0,xmm4
-	movdqa    xmm3,xmm2
-	movdqa    xmm4,xmm0
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm2=tmp0L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0H
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]	; xmm4=tmp3H
-
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
-	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
-	paddd	xmm3,xmm5		; xmm3=tmp3L
-	paddd	xmm4,xmm7		; xmm4=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
-
-	movdqa    xmm2,xmm1
-	movdqa    xmm0,xmm1
-	punpcklwd xmm2,xmm6
-	punpckhwd xmm0,xmm6
-	movdqa    xmm1,xmm2
-	movdqa    xmm6,xmm0
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm2=tmp1L
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]	; xmm1=tmp2L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
-
-	paddd	xmm2,xmm5		; xmm2=tmp1L
-	paddd	xmm0,xmm7		; xmm0=tmp1H
-	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
-
-	movdqa	xmm2,xmm5
-	movdqa	xmm0,xmm7
-	paddd	xmm5,xmm3		; xmm5=data0L
-	paddd	xmm7,xmm4		; xmm7=data0H
-	psubd	xmm2,xmm3		; xmm2=data7L
-	psubd	xmm0,xmm4		; xmm0=data7H
-
-	movdqa	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm3=[PD_DESCALE_P1]
-
-	paddd	xmm5,xmm3
-	paddd	xmm7,xmm3
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm7,DESCALE_P1
-	paddd	xmm2,xmm3
-	paddd	xmm0,xmm3
-	psrad	xmm2,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
-	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
-
-	movdqa	xmm7,xmm4
-	movdqa	xmm0,xmm3
-	paddd	xmm4,xmm1		; xmm4=data1L
-	paddd	xmm3,xmm6		; xmm3=data1H
-	psubd	xmm7,xmm1		; xmm7=data6L
-	psubd	xmm0,xmm6		; xmm0=data6H
-
-	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm1=[PD_DESCALE_P1]
-
-	paddd	xmm4,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm4,DESCALE_P1
-	psrad	xmm3,DESCALE_P1
-	paddd	xmm7,xmm1
-	paddd	xmm0,xmm1
-	psrad	xmm7,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-
-	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
-	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
-	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
-	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
-	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
-	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
-	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
-	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
-	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
-	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
-
-	movdqa	xmm5,xmm3
-	movdqa	xmm6,xmm0
-	paddd	xmm3,xmm4		; xmm3=data2L
-	paddd	xmm0,xmm2		; xmm0=data2H
-	psubd	xmm5,xmm4		; xmm5=data5L
-	psubd	xmm6,xmm2		; xmm6=data5H
-
-	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm7=[PD_DESCALE_P1]
-
-	paddd	xmm3,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm3,DESCALE_P1
-	psrad	xmm0,DESCALE_P1
-	paddd	xmm5,xmm7
-	paddd	xmm6,xmm7
-	psrad	xmm5,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
-	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
-	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
-	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
-	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
-
-	movdqa	xmm0,xmm1
-	movdqa	xmm6,xmm4
-	paddd	xmm1,xmm2		; xmm1=data3L
-	paddd	xmm4,xmm7		; xmm4=data3H
-	psubd	xmm0,xmm2		; xmm0=data4L
-	psubd	xmm6,xmm7		; xmm6=data4H
-
-	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm2=[PD_DESCALE_P1]
-
-	paddd	xmm1,xmm2
-	paddd	xmm4,xmm2
-	psrad	xmm1,DESCALE_P1
-	psrad	xmm4,DESCALE_P1
-	paddd	xmm0,xmm2
-	paddd	xmm6,xmm2
-	psrad	xmm0,DESCALE_P1
-	psrad	xmm6,DESCALE_P1
-
-	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
-	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
-	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
-
-	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
-	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
-	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
-	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
-	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
-
-	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
-	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
-	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
-	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
-	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
-
-	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
-	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
-
-	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
-	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
-	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
-	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
-	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
-
-	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
-	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
-	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
-	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
-	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
-	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
-
-	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
-	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
-
-	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
-	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
-	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
-	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
-	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
-	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
-	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-
-	; -- Even part
-
-	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-	; (Original)
-	; z1 = (z2 + z3) * 0.541196100;
-	; tmp2 = z1 + z3 * -1.847759065;
-	; tmp3 = z1 + z2 * 0.765366865;
-	;
-	; (This implementation)
-	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-	movdqa    xmm6,xmm1		; xmm1=in2=z2
-	movdqa    xmm5,xmm1
-	punpcklwd xmm6,xmm2		; xmm2=in6=z3
-	punpckhwd xmm5,xmm2
-	movdqa    xmm1,xmm6
-	movdqa    xmm2,xmm5
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=tmp3L
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]	; xmm2=tmp2H
-
-	movdqa    xmm3,xmm7
-	paddw     xmm7,xmm0		; xmm7=in0+in4
-	psubw     xmm3,xmm0		; xmm3=in0-in4
-
-	pxor      xmm4,xmm4
-	pxor      xmm0,xmm0
-	punpcklwd xmm4,xmm7		; xmm4=tmp0L
-	punpckhwd xmm0,xmm7		; xmm0=tmp0H
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm6		; xmm4=tmp10L
-	psubd	xmm7,xmm6		; xmm7=tmp13L
-	movdqa	xmm6,xmm0
-	paddd	xmm0,xmm5		; xmm0=tmp10H
-	psubd	xmm6,xmm5		; xmm6=tmp13H
-
-	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
-	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
-	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
-	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
-
-	pxor      xmm5,xmm5
-	pxor      xmm4,xmm4
-	punpcklwd xmm5,xmm3		; xmm5=tmp1L
-	punpckhwd xmm4,xmm3		; xmm4=tmp1H
-	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
-	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-	movdqa	xmm0,xmm5
-	paddd	xmm5,xmm1		; xmm5=tmp11L
-	psubd	xmm0,xmm1		; xmm0=tmp12L
-	movdqa	xmm7,xmm4
-	paddd	xmm4,xmm2		; xmm4=tmp11H
-	psubd	xmm7,xmm2		; xmm7=tmp12H
-
-	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
-	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
-	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
-	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
-
-	; -- Odd part
-
-	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
-	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
-	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
-	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
-
-	movdqa	xmm5,xmm6
-	movdqa	xmm4,xmm3
-	paddw	xmm5,xmm1		; xmm5=z3
-	paddw	xmm4,xmm2		; xmm4=z4
-
-	; (Original)
-	; z5 = (z3 + z4) * 1.175875602;
-	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-	; z3 += z5;  z4 += z5;
-	;
-	; (This implementation)
-	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-	movdqa    xmm0,xmm5
-	movdqa    xmm7,xmm5
-	punpcklwd xmm0,xmm4
-	punpckhwd xmm7,xmm4
-	movdqa    xmm5,xmm0
-	movdqa    xmm4,xmm7
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3H
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]	; xmm4=z4H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
-
-	; (Original)
-	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-	;
-	; (This implementation)
-	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-	; tmp0 += z3;  tmp1 += z4;
-	; tmp2 += z3;  tmp3 += z4;
-
-	movdqa    xmm0,xmm1
-	movdqa    xmm7,xmm1
-	punpcklwd xmm0,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm1,xmm0
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp0H
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp3L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3H
-
-	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
-	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
-	paddd	xmm1,xmm5		; xmm1=tmp3L
-	paddd	xmm3,xmm4		; xmm3=tmp3H
-
-	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
-	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
-
-	movdqa    xmm0,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm0,xmm6
-	punpckhwd xmm7,xmm6
-	movdqa    xmm2,xmm0
-	movdqa    xmm6,xmm7
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1L
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm7=tmp1H
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]	; xmm2=tmp2L
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
-
-	paddd	xmm0,xmm5		; xmm0=tmp1L
-	paddd	xmm7,xmm4		; xmm7=tmp1H
-	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
-	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
-
-	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
-	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
-
-	; -- Final output stage
-
-	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
-	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
-
-	movdqa	xmm0,xmm5
-	movdqa	xmm7,xmm4
-	paddd	xmm5,xmm1		; xmm5=data0L
-	paddd	xmm4,xmm3		; xmm4=data0H
-	psubd	xmm0,xmm1		; xmm0=data7L
-	psubd	xmm7,xmm3		; xmm7=data7H
-
-	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm1=[PD_DESCALE_P2]
-
-	paddd	xmm5,xmm1
-	paddd	xmm4,xmm1
-	psrad	xmm5,DESCALE_P2
-	psrad	xmm4,DESCALE_P2
-	paddd	xmm0,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm0,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
-	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
-	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
-
-	movdqa	xmm4,xmm3
-	movdqa	xmm7,xmm1
-	paddd	xmm3,xmm2		; xmm3=data1L
-	paddd	xmm1,xmm6		; xmm1=data1H
-	psubd	xmm4,xmm2		; xmm4=data6L
-	psubd	xmm7,xmm6		; xmm7=data6H
-
-	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm2=[PD_DESCALE_P2]
-
-	paddd	xmm3,xmm2
-	paddd	xmm1,xmm2
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm4,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm7,DESCALE_P2
-
-	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
-	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
-	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
-	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
-	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
-
-	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	movdqa	xmm4,xmm6
-	movdqa	xmm0,xmm2
-	paddd	xmm6,xmm1		; xmm6=data2L
-	paddd	xmm2,xmm7		; xmm2=data2H
-	psubd	xmm4,xmm1		; xmm4=data5L
-	psubd	xmm0,xmm7		; xmm0=data5H
-
-	movdqa	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm5=[PD_DESCALE_P2]
-
-	paddd	xmm6,xmm5
-	paddd	xmm2,xmm5
-	psrad	xmm6,DESCALE_P2
-	psrad	xmm2,DESCALE_P2
-	paddd	xmm4,xmm5
-	paddd	xmm0,xmm5
-	psrad	xmm4,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
-	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
-	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
-	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
-	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
-
-	movdqa	xmm2,xmm3
-	movdqa	xmm0,xmm1
-	paddd	xmm3,xmm7		; xmm3=data3L
-	paddd	xmm1,xmm5		; xmm1=data3H
-	psubd	xmm2,xmm7		; xmm2=data4L
-	psubd	xmm0,xmm5		; xmm0=data4H
-
-	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm7=[PD_DESCALE_P2]
-
-	paddd	xmm3,xmm7
-	paddd	xmm1,xmm7
-	psrad	xmm3,DESCALE_P2
-	psrad	xmm1,DESCALE_P2
-	paddd	xmm2,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm2,DESCALE_P2
-	psrad	xmm0,DESCALE_P2
-
-	movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm5=[PB_CENTERJSAMP]
-
-	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
-	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-	paddb     xmm7,xmm5
-	paddb     xmm1,xmm5
-	paddb     xmm6,xmm5
-	paddb     xmm3,xmm5
-
-	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
-	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
-	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
-	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
-	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
-	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
-	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
-	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
-	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2red-64.asm b/simd/jiss2red-64.asm
deleted file mode 100644
index 95f893f..0000000
--- a/simd/jiss2red-64.asm
+++ /dev/null
@@ -1,576 +0,0 @@
-;
-; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211	equ	 1730		; FIX(0.211164243)
-F_0_509	equ	 4176		; FIX(0.509795579)
-F_0_601	equ	 4926		; FIX(0.601344887)
-F_0_720	equ	 5906		; FIX(0.720959822)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_850	equ	 6967		; FIX(0.850430095)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_061	equ	 8697		; FIX(1.061594337)
-F_1_272	equ	10426		; FIX(1.272758580)
-F_1_451	equ	11893		; FIX(1.451774981)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_2_172	equ	17799		; FIX(2.172734803)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_624	equ	29692		; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
-F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
-F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
-F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
-F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
-F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_red_sse2) PRIVATE
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
-PW_F256_F089	times 4 dw  F_2_562, F_0_899
-PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	64
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void * dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp	rbp+0
-%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
-
-EXTN(jsimd_idct_4x4_sse2):
-	push	rbp
-	mov	rax,rsp				; rax = original rbp
-	sub	rsp, byte 4
-	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[rsp],rax
-	mov	rbp,rsp				; rbp = aligned rbp
-	lea	rsp, [wk(0)]
-	collect_args
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	por	xmm0,xmm1
-	packsswb xmm0,xmm0
-	packsswb xmm0,xmm0
-	movd	eax,xmm0
-	test	rax,rax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm0,PASS1_BITS
-
-	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-	jmp	near .column_end
-%endif
-.columnDCT:
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm5,xmm0
-	punpcklwd xmm4,xmm1
-	punpckhwd xmm5,xmm1
-	movdqa    xmm0,xmm4
-	movdqa    xmm1,xmm5
-	pmaddwd   xmm4,[rel PW_F256_F089]	; xmm4=(tmp2L)
-	pmaddwd   xmm5,[rel PW_F256_F089]	; xmm5=(tmp2H)
-	pmaddwd   xmm0,[rel PW_F106_MF217]	; xmm0=(tmp0L)
-	pmaddwd   xmm1,[rel PW_F106_MF217]	; xmm1=(tmp0H)
-
-	movdqa    xmm6,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm6,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm2,xmm6
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2L)
-	pmaddwd   xmm7,[rel PW_MF060_MF050]	; xmm7=(tmp2H)
-	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0L)
-	pmaddwd   xmm3,[rel PW_F145_MF021]	; xmm3=(tmp0H)
-
-	paddd	xmm6,xmm4		; xmm6=tmp2L
-	paddd	xmm7,xmm5		; xmm7=tmp2H
-	paddd	xmm2,xmm0		; xmm2=tmp0L
-	paddd	xmm3,xmm1		; xmm3=tmp0H
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	pxor      xmm1,xmm1
-	pxor      xmm2,xmm2
-	punpcklwd xmm1,xmm4		; xmm1=tmp0L
-	punpckhwd xmm2,xmm4		; xmm2=tmp0H
-	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-	movdqa    xmm3,xmm5		; xmm5=in2=z2
-	punpcklwd xmm5,xmm0		; xmm0=in6=z3
-	punpckhwd xmm3,xmm0
-	pmaddwd   xmm5,[rel PW_F184_MF076]	; xmm5=tmp2L
-	pmaddwd   xmm3,[rel PW_F184_MF076]	; xmm3=tmp2H
-
-	movdqa	xmm4,xmm1
-	movdqa	xmm0,xmm2
-	paddd	xmm1,xmm5		; xmm1=tmp10L
-	paddd	xmm2,xmm3		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp12L
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	; -- Final output stage
-
-	movdqa	xmm5,xmm1
-	movdqa	xmm3,xmm2
-	paddd	xmm1,xmm6		; xmm1=data0L
-	paddd	xmm2,xmm7		; xmm2=data0H
-	psubd	xmm5,xmm6		; xmm5=data3L
-	psubd	xmm3,xmm7		; xmm3=data3H
-
-	movdqa	xmm6,[rel PD_DESCALE_P1_4]	; xmm6=[rel PD_DESCALE_P1_4]
-
-	paddd	xmm1,xmm6
-	paddd	xmm2,xmm6
-	psrad	xmm1,DESCALE_P1_4
-	psrad	xmm2,DESCALE_P1_4
-	paddd	xmm5,xmm6
-	paddd	xmm3,xmm6
-	psrad	xmm5,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
-
-	movdqa	xmm2,xmm4
-	movdqa	xmm3,xmm0
-	paddd	xmm4,xmm7		; xmm4=data1L
-	paddd	xmm0,xmm6		; xmm0=data1H
-	psubd	xmm2,xmm7		; xmm2=data2L
-	psubd	xmm3,xmm6		; xmm3=data2H
-
-	movdqa	xmm7,[rel PD_DESCALE_P1_4]	; xmm7=[rel PD_DESCALE_P1_4]
-
-	paddd	xmm4,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm4,DESCALE_P1_4
-	psrad	xmm0,DESCALE_P1_4
-	paddd	xmm2,xmm7
-	paddd	xmm3,xmm7
-	psrad	xmm2,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	rax, [original_rbp]
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	eax, r13d
-
-	; -- Even part
-
-	pxor      xmm4,xmm4
-	punpcklwd xmm4,xmm1		; xmm4=tmp0
-	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-	; -- Odd part
-
-	punpckhwd xmm1,xmm0
-	punpckhwd xmm6,xmm3
-	movdqa    xmm5,xmm1
-	movdqa    xmm2,xmm6
-	pmaddwd   xmm1,[rel PW_F256_F089]	; xmm1=(tmp2)
-	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2)
-	pmaddwd   xmm5,[rel PW_F106_MF217]	; xmm5=(tmp0)
-	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0)
-
-	paddd     xmm6,xmm1		; xmm6=tmp2
-	paddd     xmm2,xmm5		; xmm2=tmp0
-
-	; -- Even part
-
-	punpcklwd xmm0,xmm3
-	pmaddwd   xmm0,[rel PW_F184_MF076]	; xmm0=tmp2
-
-	movdqa    xmm7,xmm4
-	paddd     xmm4,xmm0		; xmm4=tmp10
-	psubd     xmm7,xmm0		; xmm7=tmp12
-
-	; -- Final output stage
-
-	movdqa	xmm1,[rel PD_DESCALE_P2_4]	; xmm1=[rel PD_DESCALE_P2_4]
-
-	movdqa	xmm5,xmm4
-	movdqa	xmm3,xmm7
-	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
-	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
-	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
-	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
-
-	paddd	xmm4,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm4,DESCALE_P2_4
-	psrad	xmm7,DESCALE_P2_4
-	paddd	xmm5,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm5,DESCALE_P2_4
-	psrad	xmm3,DESCALE_P2_4
-
-	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
-	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
-
-	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
-	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
-
-	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-	paddb     xmm4,[rel PB_CENTERJSAMP]
-
-	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-	mov	rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-	uncollect_args
-	mov	rsp,rbp		; rsp <- aligned rbp
-	pop	rsp		; rsp <- original rbp
-	pop	rbp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void * dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-	align	16
-	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
-
-EXTN(jsimd_idct_2x2_sse2):
-	push	rbp
-	mov	rax,rsp
-	mov	rbp,rsp
-	collect_args
-	push	rbx
-
-	; ---- Pass 1: process columns from input.
-
-	mov	rdx, r10	; quantptr
-	mov	rsi, r11		; inptr
-
-	; | input:                  | result:        |
-	; | 00 01 ** 03 ** 05 ** 07 |                |
-	; | 10 11 ** 13 ** 15 ** 17 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-	; | 50 51 ** 53 ** 55 ** 57 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 70 71 ** 73 ** 75 ** 77 |                |
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-	pcmpeqd   xmm7,xmm7
-	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
-	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
-	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
-	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
-	pmaddwd   xmm4,[rel PW_F362_MF127]
-	pmaddwd   xmm5,[rel PW_F085_MF072]
-
-	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
-	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
-	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
-	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
-	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
-	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
-	pmaddwd	xmm0,[rel PW_F362_MF127]
-	pmaddwd	xmm2,[rel PW_F085_MF072]
-
-	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
-	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
-
-	; -- Even part
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
-	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
-	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
-	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-	; -- Final output stage
-
-	movdqa	xmm3,xmm6
-	movdqa	xmm5,xmm1
-	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-	movdqa	xmm2,[rel PD_DESCALE_P1_2]	; xmm2=[rel PD_DESCALE_P1_2]
-
-	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
-
-	movdqa     xmm7,xmm1
-	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
-	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
-
-	paddd	xmm6,xmm2
-	psrad	xmm6,DESCALE_P1_2
-
-	paddd	xmm1,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm1,DESCALE_P1_2
-	psrad	xmm7,DESCALE_P1_2
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	rdi, r12	; (JSAMPROW *)
-	mov	eax, r13d
-
-	; | input:| result:|
-	; | A0 B0 |        |
-	; | A1 B1 | C0 C1  |
-	; | A3 B3 | D0 D1  |
-	; | A5 B5 |        |
-	; | A7 B7 |        |
-
-	; -- Odd part
-
-	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-	pmaddwd   xmm1,[rel PW_F362_MF127]
-	pmaddwd   xmm7,[rel PW_F085_MF072]
-
-	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
-
-	; -- Even part
-
-	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
-
-	; -- Final output stage
-
-	movdqa    xmm4,xmm6
-	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
-
-	paddd     xmm6,[rel PD_DESCALE_P2_2]
-	psrad     xmm6,DESCALE_P2_2
-
-	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-	paddb     xmm6,[rel PB_CENTERJSAMP]
-
-	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
-	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
-
-	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-	mov	WORD [rdx+rax*SIZEOF_JSAMPLE], bx
-	mov	WORD [rsi+rax*SIZEOF_JSAMPLE], cx
-
-	pop	rbx
-	uncollect_args
-	pop	rbp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jiss2red.asm b/simd/jiss2red.asm
deleted file mode 100644
index 0e15ea8..0000000
--- a/simd/jiss2red.asm
+++ /dev/null
@@ -1,594 +0,0 @@
-;
-; jiss2red.asm - reduced-size IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS	13
-%define PASS1_BITS	2
-
-%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211	equ	 1730		; FIX(0.211164243)
-F_0_509	equ	 4176		; FIX(0.509795579)
-F_0_601	equ	 4926		; FIX(0.601344887)
-F_0_720	equ	 5906		; FIX(0.720959822)
-F_0_765	equ	 6270		; FIX(0.765366865)
-F_0_850	equ	 6967		; FIX(0.850430095)
-F_0_899	equ	 7373		; FIX(0.899976223)
-F_1_061	equ	 8697		; FIX(1.061594337)
-F_1_272	equ	10426		; FIX(1.272758580)
-F_1_451	equ	11893		; FIX(1.451774981)
-F_1_847	equ	15137		; FIX(1.847759065)
-F_2_172	equ	17799		; FIX(2.172734803)
-F_2_562	equ	20995		; FIX(2.562915447)
-F_3_624	equ	29692		; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
-F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
-F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
-F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
-F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
-F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
-F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
-F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
-F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
-F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
-F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
-F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
-F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
-F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_red_sse2) PRIVATE
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
-PW_F256_F089	times 4 dw  F_2_562, F_0_899
-PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-
-	align	16
-	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
-
-EXTN(jsimd_idct_4x4_sse2):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [wk(0)]
-	pushpic	ebx
-;	push	ecx		; unused
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	short .columnDCT
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	xmm0,xmm1
-	packsswb xmm0,xmm0
-	packsswb xmm0,xmm0
-	movd	eax,xmm0
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	psllw	xmm0,PASS1_BITS
-
-	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
-	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
-	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
-
-	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-	jmp	near .column_end
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	movdqa    xmm4,xmm0
-	movdqa    xmm5,xmm0
-	punpcklwd xmm4,xmm1
-	punpckhwd xmm5,xmm1
-	movdqa    xmm0,xmm4
-	movdqa    xmm1,xmm5
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]	; xmm4=(tmp2L)
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]	; xmm5=(tmp2H)
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]	; xmm0=(tmp0L)
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]	; xmm1=(tmp0H)
-
-	movdqa    xmm6,xmm2
-	movdqa    xmm7,xmm2
-	punpcklwd xmm6,xmm3
-	punpckhwd xmm7,xmm3
-	movdqa    xmm2,xmm6
-	movdqa    xmm3,xmm7
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2L)
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm7=(tmp2H)
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0L)
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]	; xmm3=(tmp0H)
-
-	paddd	xmm6,xmm4		; xmm6=tmp2L
-	paddd	xmm7,xmm5		; xmm7=tmp2H
-	paddd	xmm2,xmm0		; xmm2=tmp0L
-	paddd	xmm3,xmm1		; xmm3=tmp0H
-
-	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
-	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
-
-	; -- Even part
-
-	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	pxor      xmm1,xmm1
-	pxor      xmm2,xmm2
-	punpcklwd xmm1,xmm4		; xmm1=tmp0L
-	punpckhwd xmm2,xmm4		; xmm2=tmp0H
-	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-	movdqa    xmm3,xmm5		; xmm5=in2=z2
-	punpcklwd xmm5,xmm0		; xmm0=in6=z3
-	punpckhwd xmm3,xmm0
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]	; xmm5=tmp2L
-	pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]	; xmm3=tmp2H
-
-	movdqa	xmm4,xmm1
-	movdqa	xmm0,xmm2
-	paddd	xmm1,xmm5		; xmm1=tmp10L
-	paddd	xmm2,xmm3		; xmm2=tmp10H
-	psubd	xmm4,xmm5		; xmm4=tmp12L
-	psubd	xmm0,xmm3		; xmm0=tmp12H
-
-	; -- Final output stage
-
-	movdqa	xmm5,xmm1
-	movdqa	xmm3,xmm2
-	paddd	xmm1,xmm6		; xmm1=data0L
-	paddd	xmm2,xmm7		; xmm2=data0H
-	psubd	xmm5,xmm6		; xmm5=data3L
-	psubd	xmm3,xmm7		; xmm3=data3H
-
-	movdqa	xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm6=[PD_DESCALE_P1_4]
-
-	paddd	xmm1,xmm6
-	paddd	xmm2,xmm6
-	psrad	xmm1,DESCALE_P1_4
-	psrad	xmm2,DESCALE_P1_4
-	paddd	xmm5,xmm6
-	paddd	xmm3,xmm6
-	psrad	xmm5,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
-	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
-	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
-
-	movdqa	xmm2,xmm4
-	movdqa	xmm3,xmm0
-	paddd	xmm4,xmm7		; xmm4=data1L
-	paddd	xmm0,xmm6		; xmm0=data1H
-	psubd	xmm2,xmm7		; xmm2=data2L
-	psubd	xmm3,xmm6		; xmm3=data2H
-
-	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm7=[PD_DESCALE_P1_4]
-
-	paddd	xmm4,xmm7
-	paddd	xmm0,xmm7
-	psrad	xmm4,DESCALE_P1_4
-	psrad	xmm0,DESCALE_P1_4
-	paddd	xmm2,xmm7
-	paddd	xmm3,xmm7
-	psrad	xmm2,DESCALE_P1_4
-	psrad	xmm3,DESCALE_P1_4
-
-	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
-	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
-	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
-	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
-	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
-	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
-	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
-
-	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
-	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
-	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	eax, [original_ebp]
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-
-	; -- Even part
-
-	pxor      xmm4,xmm4
-	punpcklwd xmm4,xmm1		; xmm4=tmp0
-	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-	; -- Odd part
-
-	punpckhwd xmm1,xmm0
-	punpckhwd xmm6,xmm3
-	movdqa    xmm5,xmm1
-	movdqa    xmm2,xmm6
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]	; xmm1=(tmp2)
-	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2)
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]	; xmm5=(tmp0)
-	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0)
-
-	paddd     xmm6,xmm1		; xmm6=tmp2
-	paddd     xmm2,xmm5		; xmm2=tmp0
-
-	; -- Even part
-
-	punpcklwd xmm0,xmm3
-	pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]	; xmm0=tmp2
-
-	movdqa    xmm7,xmm4
-	paddd     xmm4,xmm0		; xmm4=tmp10
-	psubd     xmm7,xmm0		; xmm7=tmp12
-
-	; -- Final output stage
-
-	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; xmm1=[PD_DESCALE_P2_4]
-
-	movdqa	xmm5,xmm4
-	movdqa	xmm3,xmm7
-	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
-	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
-	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
-	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
-
-	paddd	xmm4,xmm1
-	paddd	xmm7,xmm1
-	psrad	xmm4,DESCALE_P2_4
-	psrad	xmm7,DESCALE_P2_4
-	paddd	xmm5,xmm1
-	paddd	xmm3,xmm1
-	psrad	xmm5,DESCALE_P2_4
-	psrad	xmm3,DESCALE_P2_4
-
-	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
-	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
-
-	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
-	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
-	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
-
-	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
-	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
-	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
-
-	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-	paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; unused
-	poppic	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-	align	16
-	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
-
-EXTN(jsimd_idct_2x2_sse2):
-	push	ebp
-	mov	ebp,esp
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input.
-
-	mov	edx, POINTER [dct_table(ebp)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
-
-	; | input:                  | result:        |
-	; | 00 01 ** 03 ** 05 ** 07 |                |
-	; | 10 11 ** 13 ** 15 ** 17 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-	; | 50 51 ** 53 ** 55 ** 57 |                |
-	; | ** ** ** ** ** ** ** ** |                |
-	; | 70 71 ** 73 ** 75 ** 77 |                |
-
-	; -- Odd part
-
-	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-	pcmpeqd   xmm7,xmm7
-	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
-	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
-	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
-	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
-	pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
-	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
-	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
-	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
-	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
-	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
-	pmaddwd	xmm0,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd	xmm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
-	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
-
-	; -- Even part
-
-	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-	; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
-	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
-	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
-	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-	; -- Final output stage
-
-	movdqa	xmm3,xmm6
-	movdqa	xmm5,xmm1
-	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; xmm2=[PD_DESCALE_P1_2]
-
-	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
-
-	movdqa     xmm7,xmm1
-	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
-	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
-
-	paddd	xmm6,xmm2
-	psrad	xmm6,DESCALE_P1_2
-
-	paddd	xmm1,xmm2
-	paddd	xmm7,xmm2
-	psrad	xmm1,DESCALE_P1_2
-	psrad	xmm7,DESCALE_P1_2
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows, store into output array.
-
-	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(ebp)]
-
-	; | input:| result:|
-	; | A0 B0 |        |
-	; | A1 B1 | C0 C1  |
-	; | A3 B3 | D0 D1  |
-	; | A5 B5 |        |
-	; | A7 B7 |        |
-
-	; -- Odd part
-
-	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-	pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]
-	pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]
-
-	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
-
-	; -- Even part
-
-	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
-
-	; -- Final output stage
-
-	movdqa    xmm4,xmm6
-	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
-
-	paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
-	psrad     xmm6,DESCALE_P2_2
-
-	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-	paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
-	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
-	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jisseflt.asm b/simd/jisseflt.asm
deleted file mode 100644
index 8faa749..0000000
--- a/simd/jisseflt.asm
+++ /dev/null
@@ -1,572 +0,0 @@
-;
-; jisseflt.asm - floating-point IDCT (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-	shufps	%1,%2,0x44
-%endmacro
-
-%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-	shufps	%1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_CONST
-
-	alignz	16
-	global	EXTN(jconst_idct_float_sse) PRIVATE
-
-EXTN(jconst_idct_float_sse):
-
-PD_1_414	times 4 dd  1.414213562373095048801689
-PD_1_847	times 4 dd  1.847759065022573512256366
-PD_1_082	times 4 dd  1.082392200292393968799446
-PD_M2_613	times 4 dd -2.613125929752753055713286
-PD_0_125	times 4 dd  0.125	; 1/8
-PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
-
-	alignz	16
-
-; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)	(b)+8			; void * dct_table
-%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
-%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
-%define output_col(b)	(b)+20		; JDIMENSION output_col
-
-%define original_ebp	ebp+0
-%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
-%define WK_NUM		2
-%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-					; FAST_FLOAT workspace[DCTSIZE2]
-
-	align	16
-	global	EXTN(jsimd_idct_float_sse) PRIVATE
-
-EXTN(jsimd_idct_float_sse):
-	push	ebp
-	mov	eax,esp				; eax = original ebp
-	sub	esp, byte 4
-	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
-	mov	[esp],eax
-	mov	ebp,esp				; ebp = aligned ebp
-	lea	esp, [workspace]
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-	push	esi
-	push	edi
-
-	get_GOT	ebx		; get GOT address
-
-	; ---- Pass 1: process columns from input, store into work array.
-
-;	mov	eax, [original_ebp]
-	mov	edx, POINTER [dct_table(eax)]	; quantptr
-	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
-	lea	edi, [workspace]			; FAST_FLOAT * wsptr
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	jnz	near .columnDCT
-
-	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-	por	mm1,mm0
-	packsswb mm1,mm1
-	movd	eax,mm1
-	test	eax,eax
-	jnz	short .columnDCT
-
-	; -- AC terms all zero
-
-	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
-	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
-	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
-	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
-	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
-	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
-
-	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm1,xmm0
-	movaps	xmm2,xmm0
-	movaps	xmm3,xmm0
-
-	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
-	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
-	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
-	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-	jmp	near .nextcolumn
-	alignx	16,7
-%endif
-.columnDCT:
-
-	; -- Even part
-
-	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
-	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
-	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
-	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
-
-	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
-	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
-	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
-	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
-	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
-	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
-	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
-
-	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
-	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
-	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
-	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
-
-	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
-	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
-	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
-	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
-	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
-	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
-	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
-
-	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
-	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
-	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
-	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
-	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
-	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
-	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
-	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
-
-	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
-	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
-	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
-	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
-	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
-	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
-	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
-	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
-
-	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
-	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
-	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
-	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
-
-	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
-	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
-
-	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
-	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
-	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
-	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
-	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
-	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
-	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
-	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
-
-	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
-	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
-	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
-	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
-	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
-	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
-	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
-	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
-	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
-	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
-	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
-	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
-
-	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
-	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm0,xmm7
-	movaps	xmm3,xmm5
-	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
-	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
-	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
-	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
-
-	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
-	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
-	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
-	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
-	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
-	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
-
-	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
-	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
-	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
-	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
-	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
-	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
-
-	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
-	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
-	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
-	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
-	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
-	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
-	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
-
-	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
-	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
-	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
-	dec	ecx					; ctr
-	jnz	near .columnloop
-
-	; -- Prefetch the next coefficient block
-
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-	; ---- Pass 2: process rows from work array, store into output array.
-
-	mov	eax, [original_ebp]
-	lea	esi, [workspace]			; FAST_FLOAT * wsptr
-	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
-	mov	eax, JDIMENSION [output_col(eax)]
-	mov	ecx, DCTSIZE/4				; ctr
-	alignx	16,7
-.rowloop:
-
-	; -- Even part
-
-	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm0
-	movaps	xmm5,xmm1
-	subps	xmm0,xmm2		; xmm0=tmp11
-	subps	xmm1,xmm3
-	addps	xmm4,xmm2		; xmm4=tmp10
-	addps	xmm5,xmm3		; xmm5=tmp13
-
-	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
-	subps	xmm1,xmm5		; xmm1=tmp12
-
-	movaps	xmm6,xmm4
-	movaps	xmm7,xmm0
-	subps	xmm4,xmm5		; xmm4=tmp3
-	subps	xmm0,xmm1		; xmm0=tmp2
-	addps	xmm6,xmm5		; xmm6=tmp0
-	addps	xmm7,xmm1		; xmm7=tmp1
-
-	movaps	XMMWORD [wk(1)], xmm4	; tmp3
-	movaps	XMMWORD [wk(0)], xmm0	; tmp2
-
-	; -- Odd part
-
-	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-	movaps	xmm4,xmm2
-	movaps	xmm0,xmm5
-	addps	xmm2,xmm1		; xmm2=z11
-	addps	xmm5,xmm3		; xmm5=z13
-	subps	xmm4,xmm1		; xmm4=z12
-	subps	xmm0,xmm3		; xmm0=z10
-
-	movaps	xmm1,xmm2
-	subps	xmm2,xmm5
-	addps	xmm1,xmm5		; xmm1=tmp7
-
-	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
-
-	movaps	xmm3,xmm0
-	addps	xmm0,xmm4
-	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
-	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
-	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
-	addps	xmm3,xmm0		; xmm3=tmp12
-	subps	xmm4,xmm0		; xmm4=tmp10
-
-	; -- Final output stage
-
-	subps	xmm3,xmm1		; xmm3=tmp6
-	movaps	xmm5,xmm6
-	movaps	xmm0,xmm7
-	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
-	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
-	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
-	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
-	subps	xmm2,xmm3		; xmm2=tmp5
-
-	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
-
-	mulps	xmm6,xmm1		; descale(1/8)
-	mulps	xmm7,xmm1		; descale(1/8)
-	mulps	xmm5,xmm1		; descale(1/8)
-	mulps	xmm0,xmm1		; descale(1/8)
-
-	movhlps   xmm3,xmm6
-	movhlps   xmm1,xmm7
-	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
-	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
-	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
-	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
-	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
-	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
-
-	movhlps   xmm6,xmm5
-	movhlps   xmm7,xmm0
-	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
-	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
-	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
-	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
-	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
-	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
-
-	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
-	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
-
-	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
-	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
-
-	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
-
-	addps	xmm4,xmm2		; xmm4=tmp4
-	movaps	xmm5,xmm3
-	movaps	xmm0,xmm1
-	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
-	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
-	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
-	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
-
-	mulps	xmm3,xmm6		; descale(1/8)
-	mulps	xmm1,xmm6		; descale(1/8)
-	mulps	xmm5,xmm6		; descale(1/8)
-	mulps	xmm0,xmm6		; descale(1/8)
-
-	movhlps   xmm7,xmm3
-	movhlps   xmm2,xmm1
-	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
-	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
-	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
-	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
-	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
-	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
-
-	movhlps   xmm4,xmm5
-	movhlps   xmm6,xmm0
-	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
-	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
-	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
-	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
-	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
-	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
-
-	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
-
-	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
-	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
-
-	paddb     mm0,mm6
-	paddb     mm1,mm6
-	paddb     mm2,mm6
-	paddb     mm4,mm6
-
-	movq      mm7,mm0		; transpose coefficients(phase 1)
-	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
-	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
-	movq      mm3,mm2		; transpose coefficients(phase 1)
-	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
-	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
-
-	movq      mm5,mm0		; transpose coefficients(phase 2)
-	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
-	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
-	movq      mm6,mm3		; transpose coefficients(phase 2)
-	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
-	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
-
-	movq      mm1,mm0		; transpose coefficients(phase 3)
-	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
-	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
-	movq      mm4,mm5		; transpose coefficients(phase 3)
-	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
-	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
-
-	pushpic	ebx			; save GOT address
-
-	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-	poppic	ebx			; restore GOT address
-
-	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
-	add	edi, byte 4*SIZEOF_JSAMPROW
-	dec	ecx				; ctr
-	jnz	near .rowloop
-
-	emms		; empty MMX state
-
-	pop	edi
-	pop	esi
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	mov	esp,ebp		; esp <- aligned ebp
-	pop	esp		; esp <- original ebp
-	pop	ebp
-	ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-	align	16
diff --git a/simd/jpeg_nbits_table.inc b/simd/jpeg_nbits_table.inc
new file mode 100644
index 0000000..cbc6990
--- /dev/null
+++ b/simd/jpeg_nbits_table.inc
@@ -0,0 +1,4097 @@
+jpeg_nbits_table db  \
+   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  \
+   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  \
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/simd/jquant-3dn.asm b/simd/jquant-3dn.asm
new file mode 100644
index 0000000..6b7c11c
--- /dev/null
+++ b/simd/jquant-3dn.asm
@@ -0,0 +1,233 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                             FAST_FLOAT *workspace);
+; Two rows of eight samples are converted to floats per loop iteration.
+
+%define sample_data     ebp+8           ; JSAMPARRAY sample_data
+%define start_col       ebp+12          ; JDIMENSION start_col
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        pcmpeqw  mm7,mm7                ; mm7 = all ones
+        psllw    mm7,7                  ; mm7 = {0xFF80 0xFF80 0xFF80 0xFF80}
+        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
+        mov     eax, JDIMENSION [start_col]
+        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
+        mov     ecx, DCTSIZE/2                  ; loop count (2 rows per pass)
+        alignx  16,7
+.convloop:
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; first row
+        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; second row
+
+        psubb   mm0,mm7                         ; mm0=(01234567)
+        psubb   mm1,mm7                         ; mm1=(89ABCDEF)
+
+        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
+        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
+        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
+        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
+
+        punpcklwd mm4,mm2                       ; mm4=(***0***1)
+        punpckhwd mm2,mm2                       ; mm2=(***2***3)
+        punpcklwd mm5,mm0                       ; mm5=(***4***5)
+        punpckhwd mm0,mm0                       ; mm0=(***6***7)
+
+        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(01) ('*' bytes shifted out)
+        psrad   mm2,(DWORD_BIT-BYTE_BIT)        ; mm2=(23)
+        pi2fd   mm4,mm4                         ; packed int32 -> float
+        pi2fd   mm2,mm2                         ; packed int32 -> float
+        psrad   mm5,(DWORD_BIT-BYTE_BIT)        ; mm5=(45)
+        psrad   mm0,(DWORD_BIT-BYTE_BIT)        ; mm0=(67)
+        pi2fd   mm5,mm5                         ; packed int32 -> float
+        pi2fd   mm0,mm0                         ; packed int32 -> float
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4  ; (01)
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2  ; (23)
+        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5  ; (45)
+        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0  ; (67)
+
+        punpcklwd mm6,mm3                       ; mm6=(***8***9)
+        punpckhwd mm3,mm3                       ; mm3=(***A***B)
+        punpcklwd mm4,mm1                       ; mm4=(***C***D)
+        punpckhwd mm1,mm1                       ; mm1=(***E***F)
+
+        psrad   mm6,(DWORD_BIT-BYTE_BIT)        ; mm6=(89)
+        psrad   mm3,(DWORD_BIT-BYTE_BIT)        ; mm3=(AB)
+        pi2fd   mm6,mm6                         ; packed int32 -> float
+        pi2fd   mm3,mm3                         ; packed int32 -> float
+        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(CD)
+        psrad   mm1,(DWORD_BIT-BYTE_BIT)        ; mm1=(EF)
+        pi2fd   mm4,mm4                         ; packed int32 -> float
+        pi2fd   mm1,mm1                         ; packed int32 -> float
+
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6  ; (89)
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3  ; (AB)
+        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4  ; (CD)
+        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1  ; (EF)
+
+        add     esi, byte 2*SIZEOF_JSAMPROW             ; next two row pointers
+        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two output rows
+        dec     ecx
+        jnz     near .convloop
+
+        femms           ; empty MMX/3DNow! state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                             FAST_FLOAT *workspace);
+; Sixteen coefficients are processed per loop iteration.
+
+%define coef_block      ebp+8           ; JCOEFPTR coef_block
+%define divisors        ebp+12          ; FAST_FLOAT *divisors
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov       eax, 0x4B400000       ; (float)0x00C00000 (rndint_magic)
+        movd      mm7,eax
+        punpckldq mm7,mm7               ; mm7={12582912.0F 12582912.0F}
+
+        mov     esi, POINTER [workspace]
+        mov     edx, POINTER [divisors]
+        mov     edi, JCOEFPTR [coef_block]
+        mov     eax, DCTSIZE2/16        ; loop count (16 coefficients per pass)
+        alignx  16,7
+.quantloop:
+        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]  ; * divisors[]
+        pfmul   mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+        pfmul   mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+        pfmul   mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+        pfadd   mm0,mm7                 ; mm0=(00 ** 01 **)
+        pfadd   mm1,mm7                 ; mm1=(02 ** 03 **)
+        pfadd   mm2,mm7                 ; mm2=(04 ** 05 **)
+        pfadd   mm3,mm7                 ; mm3=(06 ** 07 **)
+
+        movq      mm4,mm0
+        punpcklwd mm0,mm1               ; mm0=(00 02 ** **)
+        punpckhwd mm4,mm1               ; mm4=(01 03 ** **)
+        movq      mm5,mm2
+        punpcklwd mm2,mm3               ; mm2=(04 06 ** **)
+        punpckhwd mm5,mm3               ; mm5=(05 07 ** **)
+
+        punpcklwd mm0,mm4               ; mm0=(00 01 02 03)
+        punpcklwd mm2,mm5               ; mm2=(04 05 06 07)
+
+        movq    mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+        pfmul   mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]  ; * divisors[]
+        pfmul   mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+        movq    mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+        movq    mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+        pfmul   mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+        pfmul   mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+        pfadd   mm6,mm7                 ; mm6=(10 ** 11 **)
+        pfadd   mm1,mm7                 ; mm1=(12 ** 13 **)
+        pfadd   mm3,mm7                 ; mm3=(14 ** 15 **)
+        pfadd   mm4,mm7                 ; mm4=(16 ** 17 **)
+
+        movq      mm5,mm6
+        punpcklwd mm6,mm1               ; mm6=(10 12 ** **)
+        punpckhwd mm5,mm1               ; mm5=(11 13 ** **)
+        movq      mm1,mm3
+        punpcklwd mm3,mm4               ; mm3=(14 16 ** **)
+        punpckhwd mm1,mm4               ; mm1=(15 17 ** **)
+
+        punpcklwd mm6,mm5               ; mm6=(10 11 12 13)
+        punpcklwd mm3,mm1               ; mm3=(14 15 16 17)
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+        add     esi, byte 16*SIZEOF_FAST_FLOAT  ; advance workspace
+        add     edx, byte 16*SIZEOF_FAST_FLOAT  ; advance divisors
+        add     edi, byte 16*SIZEOF_JCOEF       ; advance coef_block
+        dec     eax
+        jnz     near .quantloop
+
+        femms           ; empty MMX/3DNow! state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jquant-mmx.asm b/simd/jquant-mmx.asm
new file mode 100644
index 0000000..dbfecee
--- /dev/null
+++ b/simd/jquant-mmx.asm
@@ -0,0 +1,274 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+; Four rows of eight samples are widened to words per loop iteration.
+
+%define sample_data     ebp+8           ; JSAMPARRAY sample_data
+%define start_col       ebp+12          ; JDIMENSION start_col
+%define workspace       ebp+16          ; DCTELEM *workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        pxor    mm6,mm6                 ; mm6=(all 0's)
+        pcmpeqw mm7,mm7                 ; mm7=(all 1's)
+        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
+        mov     eax, JDIMENSION [start_col]
+        mov     edi, POINTER [workspace]        ; (DCTELEM *)
+        mov     ecx, DCTSIZE/4                  ; loop count (4 rows per pass)
+        alignx  16,7
+.convloop:
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm0=(01234567)
+        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm1=(89ABCDEF)
+
+        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm2=(GHIJKLMN)
+        movq    mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm3=(OPQRSTUV)
+
+        movq      mm4,mm0
+        punpcklbw mm0,mm6               ; mm0=(0123)
+        punpckhbw mm4,mm6               ; mm4=(4567)
+        movq      mm5,mm1
+        punpcklbw mm1,mm6               ; mm1=(89AB)
+        punpckhbw mm5,mm6               ; mm5=(CDEF)
+
+        paddw   mm0,mm7                 ; add 0xFF80 to each word (i.e. -128)
+        paddw   mm4,mm7
+        paddw   mm1,mm7
+        paddw   mm5,mm7
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+        movq      mm0,mm2
+        punpcklbw mm2,mm6               ; mm2=(GHIJ)
+        punpckhbw mm0,mm6               ; mm0=(KLMN)
+        movq      mm4,mm3
+        punpcklbw mm3,mm6               ; mm3=(OPQR)
+        punpckhbw mm4,mm6               ; mm4=(STUV)
+
+        paddw   mm2,mm7                 ; add 0xFF80 to each word (i.e. -128)
+        paddw   mm0,mm7
+        paddw   mm3,mm7
+        paddw   mm4,mm7
+
+        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+        add     esi, byte 4*SIZEOF_JSAMPROW             ; next four row pointers
+        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next four output rows
+        dec     ecx
+        jnz     short .convloop
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+; Eight coefficients are processed per inner-loop iteration.
+
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block      ebp+8           ; JCOEFPTR coef_block
+%define divisors        ebp+12          ; DCTELEM *divisors
+%define workspace       ebp+16          ; DCTELEM *workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     esi, POINTER [workspace]
+        mov     edx, POINTER [divisors]
+        mov     edi, JCOEFPTR [coef_block]
+        mov     ah, 2                   ; outer loop count
+        alignx  16,7
+.quantloop1:
+        mov     al, DCTSIZE2/8/2        ; inner loop count
+        alignx  16,7
+.quantloop2:
+        movq    mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+        movq    mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+        movq    mm0,mm2
+        movq    mm1,mm3
+
+        psraw   mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
+        psraw   mm3,(WORD_BIT-1)
+
+        pxor    mm0,mm2   ; if (val < 0) val = -val  (absolute value)
+        pxor    mm1,mm3
+        psubw   mm0,mm2
+        psubw   mm1,mm3
+
+        ;
+        ; MMX is an annoyingly crappy instruction set. It has two
+        ; misfeatures that are causing problems here:
+        ;
+        ; - All multiplications are signed.
+        ;
+        ; - The second operand for the shifts is not treated as packed.
+        ;
+        ;
+        ; We work around the first problem by implementing this algorithm:
+        ;
+        ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+        ; {
+        ;   enum { SHORT_BIT = 16 };
+        ;   signed short sx = (signed short) x;
+        ;   signed short sy = (signed short) y;
+        ;   signed long sz;
+        ;
+        ;   sz = (long) sx * (long) sy;     /* signed multiply */
+        ;
+        ;   if (sx < 0) sz += (long) sy << SHORT_BIT;
+        ;   if (sy < 0) sz += (long) sx << SHORT_BIT;
+        ;
+        ;   return (unsigned long) sz;
+        ; }
+        ;
+        ; (note that a negative sx adds _sy_ and vice versa)
+        ;
+        ; For the second problem, we replace the shift by a multiplication.
+        ; Unfortunately that means we have to deal with the signed issue again.
+        ;
+
+        paddw   mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
+        paddw   mm1, MMWORD [CORRECTION(0,1,edx)]
+
+        movq    mm4,mm0   ; store current value for later
+        movq    mm5,mm1
+        pmulhw  mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
+        pmulhw  mm1, MMWORD [RECIPROCAL(0,1,edx)]
+        paddw   mm0,mm4         ; reciprocal is always negative (MSB=1),
+        paddw   mm1,mm5   ; so we always need to add the initial value
+                        ; (input value is never negative as we
+                        ; inverted it at the start of this routine)
+
+        ; here it gets a bit tricky as both scale
+        ; and mm0/mm1 can be negative
+        movq    mm6, MMWORD [SCALE(0,0,edx)]    ; scale
+        movq    mm7, MMWORD [SCALE(0,1,edx)]
+        movq    mm4,mm0
+        movq    mm5,mm1
+        pmulhw  mm0,mm6
+        pmulhw  mm1,mm7
+
+        psraw   mm6,(WORD_BIT-1)    ; determine if scale is negative
+        psraw   mm7,(WORD_BIT-1)
+
+        pand    mm6,mm4             ; and add input if it is
+        pand    mm7,mm5
+        paddw   mm0,mm6
+        paddw   mm1,mm7
+
+        psraw   mm4,(WORD_BIT-1)    ; then check if negative input
+        psraw   mm5,(WORD_BIT-1)
+
+        pand    mm4, MMWORD [SCALE(0,0,edx)]    ; and add scale if it is
+        pand    mm5, MMWORD [SCALE(0,1,edx)]
+        paddw   mm0,mm4
+        paddw   mm1,mm5
+
+        pxor    mm0,mm2   ; restore the sign saved in mm2/mm3
+        pxor    mm1,mm3
+        psubw   mm0,mm2
+        psubw   mm1,mm3
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+        add     esi, byte 8*SIZEOF_DCTELEM      ; advance workspace
+        add     edx, byte 8*SIZEOF_DCTELEM      ; advance divisors
+        add     edi, byte 8*SIZEOF_JCOEF        ; NOTE(review): stores above scale edi by SIZEOF_DCTELEM; presumably equal, confirm
+        dec     al
+        jnz     near .quantloop2
+        dec     ah
+        jnz     near .quantloop1        ; to avoid branch misprediction
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jquant-sse.asm b/simd/jquant-sse.asm
new file mode 100644
index 0000000..796723a
--- /dev/null
+++ b/simd/jquant-sse.asm
@@ -0,0 +1,211 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+; Two rows of eight samples are converted to floats per loop iteration.
+
+%define sample_data     ebp+8           ; JSAMPARRAY sample_data
+%define start_col       ebp+12          ; JDIMENSION start_col
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        pcmpeqw  mm7,mm7                ; mm7 = all ones
+        psllw    mm7,7                  ; mm7 = {0xFF80 0xFF80 0xFF80 0xFF80}
+        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
+        mov     eax, JDIMENSION [start_col]
+        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
+        mov     ecx, DCTSIZE/2                  ; loop count (2 rows per pass)
+        alignx  16,7
+.convloop:
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; first row
+        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; second row
+
+        psubb   mm0,mm7                         ; mm0=(01234567)
+        psubb   mm1,mm7                         ; mm1=(89ABCDEF)
+
+        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
+        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
+        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
+        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
+
+        punpcklwd mm4,mm2                       ; mm4=(***0***1)
+        punpckhwd mm2,mm2                       ; mm2=(***2***3)
+        punpcklwd mm5,mm0                       ; mm5=(***4***5)
+        punpckhwd mm0,mm0                       ; mm0=(***6***7)
+
+        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01) ('*' bytes shifted out)
+        psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
+        cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
+        cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
+        psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
+        psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
+        cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
+        cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)
+
+        punpcklwd mm6,mm3                       ; mm6=(***8***9)
+        punpckhwd mm3,mm3                       ; mm3=(***A***B)
+        punpcklwd mm4,mm1                       ; mm4=(***C***D)
+        punpckhwd mm1,mm1                       ; mm1=(***E***F)
+
+        psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
+        psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
+        cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
+        cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
+        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
+        psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
+        cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
+        cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)
+
+        movlhps   xmm0,xmm1                     ; xmm0=(0123)
+        movlhps   xmm2,xmm3                     ; xmm2=(4567)
+        movlhps   xmm4,xmm5                     ; xmm4=(89AB)
+        movlhps   xmm6,xmm7                     ; xmm6=(CDEF)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0  ; aligned store
+        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+        add     esi, byte 2*SIZEOF_JSAMPROW             ; next two row pointers
+        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two output rows
+        dec     ecx
+        jnz     near .convloop
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+; Sixteen coefficients are processed per loop iteration.
+
+%define coef_block      ebp+8           ; JCOEFPTR coef_block
+%define divisors        ebp+12          ; FAST_FLOAT *divisors
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     esi, POINTER [workspace]
+        mov     edx, POINTER [divisors]
+        mov     edi, JCOEFPTR [coef_block]
+        mov     eax, DCTSIZE2/16        ; loop count (16 coefficients per pass)
+        alignx  16,7
+.quantloop:
+        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]  ; * divisors[]
+        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+        movhlps  xmm4,xmm0              ; upper two floats of xmm0 -> low half
+        movhlps  xmm5,xmm1              ; upper two floats of xmm1 -> low half
+
+        cvtps2pi mm0,xmm0               ; two low floats -> two int32
+        cvtps2pi mm1,xmm1
+        cvtps2pi mm4,xmm4
+        cvtps2pi mm5,xmm5
+
+        movhlps  xmm6,xmm2              ; upper two floats of xmm2 -> low half
+        movhlps  xmm7,xmm3              ; upper two floats of xmm3 -> low half
+
+        cvtps2pi mm2,xmm2               ; two low floats -> two int32
+        cvtps2pi mm3,xmm3
+        cvtps2pi mm6,xmm6
+        cvtps2pi mm7,xmm7
+
+        packssdw mm0,mm4                ; 4 x int32 -> 4 x int16 (signed saturate)
+        packssdw mm1,mm5
+        packssdw mm2,mm6
+        packssdw mm3,mm7
+
+        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+        add     esi, byte 16*SIZEOF_FAST_FLOAT  ; advance workspace
+        add     edx, byte 16*SIZEOF_FAST_FLOAT  ; advance divisors
+        add     edi, byte 16*SIZEOF_JCOEF       ; advance coef_block
+        dec     eax
+        jnz     short .quantloop
+
+        emms            ; empty MMX state
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jquantf-sse2-64.asm b/simd/jquantf-sse2-64.asm
new file mode 100644
index 0000000..8af256c
--- /dev/null
+++ b/simd/jquantf-sse2-64.asm
@@ -0,0 +1,158 @@
+;
+; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT *workspace);
+; Two rows of eight samples are converted to floats per loop iteration.
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+        push    rbp
+        mov     rax,rsp         ; NOTE(review): presumably read by collect_args
+        mov     rbp,rsp
+        collect_args
+        push    rbx
+
+        pcmpeqw  xmm7,xmm7              ; xmm7 = all ones
+        psllw    xmm7,7                 ; each word of xmm7 = 0xFF80
+        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+        mov rsi, r10                    ; rsi = sample_data
+        mov     eax, r11d               ; 32-bit move zero-extends into rax
+        mov rdi, r12                    ; rdi = workspace
+        mov     rcx, DCTSIZE/2          ; loop count (2 rows per pass)
+.convloop:
+        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *)
+
+        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; first row
+        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; second row
+
+        psubb   xmm0,xmm7                       ; xmm0=(01234567)
+        psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
+
+        punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
+        punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
+
+        punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
+        punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
+        punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
+        punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
+
+        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
+        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
+        cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
+        cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
+        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
+        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
+        cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
+        cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
+
+        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2  ; aligned store
+        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+        add     rsi, byte 2*SIZEOF_JSAMPROW             ; next two row pointers
+        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two output rows
+        dec     rcx
+        jnz     short .convloop
+
+        pop     rbx
+        uncollect_args
+        pop     rbp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
+; Sixteen coefficients are processed per loop iteration.
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+        push    rbp
+        mov     rax,rsp         ; NOTE(review): presumably read by collect_args
+        mov     rbp,rsp
+        collect_args
+
+        mov rsi, r12                    ; rsi = workspace
+        mov rdx, r11                    ; rdx = divisors
+        mov rdi, r10                    ; rdi = coef_block
+        mov     rax, DCTSIZE2/16        ; loop count (16 coefficients per pass)
+.quantloop:
+        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]  ; * divisors[]
+        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+        cvtps2dq xmm0,xmm0              ; four floats -> four int32
+        cvtps2dq xmm1,xmm1
+        cvtps2dq xmm2,xmm2
+        cvtps2dq xmm3,xmm3
+
+        packssdw xmm0,xmm1              ; 8 x int32 -> 8 x int16 (signed saturate)
+        packssdw xmm2,xmm3
+
+        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0  ; aligned store
+        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+        add     rsi, byte 16*SIZEOF_FAST_FLOAT  ; advance workspace
+        add     rdx, byte 16*SIZEOF_FAST_FLOAT  ; advance divisors
+        add     rdi, byte 16*SIZEOF_JCOEF       ; advance coef_block
+        dec     rax
+        jnz     short .quantloop
+
+        uncollect_args
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jquantf-sse2.asm b/simd/jquantf-sse2.asm
new file mode 100644
index 0000000..a8d4cd3
--- /dev/null
+++ b/simd/jquantf-sse2.asm
@@ -0,0 +1,171 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT *workspace);
+;
+
+%define sample_data     ebp+8           ; JSAMPARRAY sample_data
+%define start_col       ebp+12          ; JDIMENSION start_col
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        pcmpeqw  xmm7,xmm7              ; every word = 0xFFFF
+        psllw    xmm7,7                 ; every word = 0xFF80
+        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
+        mov     eax, JDIMENSION [start_col]
+        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
+        mov     ecx, DCTSIZE/2                  ; two sample rows per pass
+        alignx  16,7
+.convloop:
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+        ; subtract 0x80 from every byte (mod 256): unsigned sample -> signed byte
+        psubb   xmm0,xmm7                       ; xmm0=(01234567)
+        psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
+
+        punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
+        punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
+        ; put each sample in the top byte of a dword; lower bytes are junk ('*')
+        punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
+        punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
+        punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
+        punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
+        ; arithmetic shift drops the junk and sign-extends; then int32 -> float
+        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
+        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
+        cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
+        cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
+        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
+        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
+        cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
+        cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
+        ; store two rows of 8 floats (aligned stores)
+        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+        add     esi, byte 2*SIZEOF_JSAMPROW
+        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+        dec     ecx
+        jnz     short .convloop
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
+;
+
+%define coef_block      ebp+8           ; JCOEFPTR coef_block
+%define divisors        ebp+12          ; FAST_FLOAT *divisors
+%define workspace       ebp+16          ; FAST_FLOAT *workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     esi, POINTER [workspace]        ; source floats
+        mov     edx, POINTER [divisors]         ; applied with mulps
+        mov     edi, JCOEFPTR [coef_block]      ; destination
+        mov     eax, DCTSIZE2/16                ; 16 coefficients per pass
+        alignx  16,7
+.quantloop:                                     ; xmm0-xmm3 = workspace * divisors
+        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+        ; float -> int32; cvtps2dq rounds per MXCSR (it is not the truncating form)
+        cvtps2dq xmm0,xmm0
+        cvtps2dq xmm1,xmm1
+        cvtps2dq xmm2,xmm2
+        cvtps2dq xmm3,xmm3
+        ; int32 -> int16 with signed saturation
+        packssdw xmm0,xmm1
+        packssdw xmm2,xmm3
+        ; store the 16 results; movdqa requires a 16-byte-aligned coef_block
+        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+        ; advance all three pointers by 16 elements
+        add     esi, byte 16*SIZEOF_FAST_FLOAT
+        add     edx, byte 16*SIZEOF_FAST_FLOAT
+        add     edi, byte 16*SIZEOF_JCOEF
+        dec     eax
+        jnz     short .quantloop
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jquanti-altivec.c b/simd/jquanti-altivec.c
new file mode 100644
index 0000000..b3adab9
--- /dev/null
+++ b/simd/jquanti-altivec.c
@@ -0,0 +1,252 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
+ * always get the data we want by using a single vector load (although we may
+ * have to permute the result.)
+ */
+#if __BIG_ENDIAN__
+/* Uses the caller's sample_data, start_col, elemptr and in<row> variables. */
+#define LOAD_ROW(row) {  \
+  elemptr = sample_data[row] + start_col;  \
+  in##row = vec_ld(0, elemptr);  \
+  if ((size_t)elemptr & 15)  \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
+}
+
+#else  /* !__BIG_ENDIAN__ */
+/* Same caller-side variables, but one vec_vsx_ld() and no permute fix-up. */
+#define LOAD_ROW(row) {  \
+  elemptr = sample_data[row] + start_col;  \
+  in##row = vec_vsx_ld(0, elemptr);  \
+}
+
+#endif  /* __BIG_ENDIAN__ */
+
+/* Load an 8x8 sample block, subtract CENTERJSAMPLE, store as 16-bit words. */
+void
+jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
+                        DCTELEM *workspace)
+{
+  JSAMPROW elemptr;  /* scratch pointer assigned inside LOAD_ROW() */
+
+  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants (pb_zero: presumably used inside VEC_UNPACKHU() -- confirm) */
+  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
+  __vector unsigned char pb_zero = { __16X(0) };
+  /* Fetch rows 0-7 of sample_data, each starting at column start_col. */
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+  /* Widen to 16 bits; VEC_UNPACKHU presumably zero-extends 8 samples. */
+  out0 = (__vector short)VEC_UNPACKHU(in0);
+  out1 = (__vector short)VEC_UNPACKHU(in1);
+  out2 = (__vector short)VEC_UNPACKHU(in2);
+  out3 = (__vector short)VEC_UNPACKHU(in3);
+  out4 = (__vector short)VEC_UNPACKHU(in4);
+  out5 = (__vector short)VEC_UNPACKHU(in5);
+  out6 = (__vector short)VEC_UNPACKHU(in6);
+  out7 = (__vector short)VEC_UNPACKHU(in7);
+  /* Re-center: subtract CENTERJSAMPLE from every element. */
+  out0 = vec_sub(out0, pw_centerjsamp);
+  out1 = vec_sub(out1, pw_centerjsamp);
+  out2 = vec_sub(out2, pw_centerjsamp);
+  out3 = vec_sub(out3, pw_centerjsamp);
+  out4 = vec_sub(out4, pw_centerjsamp);
+  out5 = vec_sub(out5, pw_centerjsamp);
+  out6 = vec_sub(out6, pw_centerjsamp);
+  out7 = vec_sub(out7, pw_centerjsamp);
+  /* One 16-byte row per vec_st(); the middle argument is a byte offset. */
+  vec_st(out0, 0, workspace);
+  vec_st(out1, 16, workspace);
+  vec_st(out2, 32, workspace);
+  vec_st(out3, 48, workspace);
+  vec_st(out4, 64, workspace);
+  vec_st(out5, 80, workspace);
+  vec_st(out6, 96, workspace);
+  vec_st(out7, 112, workspace);
+}
+
+
+#define WORD_BIT 16  /* WORD_BIT - 1 is the shift count used for sign masks */
+
+/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
+   We basically need an unsigned equivalent of vec_madds().  The macro needs
+   tmpe, tmpo and shift_pack_index to exist in the caller's scope. */
+#define MULTIPLY(vs0, vs1, out) {  \
+  tmpe = vec_mule((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  tmpo = vec_mulo((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
+                                 (__vector unsigned short)tmpo,  \
+                                 shift_pack_index);  \
+}
+
+void
+jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+  __vector unsigned int tmpe, tmpo;  /* scratch written by MULTIPLY() */
+  /* out = sign(x) * hi16(hi16((|x| + corr) * recip) * scale), per element */
+  /* Constants; shift_pack_index picks the upper 16 bits of each product */
+  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
+#if __BIG_ENDIAN__
+  __vector unsigned char shift_pack_index =
+    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+#else
+  __vector unsigned char shift_pack_index =
+    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+#endif
+  /* Load the eight 16-byte rows of the workspace (byte offsets). */
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value: rowNs = sign mask s, |x| = (x ^ s) - s */
+  row0s = vec_sra(row0, pw_word_bit_m1);
+  row1s = vec_sra(row1, pw_word_bit_m1);
+  row2s = vec_sra(row2, pw_word_bit_m1);
+  row3s = vec_sra(row3, pw_word_bit_m1);
+  row4s = vec_sra(row4, pw_word_bit_m1);
+  row5s = vec_sra(row5, pw_word_bit_m1);
+  row6s = vec_sra(row6, pw_word_bit_m1);
+  row7s = vec_sra(row7, pw_word_bit_m1);
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+  /* Correction terms live at byte offset DCTSIZE2 * 2 within divisors. */
+  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+  row0 = vec_add(row0, corr0);
+  row1 = vec_add(row1, corr1);
+  row2 = vec_add(row2, corr2);
+  row3 = vec_add(row3, corr3);
+  row4 = vec_add(row4, corr4);
+  row5 = vec_add(row5, corr5);
+  row6 = vec_add(row6, corr6);
+  row7 = vec_add(row7, corr7);
+  /* Reciprocals are read from the start of divisors. */
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+  /* Scale factors live at byte offset DCTSIZE2 * 4 within divisors. */
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+  /* Reapply the saved sign masks: (v ^ s) - s negates v where s is all ones. */
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+  /* Write the eight quantized rows to coef_block. */
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}
diff --git a/simd/jquanti-sse2-64.asm b/simd/jquanti-sse2-64.asm
new file mode 100644
index 0000000..9b3f4ee
--- /dev/null
+++ b/simd/jquanti-sse2-64.asm
@@ -0,0 +1,187 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+        push    rbp
+        mov     rax,rsp                 ; NOTE(review): presumably for collect_args -- confirm
+        mov     rbp,rsp
+        collect_args                    ; macro defined outside this file
+        push    rbx
+
+        pxor    xmm6,xmm6               ; xmm6=(all 0's)
+        pcmpeqw xmm7,xmm7
+        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+        mov rsi, r10                    ; rsi = sample_data
+        mov eax, r11d                   ; start_col; 32-bit mov zero-extends into rax
+        mov rdi, r12                    ; rdi = workspace
+        mov     rcx, DCTSIZE/4          ; four sample rows per pass
+.convloop:
+        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *)
+
+        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
+        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
+
+        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
+        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
+        ; zero-extend bytes to words, then add 0xFF80 (i.e. subtract 128)
+        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
+        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
+        paddw     xmm0,xmm7
+        paddw     xmm1,xmm7
+        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
+        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
+        paddw     xmm2,xmm7
+        paddw     xmm3,xmm7
+        ; store four rows of 8 words (aligned stores)
+        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+        add     rsi, byte 4*SIZEOF_JSAMPROW
+        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+        dec     rcx
+        jnz     short .convloop
+
+        pop     rbx
+        uncollect_args
+        pop     rbp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+; (the three tables sit DCTSIZE XMMBLOCK rows apart from the same base b)
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+        push    rbp
+        mov     rax,rsp                 ; NOTE(review): presumably for collect_args -- confirm
+        mov     rbp,rsp
+        collect_args                    ; macro defined outside this file
+
+        mov rsi, r12                    ; rsi = workspace (source)
+        mov rdx, r11                    ; rdx = divisors (table base)
+        mov rdi, r10                    ; rdi = coef_block (destination)
+        mov     rax, DCTSIZE2/32        ; 32 coefficients are handled per pass
+.quantloop:
+        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+        movdqa  xmm0,xmm4
+        movdqa  xmm1,xmm5
+        movdqa  xmm2,xmm6
+        movdqa  xmm3,xmm7
+        psraw   xmm4,(WORD_BIT-1)       ; xmm4-xmm7 = per-word sign masks
+        psraw   xmm5,(WORD_BIT-1)
+        psraw   xmm6,(WORD_BIT-1)
+        psraw   xmm7,(WORD_BIT-1)
+        pxor    xmm0,xmm4
+        pxor    xmm1,xmm5
+        pxor    xmm2,xmm6
+        pxor    xmm3,xmm7
+        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
+        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
+        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
+        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
+        ; pmulhuw keeps the high word of each unsigned 16x16 product
+        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]  ; scale
+        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
+        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
+        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
+        ; reapply the saved sign masks: (v ^ s) - s
+        pxor    xmm0,xmm4
+        pxor    xmm1,xmm5
+        pxor    xmm2,xmm6
+        pxor    xmm3,xmm7
+        psubw   xmm0,xmm4
+        psubw   xmm1,xmm5
+        psubw   xmm2,xmm6
+        psubw   xmm3,xmm7
+        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+        add     rsi, byte 32*SIZEOF_DCTELEM
+        add     rdx, byte 32*SIZEOF_DCTELEM     ; shifts all three table views
+        add     rdi, byte 32*SIZEOF_JCOEF
+        dec     rax
+        jnz     near .quantloop
+
+        uncollect_args
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jquanti-sse2.asm b/simd/jquanti-sse2.asm
new file mode 100644
index 0000000..4299c33
--- /dev/null
+++ b/simd/jquanti-sse2.asm
@@ -0,0 +1,200 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM *workspace);
+;
+
+%define sample_data     ebp+8           ; JSAMPARRAY sample_data
+%define start_col       ebp+12          ; JDIMENSION start_col
+%define workspace       ebp+16          ; DCTELEM *workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        pxor    xmm6,xmm6               ; xmm6=(all 0's)
+        pcmpeqw xmm7,xmm7
+        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
+        mov     eax, JDIMENSION [start_col]
+        mov     edi, POINTER [workspace]        ; (DCTELEM *)
+        mov     ecx, DCTSIZE/4                  ; four sample rows per pass
+        alignx  16,7
+.convloop:
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
+        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
+
+        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
+        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
+        ; zero-extend bytes to words, then add 0xFF80 (i.e. subtract 128)
+        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
+        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
+        paddw     xmm0,xmm7
+        paddw     xmm1,xmm7
+        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
+        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
+        paddw     xmm2,xmm7
+        paddw     xmm3,xmm7
+        ; store four rows of 8 words (aligned stores)
+        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+        add     esi, byte 4*SIZEOF_JSAMPROW
+        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+        dec     ecx
+        jnz     short .convloop
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+; (the three tables sit DCTSIZE XMMBLOCK rows apart from the same base b)
+%define coef_block      ebp+8           ; JCOEFPTR coef_block
+%define divisors        ebp+12          ; DCTELEM *divisors
+%define workspace       ebp+16          ; DCTELEM *workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     esi, POINTER [workspace]        ; source
+        mov     edx, POINTER [divisors]         ; table base
+        mov     edi, JCOEFPTR [coef_block]      ; destination
+        mov     eax, DCTSIZE2/32                ; 32 coefficients per pass
+        alignx  16,7
+.quantloop:
+        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm0,xmm4
+        movdqa  xmm1,xmm5
+        movdqa  xmm2,xmm6
+        movdqa  xmm3,xmm7
+        psraw   xmm4,(WORD_BIT-1)       ; xmm4-xmm7 = per-word sign masks
+        psraw   xmm5,(WORD_BIT-1)
+        psraw   xmm6,(WORD_BIT-1)
+        psraw   xmm7,(WORD_BIT-1)
+        pxor    xmm0,xmm4
+        pxor    xmm1,xmm5
+        pxor    xmm2,xmm6
+        pxor    xmm3,xmm7
+        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
+        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
+        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
+        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
+        ; pmulhuw keeps the high word of each unsigned 16x16 product
+        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
+        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
+        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
+        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]  ; scale
+        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
+        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
+        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
+        ; reapply the saved sign masks: (v ^ s) - s
+        pxor    xmm0,xmm4
+        pxor    xmm1,xmm5
+        pxor    xmm2,xmm6
+        pxor    xmm3,xmm7
+        psubw   xmm0,xmm4
+        psubw   xmm1,xmm5
+        psubw   xmm2,xmm6
+        psubw   xmm3,xmm7
+        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+        add     esi, byte 32*SIZEOF_DCTELEM
+        add     edx, byte 32*SIZEOF_DCTELEM     ; shifts all three table views
+        add     edi, byte 32*SIZEOF_JCOEF
+        dec     eax
+        jnz     near .quantloop
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
diff --git a/simd/jsimd.h b/simd/jsimd.h
index ba56d45..a39fafa 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,9 +2,11 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2011 D. R. Commander
+ * Copyright (C) 2011, 2014-2016 D. R. Commander
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California
  * Copyright (C) 2014 Linaro Limited
- * 
+ * Copyright (C) 2015-2016 Matthieu Darbois
+ *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
@@ -19,656 +21,851 @@
 #define JSIMD_SSE        0x04
 #define JSIMD_SSE2       0x08
 #define JSIMD_ARM_NEON   0x10
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_simd_cpu_support                 jSiCpuSupport
-#define jsimd_rgb_ycc_convert_mmx             jSRGBYCCM
-#define jsimd_extrgb_ycc_convert_mmx          jSEXTRGBYCCM
-#define jsimd_extrgbx_ycc_convert_mmx         jSEXTRGBXYCCM
-#define jsimd_extbgr_ycc_convert_mmx          jSEXTBGRYCCM
-#define jsimd_extbgrx_ycc_convert_mmx         jSEXTBGRXYCCM
-#define jsimd_extxbgr_ycc_convert_mmx         jSEXTXBGRYCCM
-#define jsimd_extxrgb_ycc_convert_mmx         jSEXTXRGBYCCM
-#define jsimd_rgb_gray_convert_mmx            jSRGBGRYM
-#define jsimd_extrgb_gray_convert_mmx         jSEXTRGBGRYM
-#define jsimd_extrgbx_gray_convert_mmx        jSEXTRGBXGRYM
-#define jsimd_extbgr_gray_convert_mmx         jSEXTBGRGRYM
-#define jsimd_extbgrx_gray_convert_mmx        jSEXTBGRXGRYM
-#define jsimd_extxbgr_gray_convert_mmx        jSEXTXBGRGRYM
-#define jsimd_extxrgb_gray_convert_mmx        jSEXTXRGBGRYM
-#define jsimd_ycc_rgb_convert_mmx             jSYCCRGBM
-#define jsimd_ycc_extrgb_convert_mmx          jSYCCEXTRGBM
-#define jsimd_ycc_extrgbx_convert_mmx         jSYCCEXTRGBXM
-#define jsimd_ycc_extbgr_convert_mmx          jSYCCEXTBGRM
-#define jsimd_ycc_extbgrx_convert_mmx         jSYCCEXTBGRXM
-#define jsimd_ycc_extxbgr_convert_mmx         jSYCCEXTXBGRM
-#define jsimd_ycc_extxrgb_convert_mmx         jSYCCEXTXRGBM
-#define jconst_rgb_ycc_convert_sse2           jSCRGBYCCS2
-#define jsimd_rgb_ycc_convert_sse2            jSRGBYCCS2
-#define jsimd_extrgb_ycc_convert_sse2         jSEXTRGBYCCS2
-#define jsimd_extrgbx_ycc_convert_sse2        jSEXTRGBXYCCS2
-#define jsimd_extbgr_ycc_convert_sse2         jSEXTBGRYCCS2
-#define jsimd_extbgrx_ycc_convert_sse2        jSEXTBGRXYCCS2
-#define jsimd_extxbgr_ycc_convert_sse2        jSEXTXBGRYCCS2
-#define jsimd_extxrgb_ycc_convert_sse2        jSEXTXRGBYCCS2
-#define jconst_rgb_gray_convert_sse2          jSCRGBGRYS2
-#define jsimd_rgb_gray_convert_sse2           jSRGBGRYS2
-#define jsimd_extrgb_gray_convert_sse2        jSEXTRGBGRYS2
-#define jsimd_extrgbx_gray_convert_sse2       jSEXTRGBXGRYS2
-#define jsimd_extbgr_gray_convert_sse2        jSEXTBGRGRYS2
-#define jsimd_extbgrx_gray_convert_sse2       jSEXTBGRXGRYS2
-#define jsimd_extxbgr_gray_convert_sse2       jSEXTXBGRGRYS2
-#define jsimd_extxrgb_gray_convert_sse2       jSEXTXRGBGRYS2
-#define jconst_ycc_rgb_convert_sse2           jSCYCCRGBS2
-#define jsimd_ycc_rgb_convert_sse2            jSYCCRGBS2
-#define jsimd_ycc_extrgb_convert_sse2         jSYCCEXTRGBS2
-#define jsimd_ycc_extrgbx_convert_sse2        jSYCCEXTRGBXS2
-#define jsimd_ycc_extbgr_convert_sse2         jSYCCEXTBGRS2
-#define jsimd_ycc_extbgrx_convert_sse2        jSYCCEXTBGRXS2
-#define jsimd_ycc_extxbgr_convert_sse2        jSYCCEXTXBGRS2
-#define jsimd_ycc_extxrgb_convert_sse2        jSYCCEXTXRGBS2
-#define jsimd_h2v2_downsample_mmx             jSDnH2V2M
-#define jsimd_h2v1_downsample_mmx             jSDnH2V1M
-#define jsimd_h2v2_downsample_sse2            jSDnH2V2S2
-#define jsimd_h2v1_downsample_sse2            jSDnH2V1S2
-#define jsimd_h2v2_upsample_mmx               jSUpH2V2M
-#define jsimd_h2v1_upsample_mmx               jSUpH2V1M
-#define jsimd_h2v2_fancy_upsample_mmx         jSFUpH2V2M
-#define jsimd_h2v1_fancy_upsample_mmx         jSFUpH2V1M
-#define jsimd_h2v2_merged_upsample_mmx        jSMUpH2V2M
-#define jsimd_h2v2_extrgb_merged_upsample_mmx jSMUpH2V2EXTRGBM
-#define jsimd_h2v2_extrgbx_merged_upsample_mmx jSMUpH2V2EXTRGBXM
-#define jsimd_h2v2_extbgr_merged_upsample_mmx jSMUpH2V2EXTBGRM
-#define jsimd_h2v2_extbgrx_merged_upsample_mmx jSMUpH2V2EXTBGRXM
-#define jsimd_h2v2_extxbgr_merged_upsample_mmx jSMUpH2V2EXTXBGRM
-#define jsimd_h2v2_extxrgb_merged_upsample_mmx jSMUpH2V2EXTXRGBM
-#define jsimd_h2v1_merged_upsample_mmx        jSMUpH2V1M
-#define jsimd_h2v1_extrgb_merged_upsample_mmx jSMUpH2V1EXTRGBM
-#define jsimd_h2v1_extrgbx_merged_upsample_mmx jSMUpH2V1EXTRGBXM
-#define jsimd_h2v1_extbgr_merged_upsample_mmx jSMUpH2V1EXTBGRM
-#define jsimd_h2v1_extbgrx_merged_upsample_mmx jSMUpH2V1EXTBGRXM
-#define jsimd_h2v1_extxbgr_merged_upsample_mmx jSMUpH2V1EXTXBGRM
-#define jsimd_h2v1_extxrgb_merged_upsample_mmx jSMUpH2V1EXTXRGBM
-#define jsimd_h2v2_upsample_sse2              jSUpH2V2S2
-#define jsimd_h2v1_upsample_sse2              jSUpH2V1S2
-#define jconst_fancy_upsample_sse2            jSCFUpS2
-#define jsimd_h2v2_fancy_upsample_sse2        jSFUpH2V2S2
-#define jsimd_h2v1_fancy_upsample_sse2        jSFUpH2V1S2
-#define jconst_merged_upsample_sse2           jSCMUpS2
-#define jsimd_h2v2_merged_upsample_sse2       jSMUpH2V2S2
-#define jsimd_h2v2_extrgb_merged_upsample_sse2 jSMUpH2V2EXTRGBS2
-#define jsimd_h2v2_extrgbx_merged_upsample_sse2 jSMUpH2V2EXTRGBXS2
-#define jsimd_h2v2_extbgr_merged_upsample_sse2 jSMUpH2V2EXTBGRS2
-#define jsimd_h2v2_extbgrx_merged_upsample_sse2 jSMUpH2V2EXTBGRXS2
-#define jsimd_h2v2_extxbgr_merged_upsample_sse2 jSMUpH2V2EXTXBGRS2
-#define jsimd_h2v2_extxrgb_merged_upsample_sse2 jSMUpH2V2EXTXRGBS2
-#define jsimd_h2v1_merged_upsample_sse2       jSMUpH2V1S2
-#define jsimd_h2v1_extrgb_merged_upsample_sse2 jSMUpH2V1EXTRGBS2
-#define jsimd_h2v1_extrgbx_merged_upsample_sse2 jSMUpH2V1EXTRGBXS2
-#define jsimd_h2v1_extbgr_merged_upsample_sse2 jSMUpH2V1EXTBGRS2
-#define jsimd_h2v1_extbgrx_merged_upsample_sse2 jSMUpH2V1EXTBGRXS2
-#define jsimd_h2v1_extxbgr_merged_upsample_sse2 jSMUpH2V1EXTXBGRS2
-#define jsimd_h2v1_extxrgb_merged_upsample_sse2 jSMUpH2V1EXTXRGBS2
-#define jsimd_convsamp_mmx                    jSConvM
-#define jsimd_convsamp_sse2                   jSConvS2
-#define jsimd_convsamp_float_3dnow            jSConvF3D
-#define jsimd_convsamp_float_sse              jSConvFS
-#define jsimd_convsamp_float_sse2             jSConvFS2
-#define jsimd_fdct_islow_mmx                  jSFDMIS
-#define jsimd_fdct_ifast_mmx                  jSFDMIF
-#define jconst_fdct_islow_sse2                jSCFDS2IS
-#define jsimd_fdct_islow_sse2                 jSFDS2IS
-#define jconst_fdct_ifast_sse2                jSCFDS2IF
-#define jsimd_fdct_ifast_sse2                 jSFDS2IF
-#define jsimd_fdct_float_3dnow                jSFD3DF
-#define jconst_fdct_float_sse                 jSCFDSF
-#define jsimd_fdct_float_sse                  jSFDSF
-#define jsimd_quantize_mmx                    jSQuantM
-#define jsimd_quantize_sse2                   jSQuantS2
-#define jsimd_quantize_float_3dnow            jSQuantF3D
-#define jsimd_quantize_float_sse              jSQuantFS
-#define jsimd_quantize_float_sse2             jSQuantFS2
-#define jsimd_idct_2x2_mmx                    jSIDM22
-#define jsimd_idct_4x4_mmx                    jSIDM44
-#define jconst_idct_red_sse2                  jSCIDS2R
-#define jsimd_idct_2x2_sse2                   jSIDS222
-#define jsimd_idct_4x4_sse2                   jSIDS244
-#define jsimd_idct_islow_mmx                  jSIDMIS
-#define jsimd_idct_ifast_mmx                  jSIDMIF
-#define jconst_idct_islow_sse2                jSCIDS2IS
-#define jsimd_idct_islow_sse2                 jSIDS2IS
-#define jconst_idct_ifast_sse2                jSCIDS2IF
-#define jsimd_idct_ifast_sse2                 jSIDS2IF
-#define jsimd_idct_float_3dnow                jSID3DF
-#define jconst_fdct_float_sse                 jSCIDSF
-#define jsimd_idct_float_sse                  jSIDSF
-#define jconst_fdct_float_sse2                jSCIDS2F
-#define jsimd_idct_float_sse2                 jSIDS2F
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
+#define JSIMD_MIPS_DSPR2 0x20
+#define JSIMD_ALTIVEC    0x40
 
 /* SIMD Ext: retrieve SIMD/CPU information */
-EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+EXTERN(unsigned int) jpeg_simd_cpu_support (void);
 
-/* SIMD Color Space Conversion */
+/* RGB & extended RGB --> YCC Colorspace Conversion */
 EXTERN(void) jsimd_rgb_ycc_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-
-EXTERN(void) jsimd_rgb_gray_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extrgb_gray_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extrgbx_gray_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extbgr_gray_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extbgrx_gray_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extxbgr_gray_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extxrgb_gray_convert_mmx
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-
-EXTERN(void) jsimd_ycc_rgb_convert_mmx
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
-EXTERN(void) jsimd_ycc_extrgb_convert_mmx
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
-EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
-EXTERN(void) jsimd_ycc_extbgr_convert_mmx
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
-EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
-EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
-EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 
 extern const int jconst_rgb_ycc_convert_sse2[];
 EXTERN(void) jsimd_rgb_ycc_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_neon
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_neon
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_neon
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_neon
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_neon
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_neon
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+/* RGB & extended RGB --> Grayscale Colorspace Conversion */
+EXTERN(void) jsimd_rgb_gray_convert_mmx
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmx
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmx
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmx
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmx
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmx
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmx
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 
 extern const int jconst_rgb_gray_convert_sse2[];
 EXTERN(void) jsimd_rgb_gray_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_gray_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_gray_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_gray_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_gray_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_gray_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_gray_convert_sse2
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_altivec
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
+/* YCC --> RGB & extended RGB Colorspace Conversion */
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 
 extern const int jconst_ycc_rgb_convert_sse2[];
 EXTERN(void) jsimd_ycc_rgb_convert_sse2
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_sse2
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_sse2
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
-
-EXTERN(void) jsimd_rgb_ycc_convert_neon
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extrgb_ycc_convert_neon
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extrgbx_ycc_convert_neon
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extbgr_ycc_convert_neon
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extbgrx_ycc_convert_neon
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extxbgr_ycc_convert_neon
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
-EXTERN(void) jsimd_extxrgb_ycc_convert_neon
-        JPP((JDIMENSION img_width,
-             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 
 EXTERN(void) jsimd_ycc_rgb_convert_neon
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_neon
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_neon
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_neon
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_neon
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_neon
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_neon
-        JPP((JDIMENSION out_width,
-             JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_rgb565_convert_neon
-        JPP((JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-             JSAMPARRAY output_buf, int num_rows));
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
 
-/* SIMD Downsample */
-EXTERN(void) jsimd_h2v2_downsample_mmx
-        JPP((JDIMENSION image_width, int max_v_samp_factor,
-             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+
+/* NULL Colorspace Conversion */
+EXTERN(void) jsimd_c_null_convert_mips_dspr2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows, int num_components);
+
+/* h2v1 Downsampling */
 EXTERN(void) jsimd_h2v1_downsample_mmx
-        JPP((JDIMENSION image_width, int max_v_samp_factor,
-             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-             JSAMPARRAY input_data, JSAMPARRAY output_data));
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_sse2
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_neon
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_altivec
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Downsampling */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v2_downsample_sse2
-        JPP((JDIMENSION image_width, int max_v_samp_factor,
-             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-             JSAMPARRAY input_data, JSAMPARRAY output_data));
-EXTERN(void) jsimd_h2v1_downsample_sse2
-        JPP((JDIMENSION image_width, int max_v_samp_factor,
-             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-             JSAMPARRAY input_data, JSAMPARRAY output_data));
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
 
-/* SIMD Upsample */
-EXTERN(void) jsimd_h2v2_upsample_mmx
-        JPP((int max_v_samp_factor, JDIMENSION output_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v2_downsample_neon
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_altivec
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+/* h2v2 Smooth Downsampling */
+EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
+        (JSAMPARRAY input_data, JSAMPARRAY output_data,
+         JDIMENSION v_samp_factor, int max_v_samp_factor,
+         int smoothing_factor, JDIMENSION width_blocks,
+         JDIMENSION image_width);
+
+
+/* Upsampling */
 EXTERN(void) jsimd_h2v1_upsample_mmx
-        JPP((int max_v_samp_factor, JDIMENSION output_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_mmx
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
-        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
-EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
-        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
-
-EXTERN(void) jsimd_h2v2_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-
-EXTERN(void) jsimd_h2v2_upsample_sse2
-        JPP((int max_v_samp_factor, JDIMENSION output_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 EXTERN(void) jsimd_h2v1_upsample_sse2
-        JPP((int max_v_samp_factor, JDIMENSION output_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_sse2
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_mips_dspr2
+        (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+         int max_v_samp_factor);
+
+EXTERN(void) jsimd_h2v1_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+         JSAMPARRAY *output_data_ptr);
+
+/* Fancy Upsampling */
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 extern const int jconst_fancy_upsample_sse2[];
-EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
-        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
-        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
-
-extern const int jconst_merged_upsample_sse2[];
-EXTERN(void) jsimd_h2v2_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
-EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
-        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
-             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_neon
-        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
-/* SIMD Sample Conversion */
-EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
-                                     JDIMENSION start_col,
-                                     DCTELEM * workspace));
+EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data,
-                                      JDIMENSION start_col,
-                                      DCTELEM * workspace));
+EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
+        (int max_v_samp_factor, JDIMENSION downsampled_width,
+         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_convsamp_neon JPP((JSAMPARRAY sample_data,
-                                      JDIMENSION start_col,
-                                      DCTELEM * workspace));
+/* Merged Upsampling */
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 
-EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
-                                             JDIMENSION start_col,
-                                             FAST_FLOAT * workspace));
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 
-EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data,
-                                           JDIMENSION start_col,
-                                           FAST_FLOAT * workspace));
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 
-EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data,
-                                            JDIMENSION start_col,
-                                            FAST_FLOAT * workspace));
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
 
-/* SIMD Forward DCT */
-EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
-EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+EXTERN(void) jsimd_h2v1_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
+        (JDIMENSION output_width, JSAMPIMAGE input_buf,
+         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+/* Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_sse2
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_neon
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_mips_dspr2
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_altivec
+        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+/* Floating Point Sample Conversion */
+EXTERN(void) jsimd_convsamp_float_3dnow
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_sse2
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_convsamp_float_mips_dspr2
+        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+/* Slow Integer Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data);
+
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data);
+
+/* Fast Integer Forward DCT */
+EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data);
 
 extern const int jconst_fdct_ifast_sse2[];
-EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data));
-extern const int jconst_fdct_islow_sse2[];
-EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_neon JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
+EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data);
+
+/* Floating Point Forward DCT */
+EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data);
 
 extern const int jconst_fdct_float_sse[];
-EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * data));
+EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data);
 
-/* SIMD Quantization */
-EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
-                                     DCTELEM * divisors,
-                                     DCTELEM * workspace));
+/* Quantization */
+EXTERN(void) jsimd_quantize_mmx
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
-EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
-                                      DCTELEM * divisors,
-                                      DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_sse2
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
-EXTERN(void) jsimd_quantize_neon JPP((JCOEFPTR coef_block,
-                                      DCTELEM * divisors,
-                                      DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_neon
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
-EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
-                                             FAST_FLOAT * divisors,
-                                             FAST_FLOAT * workspace));
+EXTERN(void) jsimd_quantize_mips_dspr2
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
-EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block,
-                                           FAST_FLOAT * divisors,
-                                           FAST_FLOAT * workspace));
+EXTERN(void) jsimd_quantize_altivec
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
-EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
-                                            FAST_FLOAT * divisors,
-                                            FAST_FLOAT * workspace));
+/* Floating Point Quantization */
+EXTERN(void) jsimd_quantize_float_3dnow
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
-/* SIMD Reduced Inverse DCT */
-EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
-                                     JCOEFPTR coef_block,
-                                     JSAMPARRAY output_buf,
-                                     JDIMENSION output_col));
-EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
-                                     JCOEFPTR coef_block,
-                                     JSAMPARRAY output_buf,
-                                     JDIMENSION output_col));
+EXTERN(void) jsimd_quantize_float_sse
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_sse2
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_quantize_float_mips_dspr2
+        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+
+/* Scaled Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_mmx
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
 extern const int jconst_idct_red_sse2[];
-EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table,
-                                      JCOEFPTR coef_block,
-                                      JSAMPARRAY output_buf,
-                                      JDIMENSION output_col));
-EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
-                                      JCOEFPTR coef_block,
-                                      JSAMPARRAY output_buf,
-                                      JDIMENSION output_col));
+EXTERN(void) jsimd_idct_2x2_sse2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_sse2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table,
-                                      JCOEFPTR coef_block,
-                                      JSAMPARRAY output_buf,
-                                      JDIMENSION output_col));
-EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table,
-                                      JCOEFPTR coef_block,
-                                      JSAMPARRAY output_buf,
-                                      JDIMENSION output_col));
+EXTERN(void) jsimd_idct_2x2_neon
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_neon
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
-/* SIMD Inverse DCT */
-EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
-                                       JCOEFPTR coef_block,
-                                       JSAMPARRAY output_buf,
-                                       JDIMENSION output_col));
-EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
-                                       JCOEFPTR coef_block,
-                                       JSAMPARRAY output_buf,
-                                       JDIMENSION output_col));
+EXTERN(void) jsimd_idct_2x2_mips_dspr2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_mips_dspr2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_mips_dspr2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2
+        (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2
+        (int *workspace, int *output);
+
+/* Slow Integer Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
 extern const int jconst_idct_islow_sse2[];
-EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table,
-                                        JCOEFPTR coef_block,
-                                        JSAMPARRAY output_buf,
-                                        JDIMENSION output_col));
+EXTERN(void) jsimd_idct_islow_sse2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_neon
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_mips_dspr2
+        (void *dct_table, JCOEFPTR coef_block, int *output_buf,
+         JSAMPLE *output_col);
+
+EXTERN(void) jsimd_idct_islow_altivec
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+
+/* Fast Integer Inverse DCT */
+EXTERN(void) jsimd_idct_ifast_mmx
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+
 extern const int jconst_idct_ifast_sse2[];
-EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
-                                        JCOEFPTR coef_block,
-                                        JSAMPARRAY output_buf,
-                                        JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_sse2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_islow_neon JPP((void * dct_table,
-                                        JCOEFPTR coef_block,
-                                        JSAMPARRAY output_buf,
-                                        JDIMENSION output_col));
-EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
-                                        JCOEFPTR coef_block,
-                                        JSAMPARRAY output_buf,
-                                        JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_neon
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
-                                         JCOEFPTR coef_block,
-                                         JSAMPARRAY output_buf,
-                                         JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2
+        (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+         const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
+        (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+         const int *idct_coefs);
+
+EXTERN(void) jsimd_idct_ifast_altivec
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
+
+/* Floating Point Inverse DCT */
+EXTERN(void) jsimd_idct_float_3dnow
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse[];
-EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table,
-                                       JCOEFPTR coef_block,
-                                       JSAMPARRAY output_buf,
-                                       JDIMENSION output_col));
+EXTERN(void) jsimd_idct_float_sse
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse2[];
-EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table,
-                                        JCOEFPTR coef_block,
-                                        JSAMPARRAY output_buf,
-                                        JDIMENSION output_col));
+EXTERN(void) jsimd_idct_float_sse2
+        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         JDIMENSION output_col);
 
+/* Huffman coding */
+extern const int jconst_huff_encode_one_block[];
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
+        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+         c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/simd/jsimd_altivec.h b/simd/jsimd_altivec.h
new file mode 100644
index 0000000..2660219
--- /dev/null
+++ b/simd/jsimd_altivec.h
@@ -0,0 +1,99 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+#define __4X(a) a, a, a, a
+#define __4X2(a, b) a, b, a, b, a, b, a, b
+#define __8X(a) __4X(a), __4X(a)
+#define __16X(a) __8X(a), __8X(a)
+
+#define TRANSPOSE(row, col)  \
+{  \
+  __vector short row04l, row04h, row15l, row15h,  \
+                 row26l, row26h, row37l, row37h;  \
+  __vector short col01e, col01o, col23e, col23o,  \
+                 col45e, col45o, col67e, col67o;  \
+  \
+                                       /* transpose coefficients (phase 1) */ \
+  row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+  row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+  row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+  row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+  row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+  row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+  row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+  row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+  \
+                                       /* transpose coefficients (phase 2) */ \
+  col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+  col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+  col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+  col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+  col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+  col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+  col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+  col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+  \
+                                       /* transpose coefficients (phase 3) */ \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */   \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */   \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
+}
+
+#ifndef min
+#define min(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+
+/* Macros to abstract big/little endian bit twiddling */
+
+#if __BIG_ENDIAN__
+
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+
+#else
+
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+
+#endif
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index 72fa1fc..197eb61 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -2,17 +2,16 @@
  * jsimd_arm.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014D. R. Commander
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
  *
  * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on
- * ARM architecture.
- *
- * Based on the stubs from 'jsimd_none.c'
+ * of the library and the SIMD implementations when running on a
+ * 32-bit ARM architecture.
  */
 
 #define JPEG_INTERNALS
@@ -28,6 +27,7 @@
 #include <ctype.h>
 
 static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
 
 #if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
 
@@ -123,12 +123,15 @@
 #endif
 
   /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCE_ARM_NEON");
+  env = getenv("JSIMD_FORCENEON");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support &= JSIMD_ARM_NEON;
-  env = getenv("JSIMD_FORCE_NO_SIMD");
+  env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
 }
 
 GLOBAL(int)
@@ -170,6 +173,7 @@
     return 0;
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
+
   if (simd_support & JSIMD_ARM_NEON)
     return 1;
 
@@ -200,8 +204,7 @@
 {
   void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
-  switch(cinfo->in_color_space)
-  {
+  switch(cinfo->in_color_space) {
     case JCS_EXT_RGB:
       neonfct=jsimd_extrgb_ycc_convert_neon;
       break;
@@ -229,9 +232,7 @@
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->image_width, input_buf,
-        output_buf, output_row, num_rows);
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -248,8 +249,7 @@
 {
   void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
-  switch(cinfo->out_color_space)
-  {
+  switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
       neonfct=jsimd_ycc_extrgb_convert_neon;
       break;
@@ -277,9 +277,7 @@
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->output_width, input_buf,
-        input_row, output_buf, num_rows);
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
 GLOBAL(void)
@@ -287,9 +285,8 @@
                           JSAMPIMAGE input_buf, JDIMENSION input_row,
                           JSAMPARRAY output_buf, int num_rows)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                  output_buf, num_rows);
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
 }
 
 GLOBAL(int)
@@ -309,13 +306,13 @@
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
@@ -338,17 +335,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -379,21 +376,21 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
-        compptr->downsampled_width, input_data, output_data_ptr);
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
@@ -459,15 +456,14 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_convsamp_neon(sample_data, start_col, workspace);
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -505,19 +501,18 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_fdct_ifast_neon(data);
+  jsimd_fdct_ifast_neon(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
 }
 
@@ -549,16 +544,15 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_quantize_neon(coef_block, divisors, workspace);
+  jsimd_quantize_neon(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -579,7 +573,7 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if ((simd_support & JSIMD_ARM_NEON))
+  if (simd_support & JSIMD_ARM_NEON)
     return 1;
 
   return 0;
@@ -602,28 +596,28 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if ((simd_support & JSIMD_ARM_NEON))
+  if (simd_support & JSIMD_ARM_NEON)
     return 1;
 
   return 0;
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if ((simd_support & JSIMD_ARM_NEON))
-    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if ((simd_support & JSIMD_ARM_NEON))
-    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(int)
@@ -668,7 +662,7 @@
   if (IFAST_SCALE_BITS != 2)
     return 0;
 
-  if ((simd_support & JSIMD_ARM_NEON))
+  if (simd_support & JSIMD_ARM_NEON)
     return 1;
 
   return 0;
@@ -683,27 +677,51 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
-  if ((simd_support & JSIMD_ARM_NEON))
-    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, output_col);
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
-  if ((simd_support & JSIMD_ARM_NEON))
-    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, output_col);
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
 }
 
+/* Return 1 if the NEON Huffman encoder may be used, else 0.  This requires
+   DCTSIZE == 8, a 2-byte JCOEF, the JSIMD_ARM_NEON bit in simd_support, and a
+   nonzero simd_huffman (it is zeroed when JSIMD_NOHUFFENC is set to "1"). */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+  if (!(simd_support & JSIMD_ARM_NEON))
+    return 0;
+
+  return simd_huffman != 0;
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{ /* Pass-through to the NEON routine; simd_support is not re-checked here. */
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
index 65724cb..62dbc45 100644
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -2,7 +2,8 @@
  * jsimd_arm64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -25,7 +26,84 @@
 #include <string.h>
 #include <ctype.h>
 
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
+
 static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+                                    JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/* Return 1 if the line in 'buffer' begins with 'field' and the text after it
+   contains 'value' as a whitespace-delimited word; return 0 otherwise (also
+   when 'value' is empty). */
+LOCAL(int)
+check_cpuinfo (char *buffer, const char *field, char *value)
+{
+  size_t field_len = strlen(field), value_len = strlen(value);
+  char *start, *p;
+
+  if (value_len == 0 || strncmp(buffer, field, field_len) != 0)
+    return 0;
+  buffer += field_len;
+  /* <ctype.h> functions need unsigned char values; a negative char is UB */
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+
+  /* Check if 'value' is present in the buffer as a separate word.  'start'
+     stays fixed so that the left boundary is tested for every candidate. */
+  for (start = buffer; (p = strstr(buffer, value)) != NULL; buffer = p + 1) {
+    if (p > start && !isspace((unsigned char)p[-1]))
+      continue;
+    if (p[value_len] != 0 && !isspace((unsigned char)p[value_len]))
+      continue;
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo (int bufsize)
+{
+  /* Returns 0 if a line buffer of bufsize bytes cannot be allocated or is too
+     small for some line of /proc/cpuinfo (init_simd() then retries with a
+     larger size), 1 otherwise -- including when the file cannot be opened. */
+  char *line = (char *)malloc(bufsize);
+  FILE *cpuinfo;
+  int complete = 1;
+
+  if (line == NULL)
+    return 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  while (cpuinfo != NULL && fgets(line, bufsize, cpuinfo) != NULL) {
+    if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
+      complete = 0;  /* the line was truncated: the buffer is too small */
+      break;
+    }
+    if (check_cpuinfo(line, "CPU part", "0xd03") ||
+        check_cpuinfo(line, "CPU part", "0xd07")) {
+      /* The Cortex-A53 has a slow tbl implementation.  Avoiding tbl gains a
+         few percent there; the gain on Cortex-A57 is subtler but measurable. */
+      simd_features &= ~JSIMD_FASTTBL;
+    } else if (check_cpuinfo(line, "CPU part", "0x0a1")) {
+      /* On Cavium ThunderX, SIMD Huffman encoding is slower than the C
+         version, and ld3 and st3 are abysmally slow. */
+      simd_huffman = simd_features = 0;
+    }
+  }
+  if (cpuinfo != NULL)
+    fclose(cpuinfo);
+  free(line);
+  return complete;
+}
+
+#endif
 
 /*
  * Check what SIMD accelerations are supported.
@@ -33,16 +111,19 @@
  * FIXME: This code is racy under a multi-threaded environment.
  */
 
-/* 
+/*
  * ARMv8 architectures support NEON extensions by default.
  * It is no longer optional as it was with ARMv7.
- */ 
+ */
 
 
 LOCAL(void)
 init_simd (void)
 {
   char *env = NULL;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
 
   if (simd_support != ~0U)
     return;
@@ -50,6 +131,13 @@
   simd_support = 0;
 
   simd_support |= JSIMD_ARM_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
 
   /* Force different settings through environment variables */
   env = getenv("JSIMD_FORCENEON");
@@ -58,6 +146,19 @@
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
+  env = getenv("JSIMD_FASTLD3");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_features |= JSIMD_FASTLD3;
+  if ((env != NULL) && (strcmp(env, "0") == 0))
+    simd_features &= ~JSIMD_FASTLD3;
+  env = getenv("JSIMD_FASTST3");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_features |= JSIMD_FASTST3;
+  if ((env != NULL) && (strcmp(env, "0") == 0))
+    simd_features &= ~JSIMD_FASTST3;
 }
 
 GLOBAL(int)
@@ -65,6 +166,17 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -117,6 +229,46 @@
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                        JDIMENSION output_row, int num_rows)
 {
+  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extrgb_ycc_convert_neon;
+      else
+        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct=jsimd_extrgbx_ycc_convert_neon;
+      break;
+    case JCS_EXT_BGR:
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extbgr_ycc_convert_neon;
+      else
+        neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct=jsimd_extbgrx_ycc_convert_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct=jsimd_extxbgr_ycc_convert_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct=jsimd_extxrgb_ycc_convert_neon;
+      break;
+    default:
+      if (simd_features & JSIMD_FASTLD3)
+        neonfct=jsimd_extrgb_ycc_convert_neon;
+      else
+        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
+      break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -135,14 +287,20 @@
 
   switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extrgb_convert_neon;
+      else
+        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
       break;
     case JCS_EXT_RGBX:
     case JCS_EXT_RGBA:
       neonfct=jsimd_ycc_extrgbx_convert_neon;
       break;
     case JCS_EXT_BGR:
-      neonfct=jsimd_ycc_extbgr_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extbgr_convert_neon;
+      else
+        neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
       break;
     case JCS_EXT_BGRX:
     case JCS_EXT_BGRA:
@@ -157,12 +315,14 @@
       neonfct=jsimd_ycc_extxrgb_convert_neon;
       break;
     default:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
+      if (simd_features & JSIMD_FASTST3)
+        neonfct=jsimd_ycc_extrgb_convert_neon;
+      else
+        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
 GLOBAL(void)
@@ -170,9 +330,8 @@
                           JSAMPIMAGE input_buf, JDIMENSION input_row,
                           JSAMPARRAY output_buf, int num_rows)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                  output_buf, num_rows);
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
 }
 
 GLOBAL(int)
@@ -180,6 +339,17 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -188,19 +358,36 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(int)
@@ -221,17 +408,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr,
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -253,17 +440,17 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr,
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
 }
 
@@ -304,6 +491,19 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -317,13 +517,14 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -332,6 +533,15 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -340,6 +550,15 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -352,17 +571,19 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
+  jsimd_fdct_islow_neon(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
+  jsimd_fdct_ifast_neon(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
 }
 
@@ -371,6 +592,17 @@
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -383,14 +615,15 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
+  jsimd_quantize_neon(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
 }
 
@@ -441,23 +674,21 @@
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(int)
@@ -517,28 +748,55 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
 }
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  /* Report 1 only when the NEON bit is set and simd_huffman is non-zero */
+  if ((simd_support & JSIMD_ARM_NEON) && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  /* The JSIMD_FASTTBL feature bit selects between the two NEON encoders */
+  if (!(simd_features & JSIMD_FASTTBL))
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index f488b0f..d236314 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -6,6 +6,9 @@
  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2013-2014, Linaro Limited
  * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -25,11 +28,10 @@
  */
 
 #if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
 .text
-.arch armv8-a+fp+simd
 
 
 #define RESPECT_STRICT_ALIGNMENT 1
@@ -53,42 +55,71 @@
 .endm
 
 /* Transpose elements of single 128 bit registers */
-.macro transpose_single x0,x1,xi,xilen,literal
-    ins  \xi\xilen[0],  \x0\xilen[0]
-    ins  \x1\xilen[0],  \x0\xilen[1]
-    trn1 \x0\literal,   \x0\literal, \x1\literal
-    trn2 \x1\literal,   \xi\literal, \x1\literal
+.macro transpose_single x0, x1, xi, xilen, literal
+    ins             \xi\xilen[0], \x0\xilen[0]
+    ins             \x1\xilen[0], \x0\xilen[1]
+    trn1            \x0\literal, \x0\literal, \x1\literal
+    trn2            \x1\literal, \xi\literal, \x1\literal
 .endm
 
 /* Transpose elements of 2 differnet registers */
-.macro transpose x0,x1,xi,xilen,literal
-    mov  \xi\xilen,     \x0\xilen
-    trn1 \x0\literal,   \x0\literal, \x1\literal
-    trn2 \x1\literal,   \xi\literal, \x1\literal
+.macro transpose x0, x1, xi, xilen, literal
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\literal, \x0\literal, \x1\literal
+    trn2            \x1\literal, \xi\literal, \x1\literal
 .endm
 
 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
-    mov  \xi\xilen, \x0\xilen
-    trn1 \x0\x0len, \x0\x0len, \x2\x2len
-    trn2 \x2\x2len, \xi\x0len, \x2\x2len
-    mov  \xi\xilen, \x1\xilen
-    trn1 \x1\x1len, \x1\x1len, \x3\x3len
-    trn2 \x3\x3len, \xi\x1len, \x3\x3len
+.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
+    mov             \xi\xilen, \x0\xilen
+    trn1            \x0\x0len, \x0\x0len, \x2\x2len
+    trn2            \x2\x2len, \xi\x0len, \x2\x2len
+    mov             \xi\xilen, \x1\xilen
+    trn1            \x1\x1len, \x1\x1len, \x3\x3len
+    trn2            \x3\x3len, \xi\x1len, \x3\x3len
 .endm
 
-.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
-    mov  \xi\xilen, \x0\xilen
-    trn1 \x0\x0len, \x0\x0len, \x1\x1len
-    trn2 \x1\x2len, \xi\x0len, \x1\x2len
-    mov  \xi\xilen, \x2\xilen
-    trn1 \x2\x2len, \x2\x2len, \x3\x3len
-    trn2 \x3\x2len, \xi\x1len, \x3\x3len
+.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
+    mov             \xi\xilen, \x0\xilen  /* xi = copy of x0 before trn1 overwrites it */
+    trn1            \x0\x0len, \x0\x0len, \x1\x1len
+    trn2            \x1\x1len, \xi\x0len, \x1\x1len  /* x1 uses its own suffix */
+    mov             \xi\xilen, \x2\xilen  /* xi = copy of x2 before trn1 overwrites it */
+    trn1            \x2\x2len, \x2\x2len, \x3\x3len
+    trn2            \x3\x3len, \xi\x2len, \x3\x3len  /* xi holds x2 here, so x2len */
+.endm
 
-.macro transpose_4x4 x0, x1, x2, x3,x5
-    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
-    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
+.macro transpose_4x4 x0, x1, x2, x3, x5
+    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
+    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
+.endm
+/* 8x8 transpose of 16-bit lanes held in l0-l7 (result in l0-l7); t0-t3 scratch */
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
+    trn1            \t0\().8h, \l0\().8h, \l1\().8h  /* first pass: 16-bit lanes */
+    trn1            \t1\().8h, \l2\().8h, \l3\().8h
+    trn1            \t2\().8h, \l4\().8h, \l5\().8h
+    trn1            \t3\().8h, \l6\().8h, \l7\().8h
+    trn2            \l1\().8h, \l0\().8h, \l1\().8h
+    trn2            \l3\().8h, \l2\().8h, \l3\().8h
+    trn2            \l5\().8h, \l4\().8h, \l5\().8h
+    trn2            \l7\().8h, \l6\().8h, \l7\().8h
+    /* Second pass: trn1/trn2 on 32-bit lanes (.4s) of the first-pass results */
+    trn1            \l4\().4s, \t2\().4s, \t3\().4s
+    trn2            \t3\().4s, \t2\().4s, \t3\().4s
+    trn1            \t2\().4s, \t0\().4s, \t1\().4s
+    trn2            \l2\().4s, \t0\().4s, \t1\().4s
+    trn1            \t0\().4s, \l1\().4s, \l3\().4s
+    trn2            \l3\().4s, \l1\().4s, \l3\().4s
+    trn2            \t1\().4s, \l5\().4s, \l7\().4s
+    trn1            \l5\().4s, \l5\().4s, \l7\().4s
+    /* Third pass: 64-bit lanes (.2d); each of l0-l7 is written exactly once here */
+    trn2            \l6\().2d, \l2\().2d, \t3\().2d
+    trn1            \l0\().2d, \t2\().2d, \l4\().2d
+    trn1            \l1\().2d, \t0\().2d, \l5\().2d
+    trn2            \l7\().2d, \l3\().2d, \t1\().2d
+    trn1            \l2\().2d, \l2\().2d, \t3\().2d
+    trn2            \l4\().2d, \t2\().2d, \l4\().2d
+    trn1            \l3\().2d, \l3\().2d, \t1\().2d
+    trn2            \l5\().2d, \t0\().2d, \l5\().2d
+.endm
 
 
@@ -100,630 +131,606 @@
  * Perform dequantization and inverse DCT on one block of coefficients.
  *
  * GLOBAL(void)
- * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
+ * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
-#define FIX_0_298631336  (2446)
-#define FIX_0_390180644  (3196)
-#define FIX_0_541196100  (4433)
-#define FIX_0_765366865  (6270)
-#define FIX_0_899976223  (7373)
-#define FIX_1_175875602  (9633)
-#define FIX_1_501321110  (12299)
-#define FIX_1_847759065  (15137)
-#define FIX_1_961570560  (16069)
-#define FIX_2_053119869  (16819)
-#define FIX_2_562915447  (20995)
-#define FIX_3_072711026  (25172)
+#define CONST_BITS 13
+#define PASS1_BITS 2
 
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
-{                                                                             \
-    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
-    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
-    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
-                                                                              \
-    /* 1-D iDCT input data */                                                 \
-    row0 = xrow0;                                                             \
-    row1 = xrow1;                                                             \
-    row2 = xrow2;                                                             \
-    row3 = xrow3;                                                             \
-    row4 = xrow4;                                                             \
-    row5 = xrow5;                                                             \
-    row6 = xrow6;                                                             \
-    row7 = xrow7;                                                             \
-                                                                              \
-    q5 = row7 + row3;                                                         \
-    q4 = row5 + row1;                                                         \
-    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
-         MULTIPLY(q4, FIX_1_175875602);                                       \
-    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
-         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
-    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
-         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
-    q4 = q6;                                                                  \
-    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
-    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
-          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
-    /* now we can use q1 (reloadable constants have been used up) */          \
-    q1 = q3 + q2;                                                             \
-    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
-          MULTIPLY(row1, -FIX_0_899976223);                                   \
-    q5 = q7;                                                                  \
-    q1 = q1 + q6;                                                             \
-    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
-          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
-                                                                              \
-    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
-    tmp11_plus_tmp2 = q1;                                                     \
-    row1 = 0;                                                                 \
-                                                                              \
-    q1 = q1 - q6;                                                             \
-    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
-          MULTIPLY(row3, -FIX_2_562915447);                                   \
-    q1 = q1 - q6;                                                             \
-    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
-         MULTIPLY(row6, FIX_0_541196100);                                     \
-    q3 = q3 - q2;                                                             \
-                                                                              \
-    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
-    tmp11_minus_tmp2 = q1;                                                    \
-                                                                              \
-    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
-    q2 = q1 + q6;                                                             \
-    q1 = q1 - q6;                                                             \
-                                                                              \
-    /* pick up the results */                                                 \
-    tmp0  = q4;                                                               \
-    tmp1  = q5;                                                               \
-    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
-    tmp3  = q7;                                                               \
-    tmp10 = q2;                                                               \
-    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
-    tmp12 = q3;                                                               \
-    tmp13 = q1;                                                               \
-}
-
-#define XFIX_0_899976223                    v0.4h[0]
-#define XFIX_0_541196100                    v0.4h[1]
-#define XFIX_2_562915447                    v0.4h[2]
-#define XFIX_0_298631336_MINUS_0_899976223  v0.4h[3]
-#define XFIX_1_501321110_MINUS_0_899976223  v1.4h[0]
-#define XFIX_2_053119869_MINUS_2_562915447  v1.4h[1]
-#define XFIX_0_541196100_PLUS_0_765366865   v1.4h[2]
-#define XFIX_1_175875602                    v1.4h[3]
-#define XFIX_1_175875602_MINUS_0_390180644  v2.4h[0]
-#define XFIX_0_541196100_MINUS_1_847759065  v2.4h[1]
-#define XFIX_3_072711026_MINUS_2_562915447  v2.4h[2]
-#define XFIX_1_175875602_MINUS_1_961570560  v2.4h[3]
+#define F_0_298  2446  /* FIX(0.298631336) */
+#define F_0_390  3196  /* FIX(0.390180644) */
+#define F_0_541  4433  /* FIX(0.541196100) */
+#define F_0_765  6270  /* FIX(0.765366865) */
+#define F_0_899  7373  /* FIX(0.899976223) */
+#define F_1_175  9633  /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
 
 .balign 16
-jsimd_idct_islow_neon_consts:
-    .short FIX_0_899976223                    /* d0[0] */
-    .short FIX_0_541196100                    /* d0[1] */
-    .short FIX_2_562915447                    /* d0[2] */
-    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-    .short FIX_1_175875602                    /* d1[3] */
-    /* reloadable constants */
-    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+Ljsimd_idct_islow_neon_consts:  /* loaded into v0/v1 by jsimd_idct_islow_neon */
+  .short F_0_298    /* v0.h[0] = XFIX_P_0_298 */
+  .short -F_0_390   /* v0.h[1] = XFIX_N_0_390 */
+  .short F_0_541    /* v0.h[2] = XFIX_P_0_541 */
+  .short F_0_765    /* v0.h[3] = XFIX_P_0_765 */
+  .short - F_0_899  /* v0.h[4] = XFIX_N_0_899 */
+  .short F_1_175    /* v0.h[5] = XFIX_P_1_175 */
+  .short F_1_501    /* v0.h[6] = XFIX_P_1_501 */
+  .short - F_1_847  /* v0.h[7] = XFIX_N_1_847 */
+  .short - F_1_961  /* v1.h[0] = XFIX_N_1_961 */
+  .short F_2_053    /* v1.h[1] = XFIX_P_2_053 */
+  .short - F_2_562  /* v1.h[2] = XFIX_N_2_562 */
+  .short F_3_072    /* v1.h[3] = XFIX_P_3_072 */
+  .short 0          /* padding: v1.h[4..7], fills the table to 16 halfwords */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
 
 asm_function jsimd_idct_islow_neon
-
     DCT_TABLE       .req x0
     COEF_BLOCK      .req x1
     OUTPUT_BUF      .req x2
     OUTPUT_COL      .req x3
     TMP1            .req x0
     TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x15
+    TMP3            .req x9
+    TMP4            .req x10
+    TMP5            .req x11
+    TMP6            .req x12
+    TMP7            .req x13
+    TMP8            .req x14
 
-    ROW0L           .req v16
-    ROW0R           .req v17
-    ROW1L           .req v18
-    ROW1R           .req v19
-    ROW2L           .req v20
-    ROW2R           .req v21
-    ROW3L           .req v22
-    ROW3R           .req v23
-    ROW4L           .req v24
-    ROW4R           .req v25
-    ROW5L           .req v26
-    ROW5R           .req v27
-    ROW6L           .req v28
-    ROW6R           .req v29
-    ROW7L           .req v30
-    ROW7R           .req v31
-    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
-    sub             sp, sp, 272
-    str             x15, [sp], 16
-    adr             x15, jsimd_idct_islow_neon_consts
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v20.8b - v23.8b}, [sp], 32
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v28.8b - v31.8b}, [sp], 32
-    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
-    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
-    mul             v16.4h, v16.4h, v0.4h
-    mul             v17.4h, v17.4h, v1.4h
-    ins             v16.2d[1], v17.2d[0]  /* 128 bit q8 */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
-    mul             v18.4h, v18.4h, v2.4h
-    mul             v19.4h, v19.4h, v3.4h
-    ins             v18.2d[1], v19.2d[0]  /* 128 bit q9 */
-    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
-    mul             v20.4h, v20.4h, v4.4h
-    mul             v21.4h, v21.4h, v5.4h
-    ins             v20.2d[1], v21.2d[0]  /* 128 bit q10 */
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
-    mul             v22.4h, v22.4h, v6.4h
-    mul             v23.4h, v23.4h, v7.4h
-    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
-    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
-    mul             v24.4h, v24.4h, v0.4h
-    mul             v25.4h, v25.4h, v1.4h
-    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
-    mul             v28.4h, v28.4h, v4.4h
-    mul             v29.4h, v29.4h, v5.4h
-    ins             v28.2d[1], v29.2d[0]  /* 128 bit q14 */
-    mul             v26.4h, v26.4h, v2.4h
-    mul             v27.4h, v27.4h, v3.4h
-    ins             v26.2d[1], v27.2d[0]  /* 128 bit q13 */
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
-    add             x15, x15, #16
-    mul             v30.4h, v30.4h, v6.4h
-    mul             v31.4h, v31.4h, v7.4h
-    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
-    /* Go to the bottom of the stack */
-    sub             sp, sp, 352
-    stp             x4, x5, [sp], 16
-    st1             {v8.4h - v11.4h}, [sp], 32  /* save NEON registers */
-    st1             {v12.4h - v15.4h}, [sp], 32
-    /* 1-D IDCT, pass 1, left 4x8 half */
-    add             v4.4h,    ROW7L.4h, ROW3L.4h
-    add             v5.4h,    ROW5L.4h, ROW1L.4h
-    smull           v12.4s,   v4.4h,    XFIX_1_175875602_MINUS_1_961570560
-    smlal           v12.4s,   v5.4h,    XFIX_1_175875602
-    smull           v14.4s,   v4.4h,    XFIX_1_175875602
-    /* Check for the zero coefficients in the right 4x8 half */
-    smlal           v14.4s,   v5.4h,    XFIX_1_175875602_MINUS_0_390180644
-    ssubl           v6.4s,    ROW0L.4h, ROW4L.4h
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
-    smlal           v4.4s,    ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
-      orr           x0,       x4,       x5
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW5L.4h, XFIX_2_562915447
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
-    shl             v6.4s,    v6.4s,    #13
-      orr           x0,       x0,       x4
-    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
-      orr           x0,       x0 ,      x5
-    add             v2.4s,    v6.4s,    v4.4s
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-      orr           x0,       x0,       x4
-    smlsl           v14.4s,   ROW7L.4h, XFIX_0_899976223
-      orr           x0,       x0,       x5
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
-    rshrn           ROW1L.4h, v2.4s,    #11
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
-      orr           x0,       x0,       x4
-    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
-      orr           x0,       x0,       x5
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    smlal           v12.4s,   ROW6L.4h, XFIX_0_541196100
-    sub             v6.4s,    v6.4s,    v4.4s
-      orr           x0,       x0,       x4
-    rshrn           ROW6L.4h, v2.4s,    #11
-      orr           x0,       x0,       x5
-    add             v2.4s,    v6.4s,    v10.4s
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW0L.4h, ROW4L.4h
-      orr           x0,       x0,       x4
-    rshrn           ROW2L.4h, v2.4s,    #11
-      orr           x0,       x0,       x5
-    rshrn           ROW5L.4h, v6.4s,    #11
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
-      orr           x0,       x0,       x4
-    add             v4.4s,    v10.4s,   v12.4s
-      orr           x0,       x0,       x5
-    cmp             x0, #0 /* orrs instruction removed */
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-      orr           x0,       x4,       x5
-    sub             v6.4s,    v2.4s,    v8.4s
-      /* pop             {x4, x5} */
-      sub           sp, sp, 80
-      ldp           x4, x5, [sp], 16
-    rshrn           ROW7L.4h, v4.4s,    #11
-    rshrn           ROW3L.4h, v10.4s,   #11
-    rshrn           ROW0L.4h, v12.4s,   #11
-    rshrn           ROW4L.4h, v6.4s,    #11
+    sub             sp, sp, #64
+    adr             x15, Ljsimd_idct_islow_neon_consts
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    ld1             {v0.8h, v1.8h}, [x15]
+    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
+    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
 
-      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
+    cmeq            v16.8h, v3.8h, #0
+    cmeq            v26.8h, v4.8h, #0
+    cmeq            v27.8h, v5.8h, #0
+    cmeq            v28.8h, v6.8h, #0
+    cmeq            v29.8h, v7.8h, #0
+    cmeq            v30.8h, v8.8h, #0
+    cmeq            v31.8h, v9.8h, #0
 
-    /* 1-D IDCT, pass 1, right 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    add             v10.4h,   ROW7R.4h, ROW3R.4h
-    add             v8.4h,    ROW5R.4h, ROW1R.4h
-    /* Transpose ROW6L <-> ROW7L   (v3 available free register) */
-    transpose       ROW6L, ROW7L, v3, .16b, .4h
-    smull           v12.4s,   v10.4h,   XFIX_1_175875602_MINUS_1_961570560
-    smlal           v12.4s,   v8.4h,    XFIX_1_175875602
-    /* Transpose ROW2L <-> ROW3L   (v3 available free register) */
-    transpose       ROW2L, ROW3L, v3, .16b, .4h
-    smull           v14.4s,   v10.4h,   XFIX_1_175875602
-    smlal           v14.4s,   v8.4h,    XFIX_1_175875602_MINUS_0_390180644
-    /* Transpose ROW0L <-> ROW1L   (v3 available free register) */
-    transpose       ROW0L, ROW1L, v3, .16b, .4h
-    ssubl           v6.4s,    ROW0R.4h, ROW4R.4h
-    smull           v4.4s,    ROW2R.4h, XFIX_0_541196100
-    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
-    /* Transpose ROW4L <-> ROW5L   (v3 available free register) */
-    transpose       ROW4L, ROW5L, v3, .16b, .4h
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
-    smlal           v12.4s,   ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
-    /* Transpose ROW1L <-> ROW3L   (v3 available free register) */
-    transpose       ROW1L, ROW3L, v3, .16b, .2s
-    shl             v6.4s,    v6.4s,    #13
-    smlsl           v8.4s,    ROW1R.4h, XFIX_0_899976223
-    /* Transpose ROW4L <-> ROW6L   (v3 available free register) */
-    transpose       ROW4L, ROW6L, v3, .16b, .2s
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-    /* Transpose ROW0L <-> ROW2L   (v3 available free register) */
-    transpose       ROW0L, ROW2L, v3, .16b, .2s
-    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
-    smlal           v14.4s,   ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
-    rshrn           ROW1R.4h, v2.4s,    #11
-    /* Transpose ROW5L <-> ROW7L   (v3 available free register) */
-    transpose       ROW5L, ROW7L, v3, .16b, .2s
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
-    smlsl           v10.4s,   ROW3R.4h, XFIX_2_562915447
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
-    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
-    sub             v6.4s,    v6.4s,    v4.4s
-    rshrn           ROW6R.4h, v2.4s,    #11
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW0R.4h, ROW4R.4h
-    rshrn           ROW2R.4h, v2.4s,    #11
-    rshrn           ROW5R.4h, v6.4s,    #11
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    rshrn           ROW7R.4h, v4.4s,    #11
-    rshrn           ROW3R.4h, v10.4s,   #11
-    rshrn           ROW0R.4h, v12.4s,   #11
-    rshrn           ROW4R.4h, v6.4s,    #11
-    /* Transpose right 4x8 half */
-    transpose       ROW6R, ROW7R, v3, .16b, .4h
-    transpose       ROW2R, ROW3R, v3, .16b, .4h
-    transpose       ROW0R, ROW1R, v3, .16b, .4h
-    transpose       ROW4R, ROW5R, v3, .16b, .4h
-    transpose       ROW1R, ROW3R, v3, .16b, .2s
-    transpose       ROW4R, ROW6R, v3, .16b, .2s
-    transpose       ROW0R, ROW2R, v3, .16b, .2s
-    transpose       ROW5R, ROW7R, v3, .16b, .2s
+    and             v10.16b, v16.16b, v26.16b
+    and             v11.16b, v27.16b, v28.16b
+    and             v12.16b, v29.16b, v30.16b
+    and             v13.16b, v31.16b, v10.16b
+    and             v14.16b, v11.16b, v12.16b
+    mul             v2.8h, v2.8h, v18.8h
+    and             v15.16b, v13.16b, v14.16b
+    shl             v10.8h, v2.8h, #(PASS1_BITS)
+    sqxtn           v16.8b, v15.8h
+    mov             TMP1, v16.d[0]
+    sub             sp, sp, #64
+    mvn             TMP2, TMP1
 
-1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4S,   ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v12.4s,   ROW1L.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
-    smull           v14.4s,   ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v14.4s,   ROW3L.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
-    ssubl           v6.4s,    ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
-    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
-    smlal           v4.4s,    ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
-    shl             v6.4s,    v6.4s,    #13
-    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-    smlsl           v14.4s,   ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
-    shrn            ROW1L.4h, v2.4s,    #16
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
-    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
-    smlal           v12.4s,   ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW2L.4h, v2.4s,    #16
-    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW3L.4h, v10.4s,   #16
-    shrn            ROW0L.4h, v12.4s,   #16
-    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
-    /* 1-D IDCT, pass 2, right 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4s,   ROW5R.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
-    smlal           v12.4s,   ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
-    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
-    smull           v14.4s,   ROW7R.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
-    smlal           v14.4s,   ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
-    ssubl           v6.4s,    ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
-    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
-    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
-    mov             v8.16b,   v12.16b
-    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
-    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
-    shl             v6.4s,    v6.4s,    #13
-    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    add             v2.4s,    v2.4s,    v12.4s
-    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
-    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    sub             v2.4s,    v2.4s,    v12.4s
-    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
-    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
-    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW6R.4h, v2.4s,    #16
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    saddl           v10.4s,   ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    shrn            ROW5R.4h, v6.4s,    #16
-    shl             v10.4s,   v10.4s,   #13
-    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW7R.4h, v4.4s,    #16
-    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW4R.4h, v6.4s,    #16
+    cbnz            TMP2, 2f
+    /* Case: all AC coefficients are zero */
+    dup             v2.2d, v10.d[0]
+    dup             v6.2d, v10.d[1]
+    mov             v3.16b, v2.16b
+    mov             v7.16b, v6.16b
+    mov             v4.16b, v2.16b
+    mov             v8.16b, v6.16b
+    mov             v5.16b, v2.16b
+    mov             v9.16b, v6.16b
+1:
+    /* for this transpose, we should organise data like this:
+     * 00, 01, 02, 03, 40, 41, 42, 43
+     * 10, 11, 12, 13, 50, 51, 52, 53
+     * 20, 21, 22, 23, 60, 61, 62, 63
+     * 30, 31, 32, 33, 70, 71, 72, 73
+     * 04, 05, 06, 07, 44, 45, 46, 47
+     * 14, 15, 16, 17, 54, 55, 56, 57
+     * 24, 25, 26, 27, 64, 65, 66, 67
+     * 34, 35, 36, 37, 74, 75, 76, 77
+     */
+    trn1            v28.8h, v2.8h, v3.8h
+    trn1            v29.8h, v4.8h, v5.8h
+    trn1            v30.8h, v6.8h, v7.8h
+    trn1            v31.8h, v8.8h, v9.8h
+    trn2            v16.8h, v2.8h, v3.8h
+    trn2            v17.8h, v4.8h, v5.8h
+    trn2            v18.8h, v6.8h, v7.8h
+    trn2            v19.8h, v8.8h, v9.8h
+    trn1            v2.4s, v28.4s, v29.4s
+    trn1            v6.4s, v30.4s, v31.4s
+    trn1            v3.4s, v16.4s, v17.4s
+    trn1            v7.4s, v18.4s, v19.4s
+    trn2            v4.4s, v28.4s, v29.4s
+    trn2            v8.4s, v30.4s, v31.4s
+    trn2            v5.4s, v16.4s, v17.4s
+    trn2            v9.4s, v18.4s, v19.4s
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 
-2:  /* Descale to 8-bit and range limit */
-    ins             v16.2d[1], v17.2d[0]
-    ins             v18.2d[1], v19.2d[0]
-    ins             v20.2d[1], v21.2d[0]
-    ins             v22.2d[1], v23.2d[0]
-    sqrshrn         v16.8b,   v16.8h,   #2
-    sqrshrn2        v16.16b,  v18.8h,   #2
-    sqrshrn         v18.8b,   v20.8h,   #2
-    sqrshrn2        v18.16b,  v22.8h,   #2
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
 
-    /* vpop            {v8.4h - d15.4h} */ /* restore NEON registers */
-    ld1             {v8.4h - v11.4h}, [sp], 32
-    ld1             {v12.4h - v15.4h}, [sp], 32
-    ins             v24.2d[1], v25.2d[0]
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 
-    sqrshrn         v20.8b,   v24.8h,   #2
-      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-    /* trn1            v16.8h,    v16.8h,  v18.8h */
-    transpose       v16, v18, v3, .16b, .8h
-    ins             v26.2d[1], v27.2d[0]
-    ins             v28.2d[1], v29.2d[0]
-    ins             v30.2d[1], v31.2d[0]
-    sqrshrn2        v20.16b,  v26.8h,   #2
-    sqrshrn         v22.8b,   v28.8h,   #2
-    movi            v0.16b,   #(CENTERJSAMPLE)
-    sqrshrn2        v22.16b,  v30.8h,   #2
-    transpose_single v16, v17, v3, .2d, .8b
-    transpose_single v18, v19, v3, .2d, .8b
-    add             v16.8b,   v16.8b,   v0.8b
-    add             v17.8b,   v17.8b,   v0.8b
-    add             v18.8b,   v18.8b,   v0.8b
-    add             v19.8b,   v19.8b,   v0.8b
-    transpose       v20, v22, v3, .16b, .8h
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    movi            v0.16b, #(CENTERJSAMPLE)
+    /* Prepare pointers (dual-issue with NEON instructions) */
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP1, TMP1, OUTPUT_COL
+    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP2, TMP2, OUTPUT_COL
+    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP3, TMP3, OUTPUT_COL
+    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP4, TMP4, OUTPUT_COL
+    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
+
+    /* Transpose the final 8-bit samples */
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
+
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
+
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
+
     /* Store results to the output buffer */
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v16.8b}, [TMP1]
-    transpose_single v20, v21, v3, .2d, .8b
-    st1             {v17.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v18.8b}, [TMP1]
-    add             v20.8b,   v20.8b,   v0.8b
-    add             v21.8b,   v21.8b,   v0.8b
-    st1             {v19.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    ldp             TMP3,     TMP4,     [OUTPUT_BUF]
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    add             TMP3,     TMP3,     OUTPUT_COL
-    add             TMP4,     TMP4,     OUTPUT_COL
-    transpose_single v22, v23, v3, .2d, .8b
-    st1             {v20.8b}, [TMP1]
-    add             v22.8b,   v22.8b,   v0.8b
-    add             v23.8b,   v23.8b,   v0.8b
-    st1             {v21.8b}, [TMP2]
-    st1             {v22.8b}, [TMP3]
-    st1             {v23.8b}, [TMP4]
-    ldr             x15, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v20.8b - v23.8b}, [sp], 32
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v28.8b - v31.8b}, [sp], 32
+    st1             {v28.d}[0], [TMP1]
+    st1             {v29.d}[0], [TMP2]
+    st1             {v28.d}[1], [TMP3]
+    st1             {v29.d}[1], [TMP4]
+    st1             {v30.d}[0], [TMP5]
+    st1             {v31.d}[0], [TMP6]
+    st1             {v30.d}[1], [TMP7]
+    st1             {v31.d}[1], [TMP8]
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
     blr             x30
 
-3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+.balign 16
+2:
+    mul             v3.8h, v3.8h, v19.8h
+    mul             v4.8h, v4.8h, v20.8h
+    mul             v5.8h, v5.8h, v21.8h
+    add             TMP4, xzr, TMP2, LSL #32
+    mul             v6.8h, v6.8h, v22.8h
+    mul             v7.8h, v7.8h, v23.8h
+    adds            TMP3, xzr, TMP2, LSR #32
+    mul             v8.8h, v8.8h, v24.8h
+    mul             v9.8h, v9.8h, v25.8h
+    b.ne            3f
+    /* AC coefficients in the right 4 columns are all zero */
+    dup             v15.2d, v10.d[1]
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
 
-    /* Transpose left 4x8 half */
-    transpose       ROW6L, ROW7L, v3, .16b, .4h
-    transpose       ROW2L, ROW3L, v3, .16b, .4h
-    transpose       ROW0L, ROW1L, v3, .16b, .4h
-    transpose       ROW4L, ROW5L, v3, .16b, .4h
-    shl             ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
-    transpose       ROW1L, ROW3L, v3, .16b, .2s
-    transpose       ROW4L, ROW6L, v3, .16b, .2s
-    transpose       ROW0L, ROW2L, v3, .16b, .2s
-    transpose       ROW5L, ROW7L, v3, .16b, .2s
-    cmp             x0, #0
-    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
 
-    /* Only row 0 is non-zero for the right 4x8 half  */
-    dup             ROW1R.4h, ROW0R.4h[1]
-    dup             ROW2R.4h, ROW0R.4h[2]
-    dup             ROW3R.4h, ROW0R.4h[3]
-    dup             ROW4R.4h, ROW0R.4h[0]
-    dup             ROW5R.4h, ROW0R.4h[1]
-    dup             ROW6R.4h, ROW0R.4h[2]
-    dup             ROW7R.4h, ROW0R.4h[3]
-    dup             ROW0R.4h, ROW0R.4h[0]
-    b               1b /* Go to 'normal' second pass */
+    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
 
-4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4s,   ROW1L.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
-    smull           v14.4s,   ROW3L.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
-    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
-    sshll           v6.4s,    ROW0L.4h, #13
-    mov             v8.16b,   v12.16b
-    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
-    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
-    add             v2.4s,    v2.4s,    v12.4s
-    add             v12.4s,   v12.4s,   v12.4s
-    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
-    shrn            ROW1L.4h, v2.4s,    #16
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    sshll           v10.4s,   ROW0L.4h, #13
-    shrn            ROW2L.4h, v2.4s,    #16
-    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW3L.4h, v10.4s,   #16
-    shrn            ROW0L.4h, v12.4s,   #16
-    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
-    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    ld1             {v2.4h},  [x15]    /* reload constants */
-    smull           v12.4s,   ROW5L.4h, XFIX_1_175875602
-    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
-    smull           v14.4s,   ROW7L.4h, XFIX_1_175875602
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
-    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100
-    sshll           v6.4s,    ROW4L.4h, #13
-    mov             v8.16b,   v12.16b
-    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
-    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223
-    add             v2.4s,    v6.4s,    v4.4s
-    mov             v10.16b,  v14.16b
-    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
-    add             v2.4s,    v2.4s,    v12.4s
-    add             v12.4s,   v12.4s,   v12.4s
-    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447
-    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
-    sub             v2.4s,    v2.4s,    v12.4s
-    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
-    sub             v6.4s,    v6.4s,    v4.4s
-    shrn            ROW6R.4h, v2.4s,    #16
-    add             v2.4s,    v6.4s,    v10.4s
-    sub             v6.4s,    v6.4s,    v10.4s
-    sshll           v10.4s,   ROW4L.4h, #13
-    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
-    shrn            ROW5R.4h, v6.4s,    #16
-    add             v4.4s,    v10.4s,   v12.4s
-    sub             v2.4s,    v10.4s,   v12.4s
-    add             v12.4s,   v4.4s,    v14.4s
-    sub             v4.4s,    v4.4s,    v14.4s
-    add             v10.4s,   v2.4s,    v8.4s
-    sub             v6.4s,    v2.4s,    v8.4s
-    shrn            ROW7R.4h, v4.4s,    #16
-    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
-    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
-    shrn            ROW4R.4h, v6.4s,    #16
-    b               2b /* Go to epilogue */
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    mov             v6.16b, v15.16b
+    mov             v7.16b, v15.16b
+    mov             v8.16b, v15.16b
+    mov             v9.16b, v15.16b
+    b               1b
+
+.balign 16
+3:
+    cbnz            TMP4, 4f
+    /* Left AC coef is zero */
+    dup             v14.2d, v10.d[0]
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    mov             v2.16b, v14.16b
+    mov             v3.16b, v14.16b
+    mov             v4.16b, v14.16b
+    mov             v5.16b, v14.16b
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
+
+.balign 16
+4:
+    /* "No" AC coef is zero */
+    /* Even part: reverse the even part of the forward DCT. */
+    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    mov             v21.16b, v19.16b               /* tmp3 = z1 */
+    mov             v20.16b, v18.16b               /* tmp3 = z1 */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
+    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
+    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
+    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
+    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
+    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
+    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
+    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
+
+    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+
+    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
+    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
+    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
+    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
+
+    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
+    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
+    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
+    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
+    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
+    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
+    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
+    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
+
+    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
+    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
+    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
+    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
+    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
+    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
+    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
+    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
+    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
+    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
+    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
+    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
+    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
+    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
+    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
+    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
+    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
+    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
+    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
+    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
+    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
+    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
+    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
+
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    b               1b
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
@@ -733,23 +740,26 @@
     .unreq          TMP2
     .unreq          TMP3
     .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
 
-    .unreq          ROW0L
-    .unreq          ROW0R
-    .unreq          ROW1L
-    .unreq          ROW1R
-    .unreq          ROW2L
-    .unreq          ROW2R
-    .unreq          ROW3L
-    .unreq          ROW3R
-    .unreq          ROW4L
-    .unreq          ROW4R
-    .unreq          ROW5L
-    .unreq          ROW5R
-    .unreq          ROW6L
-    .unreq          ROW6R
-    .unreq          ROW7L
-    .unreq          ROW7R
+#undef CENTERJSAMPLE
+#undef CONST_BITS
+#undef PASS1_BITS
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
 
 
 /*****************************************************************************/
@@ -770,17 +780,17 @@
  * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
  */
 
-#define XFIX_1_082392200 v0.4h[0]
-#define XFIX_1_414213562 v0.4h[1]
-#define XFIX_1_847759065 v0.4h[2]
-#define XFIX_2_613125930 v0.4h[3]
+#define XFIX_1_082392200 v0.h[0]
+#define XFIX_1_414213562 v0.h[1]
+#define XFIX_1_847759065 v0.h[2]
+#define XFIX_2_613125930 v0.h[3]
 
 .balign 16
-jsimd_idct_ifast_neon_consts:
-    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
-    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
-    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
-    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+Ljsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200: (277/256 - 1) * 2^15, fraction only */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562: (362/256 - 1) * 2^15, fraction only */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065: (473/256 - 1) * 2^15, fraction only */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930: (669/256 - 2) * 2^15, fraction only */
 
 asm_function jsimd_idct_ifast_neon
 
@@ -790,261 +800,182 @@
     OUTPUT_COL      .req x3
     TMP1            .req x0
     TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x22
-    TMP5            .req x23
+    TMP3            .req x9
+    TMP4            .req x10
+    TMP5            .req x11
+    TMP6            .req x12
+    TMP7            .req x13
+    TMP8            .req x14
 
     /* Load and dequantize coefficients into NEON registers
      * with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
-     *   0 | d16     | d17     ( v8.8h  )
-     *   1 | d18     | d19     ( v9.8h  )
-     *   2 | d20     | d21     ( v10.8h )
-     *   3 | d22     | d23     ( v11.8h )
-     *   4 | d24     | d25     ( v12.8h )
-     *   5 | d26     | d27     ( v13.8h )
-     *   6 | d28     | d29     ( v14.8h )
-     *   7 | d30     | d31     ( v15.8h )
+     *   0 | d16     | d17     ( v16.8h )
+     *   1 | d18     | d19     ( v17.8h )
+     *   2 | d20     | d21     ( v18.8h )
+     *   3 | d22     | d23     ( v19.8h )
+     *   4 | d24     | d25     ( v20.8h )
+     *   5 | d26     | d27     ( v21.8h )
+     *   6 | d28     | d29     ( v22.8h )
+     *   7 | d30     | d31     ( v23.8h )
      */
     /* Save NEON registers used in fast IDCT */
-    sub             sp, sp, #176
-    stp             x22, x23, [sp], 16
-    adr             x23, jsimd_idct_ifast_neon_consts
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
+    adr             TMP5, Ljsimd_idct_ifast_neon_consts
+    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
-    mul             v8.8h,  v8.8h,  v0.8h
+    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
+    mul             v16.8h, v16.8h, v0.8h
     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v9.8h,  v9.8h,  v1.8h
-    ld1             {v12.8h, v13.8h}, [COEF_BLOCK], 32
-    mul             v10.8h, v10.8h, v2.8h
+    mul             v17.8h, v17.8h, v1.8h
+    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
+    mul             v18.8h, v18.8h, v2.8h
     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    mul             v11.8h, v11.8h, v3.8h
-    ld1             {v14.8h, v15.8h}, [COEF_BLOCK], 32
-    mul             v12.8h, v12.8h, v0.8h
+    mul             v19.8h, v19.8h, v3.8h
+    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
+    mul             v20.8h, v20.8h, v0.8h
     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v14.8h, v14.8h, v2.8h
-    mul             v13.8h, v13.8h, v1.8h
-    ld1             {v0.4h}, [x23]      /* load constants */
-    mul             v15.8h, v15.8h, v3.8h
+    mul             v22.8h, v22.8h, v2.8h
+    mul             v21.8h, v21.8h, v1.8h
+    ld1             {v0.4h}, [TMP5]        /* load constants */
+    mul             v23.8h, v23.8h, v3.8h
 
     /* 1-D IDCT, pass 1 */
-    sub             v2.8h,    v10.8h,   v14.8h
-    add             v14.8h,   v10.8h,   v14.8h
-    sub             v1.8h,    v11.8h,   v13.8h
-    add             v13.8h,   v11.8h,   v13.8h
-    sub             v5.8h,    v9.8h,    v15.8h
-    add             v15.8h,   v9.8h,    v15.8h
-    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
-    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
-    add             v3.8h,    v1.8h,    v1.8h
-    sub             v1.8h,    v5.8h,    v1.8h
-    add             v10.8h,   v2.8h,    v4.8h
-    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
-    sub             v2.8h,    v15.8h,   v13.8h
-    add             v3.8h,    v3.8h,    v6.8h
-    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
-    add             v1.8h,    v1.8h,    v4.8h
-    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
-    sub             v10.8h,   v10.8h,   v14.8h
-    add             v2.8h,    v2.8h,    v6.8h
-    sub             v6.8h,    v8.8h,    v12.8h
-    add             v12.8h,   v8.8h,    v12.8h
-    add             v9.8h,    v5.8h,    v4.8h
-    add             v5.8h,    v6.8h,    v10.8h
-    sub             v10.8h,   v6.8h,    v10.8h
-    add             v6.8h,    v15.8h,   v13.8h
-    add             v8.8h,    v12.8h,   v14.8h
-    sub             v3.8h,    v6.8h,    v3.8h
-    sub             v12.8h,   v12.8h,   v14.8h
-    sub             v3.8h,    v3.8h,    v1.8h
-    sub             v1.8h,    v9.8h,    v1.8h
-    add             v2.8h,    v3.8h,    v2.8h
-    sub             v15.8h,   v8.8h,    v6.8h
-    add             v1.8h,    v1.8h,    v2.8h
-    add             v8.8h,    v8.8h,    v6.8h
-    add             v14.8h,   v5.8h,    v3.8h
-    sub             v9.8h,    v5.8h,    v3.8h
-    sub             v13.8h,   v10.8h,   v2.8h
-    add             v10.8h,   v10.8h,   v2.8h
-    /* Transpose  q8-q9 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.8h,    v8.8h,    v9.8h
-    trn2            v9.8h,    v18.8h,   v9.8h
-    sub             v11.8h,   v12.8h,   v1.8h
-    /* Transpose  q14-q15 */
-    mov             v18.16b,  v14.16b
-    trn1            v14.8h,   v14.8h,   v15.8h
-    trn2            v15.8h,   v18.8h,   v15.8h
-    add             v12.8h,   v12.8h,   v1.8h
-    /* Transpose  q10-q11 */
-    mov             v18.16b,  v10.16b
-    trn1            v10.8h,   v10.8h,   v11.8h
-    trn2            v11.8h,   v18.8h,   v11.8h
-    /* Transpose  q12-q13 */
-    mov             v18.16b,  v12.16b
-    trn1            v12.8h,   v12.8h,   v13.8h
-    trn2            v13.8h,   v18.8h,   v13.8h
-    /* Transpose  q9-q11 */
-    mov             v18.16b,  v9.16b
-    trn1            v9.4s,    v9.4s,    v11.4s
-    trn2            v11.4s,   v18.4s,   v11.4s
-    /* Transpose  q12-q14 */
-    mov             v18.16b,  v12.16b
-    trn1            v12.4s,   v12.4s,   v14.4s
-    trn2            v14.4s,   v18.4s,   v14.4s
-    /* Transpose  q8-q10 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.4s,    v8.4s,    v10.4s
-    trn2            v10.4s,   v18.4s,   v10.4s
-    /* Transpose  q13-q15 */
-    mov             v18.16b,  v13.16b
-    trn1            v13.4s,   v13.4s,   v15.4s
-    trn2            v15.4s,   v18.4s,   v15.4s
-    /* vswp            v14.4h,   v10-MSB.4h */
-    umov            x22, v14.d[0]
-    ins             v14.2d[0], v10.2d[1]
-    ins             v10.2d[1], x22
-    /* vswp            v13.4h,   v9MSB.4h */
-
-    umov            x22, v13.d[0]
-    ins             v13.2d[0], v9.2d[1]
-    ins             v9.2d[1], x22
+    sub             v2.8h, v18.8h, v22.8h
+    add             v22.8h, v18.8h, v22.8h
+    sub             v1.8h, v19.8h, v21.8h
+    add             v21.8h, v19.8h, v21.8h
+    sub             v5.8h, v17.8h, v23.8h
+    add             v23.8h, v17.8h, v23.8h
+    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
+    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
+    add             v3.8h, v1.8h, v1.8h
+    sub             v1.8h, v5.8h, v1.8h
+    add             v18.8h, v2.8h, v4.8h
+    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
+    sub             v2.8h, v23.8h, v21.8h
+    add             v3.8h, v3.8h, v6.8h
+    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
+    add             v1.8h, v1.8h, v4.8h
+    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
+    sub             v18.8h, v18.8h, v22.8h
+    add             v2.8h, v2.8h, v6.8h
+    sub             v6.8h, v16.8h, v20.8h
+    add             v20.8h, v16.8h, v20.8h
+    add             v17.8h, v5.8h, v4.8h
+    add             v5.8h, v6.8h, v18.8h
+    sub             v18.8h, v6.8h, v18.8h
+    add             v6.8h, v23.8h, v21.8h
+    add             v16.8h, v20.8h, v22.8h
+    sub             v3.8h, v6.8h, v3.8h
+    sub             v20.8h, v20.8h, v22.8h
+    sub             v3.8h, v3.8h, v1.8h
+    sub             v1.8h, v17.8h, v1.8h
+    add             v2.8h, v3.8h, v2.8h
+    sub             v23.8h, v16.8h, v6.8h
+    add             v1.8h, v1.8h, v2.8h
+    add             v16.8h, v16.8h, v6.8h
+    add             v22.8h, v5.8h, v3.8h
+    sub             v17.8h, v5.8h, v3.8h
+    sub             v21.8h, v18.8h, v2.8h
+    add             v18.8h, v18.8h, v2.8h
+    sub             v19.8h, v20.8h, v1.8h
+    add             v20.8h, v20.8h, v1.8h
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
     /* 1-D IDCT, pass 2 */
-    sub             v2.8h,    v10.8h,   v14.8h
-    /* vswp            v15.4h,   v11MSB.4h */
-    umov            x22, v15.d[0]
-    ins             v15.2d[0], v11.2d[1]
-    ins             v11.2d[1], x22
-    add             v14.8h,   v10.8h,   v14.8h
-    /* vswp            v12.4h,   v8-MSB.4h */
-    umov            x22, v12.d[0]
-    ins             v12.2d[0], v8.2d[1]
-    ins             v8.2d[1], x22
-    sub             v1.8h,    v11.8h,   v13.8h
-    add             v13.8h,   v11.8h,   v13.8h
-    sub             v5.8h,    v9.8h,    v15.8h
-    add             v15.8h,   v9.8h,    v15.8h
-    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
-    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
-    add             v3.8h,    v1.8h,    v1.8h
-    sub             v1.8h,    v5.8h,    v1.8h
-    add             v10.8h,   v2.8h,    v4.8h
-    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
-    sub             v2.8h,    v15.8h,   v13.8h
-    add             v3.8h,    v3.8h,    v6.8h
-    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
-    add             v1.8h,    v1.8h,    v4.8h
-    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
-    sub             v10.8h,   v10.8h,   v14.8h
-    add             v2.8h,    v2.8h,    v6.8h
-    sub             v6.8h,    v8.8h,    v12.8h
-    add             v12.8h,   v8.8h,    v12.8h
-    add             v9.8h,    v5.8h,    v4.8h
-    add             v5.8h,    v6.8h,    v10.8h
-    sub             v10.8h,   v6.8h,    v10.8h
-    add             v6.8h,    v15.8h,   v13.8h
-    add             v8.8h,    v12.8h,   v14.8h
-    sub             v3.8h,    v6.8h,    v3.8h
-    sub             v12.8h,   v12.8h,   v14.8h
-    sub             v3.8h,    v3.8h,    v1.8h
-    sub             v1.8h,    v9.8h,    v1.8h
-    add             v2.8h,    v3.8h,    v2.8h
-    sub             v15.8h,   v8.8h,    v6.8h
-    add             v1.8h,    v1.8h,    v2.8h
-    add             v8.8h,    v8.8h,    v6.8h
-    add             v14.8h,   v5.8h,    v3.8h
-    sub             v9.8h,    v5.8h,    v3.8h
-    sub             v13.8h,   v10.8h,   v2.8h
-    add             v10.8h,   v10.8h,   v2.8h
-    sub             v11.8h,   v12.8h,   v1.8h
-    add             v12.8h,   v12.8h,   v1.8h
+    sub             v2.8h, v18.8h, v22.8h
+    add             v22.8h, v18.8h, v22.8h
+    sub             v1.8h, v19.8h, v21.8h
+    add             v21.8h, v19.8h, v21.8h
+    sub             v5.8h, v17.8h, v23.8h
+    add             v23.8h, v17.8h, v23.8h
+    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
+    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
+    add             v3.8h, v1.8h, v1.8h
+    sub             v1.8h, v5.8h, v1.8h
+    add             v18.8h, v2.8h, v4.8h
+    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
+    sub             v2.8h, v23.8h, v21.8h
+    add             v3.8h, v3.8h, v6.8h
+    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
+    add             v1.8h, v1.8h, v4.8h
+    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
+    sub             v18.8h, v18.8h, v22.8h
+    add             v2.8h, v2.8h, v6.8h
+    sub             v6.8h, v16.8h, v20.8h
+    add             v20.8h, v16.8h, v20.8h
+    add             v17.8h, v5.8h, v4.8h
+    add             v5.8h, v6.8h, v18.8h
+    sub             v18.8h, v6.8h, v18.8h
+    add             v6.8h, v23.8h, v21.8h
+    add             v16.8h, v20.8h, v22.8h
+    sub             v3.8h, v6.8h, v3.8h
+    sub             v20.8h, v20.8h, v22.8h
+    sub             v3.8h, v3.8h, v1.8h
+    sub             v1.8h, v17.8h, v1.8h
+    add             v2.8h, v3.8h, v2.8h
+    sub             v23.8h, v16.8h, v6.8h
+    add             v1.8h, v1.8h, v2.8h
+    add             v16.8h, v16.8h, v6.8h
+    add             v22.8h, v5.8h, v3.8h
+    sub             v17.8h, v5.8h, v3.8h
+    sub             v21.8h, v18.8h, v2.8h
+    add             v18.8h, v18.8h, v2.8h
+    sub             v19.8h, v20.8h, v1.8h
+    add             v20.8h, v20.8h, v1.8h
     /* Descale to 8-bit and range limit */
-    movi            v0.16b,   #0x80
-    sqshrn          v8.8b,    v8.8h,    #5
-    sqshrn2         v8.16b,   v9.8h,    #5
-    sqshrn          v9.8b,    v10.8h,   #5
-    sqshrn2         v9.16b,   v11.8h,   #5
-    sqshrn          v10.8b,   v12.8h,   #5
-    sqshrn2         v10.16b,  v13.8h,   #5
-    sqshrn          v11.8b,   v14.8h,   #5
-    sqshrn2         v11.16b,  v15.8h,   #5
-    add             v8.16b,   v8.16b,   v0.16b
-    add             v9.16b,   v9.16b,   v0.16b
-    add             v10.16b,  v10.16b,  v0.16b
-    add             v11.16b,  v11.16b,  v0.16b
+    movi            v0.16b, #0x80
+      /* Prepare pointers (dual-issue with NEON instructions) */
+      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
+    sqshrn          v28.8b, v16.8h, #5
+      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
+    sqshrn          v29.8b, v17.8h, #5
+      add             TMP1, TMP1, OUTPUT_COL
+    sqshrn          v30.8b, v18.8h, #5
+      add             TMP2, TMP2, OUTPUT_COL
+    sqshrn          v31.8b, v19.8h, #5
+      add             TMP3, TMP3, OUTPUT_COL
+    sqshrn2         v28.16b, v20.8h, #5
+      add             TMP4, TMP4, OUTPUT_COL
+    sqshrn2         v29.16b, v21.8h, #5
+      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
+    sqshrn2         v30.16b, v22.8h, #5
+      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
+    sqshrn2         v31.16b, v23.8h, #5
+      add             TMP5, TMP5, OUTPUT_COL
+    add             v16.16b, v28.16b, v0.16b
+      add             TMP6, TMP6, OUTPUT_COL
+    add             v18.16b, v29.16b, v0.16b
+      add             TMP7, TMP7, OUTPUT_COL
+    add             v20.16b, v30.16b, v0.16b
+      add             TMP8, TMP8, OUTPUT_COL
+    add             v22.16b, v31.16b, v0.16b
+
     /* Transpose the final 8-bit samples */
-    /* Transpose  q8-q9 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.8h,    v8.8h,    v9.8h
-    trn2            v9.8h,    v18.8h,   v9.8h
-    /* Transpose  q10-q11 */
-    mov             v18.16b,  v10.16b
-    trn1            v10.8h,   v10.8h,   v11.8h
-    trn2            v11.8h,   v18.8h,   v11.8h
-    /* Transpose  q8-q10 */
-    mov             v18.16b,  v8.16b
-    trn1            v8.4s,    v8.4s,    v10.4s
-    trn2            v10.4s,   v18.4s,   v10.4s
-    /* Transpose  q9-q11 */
-    mov             v18.16b,  v9.16b
-    trn1            v9.4s,    v9.4s,    v11.4s
-    trn2            v11.4s,   v18.4s,   v11.4s
-    /* make copy */
-    ins             v17.2d[0], v8.2d[1]
-    /* Transpose  d16-d17-msb */
-    mov             v18.16b,  v8.16b
-    trn1            v8.8b,    v8.8b,    v17.8b
-    trn2            v17.8b,   v18.8b,   v17.8b
-    /* make copy */
-    ins             v19.2d[0], v9.2d[1]
-    mov             v18.16b,  v9.16b
-    trn1            v9.8b,    v9.8b,    v19.8b
-    trn2            v19.8b,   v18.8b,   v19.8b
+    trn1            v28.16b, v16.16b, v18.16b
+    trn1            v30.16b, v20.16b, v22.16b
+    trn2            v29.16b, v16.16b, v18.16b
+    trn2            v31.16b, v20.16b, v22.16b
+
+    trn1            v16.8h, v28.8h, v30.8h
+    trn2            v18.8h, v28.8h, v30.8h
+    trn1            v20.8h, v29.8h, v31.8h
+    trn2            v22.8h, v29.8h, v31.8h
+
+    uzp1            v28.4s, v16.4s, v18.4s
+    uzp2            v30.4s, v16.4s, v18.4s
+    uzp1            v29.4s, v20.4s, v22.4s
+    uzp2            v31.4s, v20.4s, v22.4s
+
     /* Store results to the output buffer */
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v8.8b},  [TMP1]
-    st1             {v17.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    st1             {v9.8b},  [TMP1]
-    /* make copy */
-    ins             v7.2d[0], v10.2d[1]
-    mov             v18.16b,  v10.16b
-    trn1            v10.8b,   v10.8b,   v7.8b
-    trn2            v7.8b,    v18.8b,   v7.8b
-    st1             {v19.8b}, [TMP2]
-    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
-    ldp             TMP4,     TMP5,     [OUTPUT_BUF], 16
-    add             TMP1,     TMP1,     OUTPUT_COL
-    add             TMP2,     TMP2,     OUTPUT_COL
-    add             TMP4,     TMP4,     OUTPUT_COL
-    add             TMP5,     TMP5,     OUTPUT_COL
-    st1             {v10.8b}, [TMP1]
-    /* make copy */
-    ins             v16.2d[0], v11.2d[1]
-    mov             v18.16b,  v11.16b
-    trn1            v11.8b,   v11.8b,   v16.8b
-    trn2            v16.8b,   v18.8b,   v16.8b
-    st1             {v7.8b},  [TMP2]
-    st1             {v11.8b}, [TMP4]
-    st1             {v16.8b}, [TMP5]
-    sub             sp, sp, #176
-    ldp             x22, x23, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
+    st1             {v28.d}[0], [TMP1]
+    st1             {v29.d}[0], [TMP2]
+    st1             {v28.d}[1], [TMP3]
+    st1             {v29.d}[1], [TMP4]
+    st1             {v30.d}[0], [TMP5]
+    st1             {v31.d}[0], [TMP6]
+    st1             {v30.d}[1], [TMP7]
+    st1             {v31.d}[1], [TMP8]
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1055,6 +986,10 @@
     .unreq          TMP2
     .unreq          TMP3
     .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
 
 
 /*****************************************************************************/
@@ -1079,81 +1014,80 @@
 
 #define CONST_BITS  13
 
-#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
 
 .balign 16
-jsimd_idct_4x4_neon_consts:
-    .short     FIX_1_847759065     /* v0.4h[0] */
-    .short     -FIX_0_765366865    /* v0.4h[1] */
-    .short     -FIX_0_211164243    /* v0.4h[2] */
-    .short     FIX_1_451774981     /* v0.4h[3] */
-    .short     -FIX_2_172734803    /* d1[0] */
-    .short     FIX_1_061594337     /* d1[1] */
-    .short     -FIX_0_509795579    /* d1[2] */
-    .short     -FIX_0_601344887    /* d1[3] */
-    .short     FIX_0_899976223     /* v2.4h[0] */
-    .short     FIX_2_562915447     /* v2.4h[1] */
-    .short     1 << (CONST_BITS+1) /* v2.4h[2] */
-    .short     0                   /* v2.4h[3] */
+Ljsimd_idct_4x4_neon_consts:
+  .short FIX_1_847759065      /* v0.h[0] */
+  .short -FIX_0_765366865     /* v0.h[1] */
+  .short -FIX_0_211164243     /* v0.h[2] */
+  .short FIX_1_451774981      /* v0.h[3] */
+  .short -FIX_2_172734803     /* v1.h[0] */
+  .short FIX_1_061594337      /* v1.h[1] */
+  .short -FIX_0_509795579     /* v1.h[2] */
+  .short -FIX_0_601344887     /* v1.h[3] */
+  .short FIX_0_899976223      /* v2.h[0] */
+  .short FIX_2_562915447      /* v2.h[1] */
+  .short 1 << (CONST_BITS+1)  /* v2.h[2] */
+  .short 0                    /* v2.h[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4,    v2.4h[2]
-    smlal           v28.4s, \x8,    v0.4h[0]
-    smlal           v28.4s, \x14,   v0.4h[1]
+    smull           v28.4s, \x4, v2.h[2]
+    smlal           v28.4s, \x8, v0.h[0]
+    smlal           v28.4s, \x14, v0.h[1]
 
-    smull           v26.4s, \x16,   v1.4h[2]
-    smlal           v26.4s, \x12,   v1.4h[3]
-    smlal           v26.4s, \x10,   v2.4h[0]
-    smlal           v26.4s, \x6,    v2.4h[1]
+    smull           v26.4s, \x16, v1.h[2]
+    smlal           v26.4s, \x12, v1.h[3]
+    smlal           v26.4s, \x10, v2.h[0]
+    smlal           v26.4s, \x6, v2.h[1]
 
-    smull           v30.4s, \x4,    v2.4h[2]
-    smlsl           v30.4s, \x8,    v0.4h[0]
-    smlsl           v30.4s, \x14,   v0.4h[1]
+    smull           v30.4s, \x4, v2.h[2]
+    smlsl           v30.4s, \x8, v0.h[0]
+    smlsl           v30.4s, \x14, v0.h[1]
 
-    smull           v24.4s, \x16,   v0.4h[2]
-    smlal           v24.4s, \x12,   v0.4h[3]
-    smlal           v24.4s, \x10,   v1.4h[0]
-    smlal           v24.4s, \x6,    v1.4h[1]
+    smull           v24.4s, \x16, v0.h[2]
+    smlal           v24.4s, \x12, v0.h[3]
+    smlal           v24.4s, \x10, v1.h[0]
+    smlal           v24.4s, \x6, v1.h[1]
 
     add             v20.4s, v28.4s, v26.4s
     sub             v28.4s, v28.4s, v26.4s
 
-.if \shift > 16
+  .if \shift > 16
     srshr           v20.4s, v20.4s, #\shift
     srshr           v28.4s, v28.4s, #\shift
-    xtn             \y26,   v20.4s
-    xtn             \y29,   v28.4s
-.else
-    rshrn           \y26,   v20.4s, #\shift
-    rshrn           \y29,   v28.4s, #\shift
-.endif
+    xtn             \y26, v20.4s
+    xtn             \y29, v28.4s
+  .else
+    rshrn           \y26, v20.4s, #\shift
+    rshrn           \y29, v28.4s, #\shift
+  .endif
 
     add             v20.4s, v30.4s, v24.4s
     sub             v30.4s, v30.4s, v24.4s
 
-.if \shift > 16
+  .if \shift > 16
     srshr           v20.4s, v20.4s, #\shift
     srshr           v30.4s, v30.4s, #\shift
-    xtn             \y27,   v20.4s
-    xtn             \y28,   v30.4s
-.else
-    rshrn           \y27,   v20.4s, #\shift
-    rshrn           \y28,   v30.4s, #\shift
-.endif
-
+    xtn             \y27, v20.4s
+    xtn             \y28, v30.4s
+  .else
+    rshrn           \y27, v20.4s, #\shift
+    rshrn           \y28, v30.4s, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_4x4_neon
@@ -1171,15 +1105,15 @@
     sub             sp, sp, 272
     str             x15, [sp], 16
     /* Load constants (v3.4h is just used for padding) */
-    adr             TMP4, jsimd_idct_4x4_neon_consts
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v20.8b - v23.8b}, [sp], 32
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v28.8b - v31.8b}, [sp], 32
+    adr             TMP4, Ljsimd_idct_4x4_neon_consts
+    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1203,45 +1137,49 @@
     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
     mul             v4.4h, v4.4h, v18.4h
     mul             v5.4h, v5.4h, v19.4h
-    ins             v4.2d[1], v5.2d[0]    /* 128 bit q4 */
+    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
     ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
     mul             v6.4h, v6.4h, v20.4h
     mul             v7.4h, v7.4h, v21.4h
-    ins             v6.2d[1], v7.2d[0]    /* 128 bit q6 */
+    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
     mul             v8.4h, v8.4h, v22.4h
     mul             v9.4h, v9.4h, v23.4h
-    ins             v8.2d[1], v9.2d[0]    /* 128 bit q8 */
+    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
     mul             v10.4h, v10.4h, v24.4h
     mul             v11.4h, v11.4h, v25.4h
-    ins             v10.2d[1], v11.2d[0]  /* 128 bit q10 */
+    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
     mul             v12.4h, v12.4h, v26.4h
     mul             v13.4h, v13.4h, v27.4h
-    ins             v12.2d[1], v13.2d[0]  /* 128 bit q12 */
+    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
     mul             v14.4h, v14.4h, v28.4h
     mul             v15.4h, v15.4h, v29.4h
-    ins             v14.2d[1], v15.2d[0]  /* 128 bit q14 */
+    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
     mul             v16.4h, v16.4h, v30.4h
     mul             v17.4h, v17.4h, v31.4h
-    ins             v16.2d[1], v17.2d[0]  /* 128 bit q16 */
+    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
 
     /* Pass 1 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
+                    v4.4h, v6.4h, v8.4h, v10.4h
     transpose_4x4   v4, v6, v8, v10, v3
-    ins             v10.2d[1], v11.2d[0]
-    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
+    ins             v10.d[1], v11.d[0]
+    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
+                    v5.4h, v7.4h, v9.4h, v11.4h
     transpose_4x4   v5, v7, v9, v11, v3
-    ins             v10.2d[1], v11.2d[0]
+    ins             v10.d[1], v11.d[0]
+
     /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
+                    v26.4h, v27.4h, v28.4h, v29.4h
     transpose_4x4   v26, v27, v28, v29, v3
 
     /* Range limit */
     movi            v30.8h, #0x80
-    ins             v26.2d[1], v27.2d[0]
-    ins             v28.2d[1], v29.2d[0]
+    ins             v26.d[1], v27.d[0]
+    ins             v28.d[1], v29.d[0]
     add             v26.8h, v26.8h, v30.8h
     add             v28.8h, v28.8h, v30.8h
     sqxtun          v26.8b, v26.8h
@@ -1286,14 +1224,14 @@
     /* vpop            {v8.4h - v15.4h}    ;not available */
     sub             sp, sp, #272
     ldr             x15, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v20.8b - v23.8b}, [sp], 32
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v28.8b - v31.8b}, [sp], 32
+    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1325,32 +1263,31 @@
  */
 
 .balign 8
-jsimd_idct_2x2_neon_consts:
-    .short     -FIX_0_720959822    /* v14[0] */
-    .short     FIX_0_850430095     /* v14[1] */
-    .short     -FIX_1_272758580    /* v14[2] */
-    .short     FIX_3_624509785     /* v14[3] */
+Ljsimd_idct_2x2_neon_consts:
+  .short -FIX_0_720959822  /* v14.h[0] */
+  .short FIX_0_850430095   /* v14.h[1] */
+  .short -FIX_1_272758580  /* v14.h[2] */
+  .short FIX_3_624509785   /* v14.h[3] */
 
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll      v15.4s, \x4,    #15
-    smull      v26.4s, \x6,    v14.4h[3]
-    smlal      v26.4s, \x10,   v14.4h[2]
-    smlal      v26.4s, \x12,   v14.4h[1]
-    smlal      v26.4s, \x16,   v14.4h[0]
+    sshll           v15.4s, \x4, #15
+    smull           v26.4s, \x6, v14.h[3]
+    smlal           v26.4s, \x10, v14.h[2]
+    smlal           v26.4s, \x12, v14.h[1]
+    smlal           v26.4s, \x16, v14.h[0]
 
-    add        v20.4s, v15.4s, v26.4s
-    sub        v15.4s, v15.4s, v26.4s
+    add             v20.4s, v15.4s, v26.4s
+    sub             v15.4s, v15.4s, v26.4s
 
-.if \shift > 16
-    srshr      v20.4s, v20.4s, #\shift
-    srshr      v15.4s, v15.4s, #\shift
-    xtn        \y26,   v20.4s
-    xtn        \y27,   v15.4s
-.else
-    rshrn      \y26,   v20.4s, #\shift
-    rshrn      \y27,   v15.4s, #\shift
-.endif
-
+  .if \shift > 16
+    srshr           v20.4s, v20.4s, #\shift
+    srshr           v15.4s, v15.4s, #\shift
+    xtn             \y26, v20.4s
+    xtn             \y27, v15.4s
+  .else
+    rshrn           \y26, v20.4s, #\shift
+    rshrn           \y27, v15.4s, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_2x2_neon
@@ -1367,14 +1304,14 @@
     str             x15, [sp], 16
 
     /* Load constants */
-    adr             TMP2, jsimd_idct_2x2_neon_consts
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v21.8b - v22.8b}, [sp], 16
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v30.8b - v31.8b}, [sp], 16
+    adr             TMP2, Ljsimd_idct_2x2_neon_consts
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v21.8b, v22.8b}, [sp], 16
+    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    st1             {v30.8b, v31.8b}, [sp], 16
     ld1             {v14.4h}, [TMP2]
 
     /* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1400,57 +1337,57 @@
     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
     mul             v4.4h, v4.4h, v18.4h
     mul             v5.4h, v5.4h, v19.4h
-    ins             v4.2d[1], v5.2d[0]
+    ins             v4.d[1], v5.d[0]
     mul             v6.4h, v6.4h, v20.4h
     mul             v7.4h, v7.4h, v21.4h
-    ins             v6.2d[1], v7.2d[0]
+    ins             v6.d[1], v7.d[0]
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
     mul             v10.4h, v10.4h, v24.4h
     mul             v11.4h, v11.4h, v25.4h
-    ins             v10.2d[1], v11.2d[0]
+    ins             v10.d[1], v11.d[0]
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
     mul             v12.4h, v12.4h, v26.4h
     mul             v13.4h, v13.4h, v27.4h
-    ins             v12.2d[1], v13.2d[0]
+    ins             v12.d[1], v13.d[0]
     add             DCT_TABLE, DCT_TABLE, #16
     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
     mul             v16.4h, v16.4h, v30.4h
     mul             v17.4h, v17.4h, v31.4h
-    ins             v16.2d[1], v17.2d[0]
+    ins             v16.d[1], v17.d[0]
 
     /* Pass 1 */
 #if 0
     idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
-    transpose_4x4   v4.4h, v6.4h, v8.4h,  v10.4h
+    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
     idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
-    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
+    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
 #else
-    smull           v26.4s, v6.4h,  v14.4h[3]
-    smlal           v26.4s, v10.4h, v14.4h[2]
-    smlal           v26.4s, v12.4h, v14.4h[1]
-    smlal           v26.4s, v16.4h, v14.4h[0]
-    smull           v24.4s, v7.4h,  v14.4h[3]
-    smlal           v24.4s, v11.4h, v14.4h[2]
-    smlal           v24.4s, v13.4h, v14.4h[1]
-    smlal           v24.4s, v17.4h, v14.4h[0]
-    sshll           v15.4s, v4.4h,  #15
-    sshll           v30.4s, v5.4h,  #15
+    smull           v26.4s, v6.4h, v14.h[3]
+    smlal           v26.4s, v10.4h, v14.h[2]
+    smlal           v26.4s, v12.4h, v14.h[1]
+    smlal           v26.4s, v16.4h, v14.h[0]
+    smull           v24.4s, v7.4h, v14.h[3]
+    smlal           v24.4s, v11.4h, v14.h[2]
+    smlal           v24.4s, v13.4h, v14.h[1]
+    smlal           v24.4s, v17.4h, v14.h[0]
+    sshll           v15.4s, v4.4h, #15
+    sshll           v30.4s, v5.4h, #15
     add             v20.4s, v15.4s, v26.4s
     sub             v15.4s, v15.4s, v26.4s
-    rshrn           v4.4h,  v20.4s, #13
-    rshrn           v6.4h,  v15.4s, #13
+    rshrn           v4.4h, v20.4s, #13
+    rshrn           v6.4h, v15.4s, #13
     add             v20.4s, v30.4s, v24.4s
     sub             v15.4s, v30.4s, v24.4s
-    rshrn           v5.4h,  v20.4s, #13
-    rshrn           v7.4h,  v15.4s, #13
-    ins             v4.2d[1], v5.2d[0]
-    ins             v6.2d[1], v7.2d[0]
+    rshrn           v5.4h, v20.4s, #13
+    rshrn           v7.4h, v15.4s, #13
+    ins             v4.d[1], v5.d[0]
+    ins             v6.d[1], v7.d[0]
     transpose       v4, v6, v3, .16b, .8h
     transpose       v6, v10, v3, .16b, .4s
-    ins             v11.2d[0], v10.2d[1]
-    ins             v7.2d[0], v6.2d[1]
+    ins             v11.d[0], v10.d[1]
+    ins             v7.d[0], v6.d[1]
 #endif
 
     /* Pass 2 */
@@ -1458,10 +1395,10 @@
 
     /* Range limit */
     movi            v30.8h, #0x80
-    ins             v26.2d[1], v27.2d[0]
+    ins             v26.d[1], v27.d[0]
     add             v26.8h, v26.8h, v30.8h
     sqxtun          v30.8b, v26.8h
-    ins             v26.2d[0], v30.2d[0]
+    ins             v26.d[0], v30.d[0]
     sqxtun          v27.8b, v26.8h
 
     /* Store results to the output buffer */
@@ -1476,13 +1413,13 @@
 
     sub             sp, sp, #208
     ldr             x15, [sp], 16
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v21.8b - v22.8b}, [sp], 16
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v30.8b - v31.8b}, [sp], 16
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    ld1             {v21.8b, v22.8b}, [sp], 16
+    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    ld1             {v30.8b, v31.8b}, [sp], 16
     blr             x30
 
     .unreq          DCT_TABLE
@@ -1508,188 +1445,222 @@
  * Colorspace conversion YCbCr -> RGB
  */
 
-
 .macro do_load size
+  .if \size == 8                     /* 8 pixels: full 64-bit loads */
+    ld1             {v4.8b}, [U], 8
+    ld1             {v5.8b}, [V], 8
+    ld1             {v0.8b}, [Y], 8
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+  .elseif \size == 4                 /* 4 pixels -> byte lanes 0-3 */
+    ld1             {v4.b}[0], [U], 1
+    ld1             {v4.b}[1], [U], 1
+    ld1             {v4.b}[2], [U], 1
+    ld1             {v4.b}[3], [U], 1
+    ld1             {v5.b}[0], [V], 1
+    ld1             {v5.b}[1], [V], 1
+    ld1             {v5.b}[2], [V], 1
+    ld1             {v5.b}[3], [V], 1
+    ld1             {v0.b}[0], [Y], 1
+    ld1             {v0.b}[1], [Y], 1
+    ld1             {v0.b}[2], [Y], 1
+    ld1             {v0.b}[3], [Y], 1
+  .elseif \size == 2                 /* 2 pixels -> byte lanes 4-5 */
+    ld1             {v4.b}[4], [U], 1
+    ld1             {v4.b}[5], [U], 1
+    ld1             {v5.b}[4], [V], 1
+    ld1             {v5.b}[5], [V], 1
+    ld1             {v0.b}[4], [Y], 1
+    ld1             {v0.b}[5], [Y], 1
+  .elseif \size == 1                 /* 1 pixel  -> byte lane 6 */
+    ld1             {v4.b}[6], [U], 1
+    ld1             {v5.b}[6], [V], 1
+    ld1             {v0.b}[6], [Y], 1
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm  /* do_load: U -> v4, V -> v5, Y -> v0; 4/2/1 use disjoint lanes */
+
+.macro do_store bpp, size, fast_st3
+  .if \bpp == 24                     /* 3 bytes/pixel from v10, v11, v12 */
     .if \size == 8
-        ld1  {v4.8b}, [U], 8
-        ld1  {v5.8b}, [V], 8
-        ld1  {v0.8b}, [Y], 8
-        prfm PLDL1KEEP, [U, #64]
-        prfm PLDL1KEEP, [V, #64]
-        prfm PLDL1KEEP, [Y, #64]
+      .if \fast_st3 == 1             /* one interleaving 24-byte st3 */
+        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
+      .else                          /* same byte order, 24 one-byte st1 */
+        st1         {v10.b}[0], [RGB], #1
+        st1         {v11.b}[0], [RGB], #1
+        st1         {v12.b}[0], [RGB], #1
+
+        st1         {v10.b}[1], [RGB], #1
+        st1         {v11.b}[1], [RGB], #1
+        st1         {v12.b}[1], [RGB], #1
+
+        st1         {v10.b}[2], [RGB], #1
+        st1         {v11.b}[2], [RGB], #1
+        st1         {v12.b}[2], [RGB], #1
+
+        st1         {v10.b}[3], [RGB], #1
+        st1         {v11.b}[3], [RGB], #1
+        st1         {v12.b}[3], [RGB], #1
+
+        st1         {v10.b}[4], [RGB], #1
+        st1         {v11.b}[4], [RGB], #1
+        st1         {v12.b}[4], [RGB], #1
+
+        st1         {v10.b}[5], [RGB], #1
+        st1         {v11.b}[5], [RGB], #1
+        st1         {v12.b}[5], [RGB], #1
+
+        st1         {v10.b}[6], [RGB], #1
+        st1         {v11.b}[6], [RGB], #1
+        st1         {v12.b}[6], [RGB], #1
+
+        st1         {v10.b}[7], [RGB], #1
+        st1         {v11.b}[7], [RGB], #1
+        st1         {v12.b}[7], [RGB], #1
+      .endif
     .elseif \size == 4
-        ld1  {v4.b}[0], [U], 1
-        ld1  {v4.b}[1], [U], 1
-        ld1  {v4.b}[2], [U], 1
-        ld1  {v4.b}[3], [U], 1
-        ld1  {v5.b}[0], [V], 1
-        ld1  {v5.b}[1], [V], 1
-        ld1  {v5.b}[2], [V], 1
-        ld1  {v5.b}[3], [V], 1
-        ld1  {v0.b}[0], [Y], 1
-        ld1  {v0.b}[1], [Y], 1
-        ld1  {v0.b}[2], [Y], 1
-        ld1  {v0.b}[3], [Y], 1
+      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
     .elseif \size == 2
-        ld1  {v4.b}[4], [U], 1
-        ld1  {v4.b}[5], [U], 1
-        ld1  {v5.b}[4], [V], 1
-        ld1  {v5.b}[5], [V], 1
-        ld1  {v0.b}[4], [Y], 1
-        ld1  {v0.b}[5], [Y], 1
+      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
+      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
     .elseif \size == 1
-        ld1  {v4.b}[6], [U], 1
-        ld1  {v5.b}[6], [V], 1
-        ld1  {v0.b}[6], [Y], 1
+      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
     .else
-        .error unsupported macroblock size
+     .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32                 /* 4 bytes/pixel from v10-v13 */
+    .if \size == 8
+      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+    .elseif \size == 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
+    .elseif \size == 2
+      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
+      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
+    .elseif \size == 1
+      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp==16                   /* packed 16-bit pixels from v25 */
+    .if \size == 8
+      st1           {v25.8h}, [RGB], 16
+    .elseif \size == 4
+      st1           {v25.4h}, [RGB], 8
+    .elseif \size == 2
+      st1           {v25.h}[4], [RGB], 2
+      st1           {v25.h}[5], [RGB], 2
+    .elseif \size == 1
+      st1           {v25.h}[6], [RGB], 2
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
-.macro do_store bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            st3  {v10.8b, v11.8b, v12.8b}, [RGB], 24
-        .elseif \size == 4
-            st3  {v10.b, v11.b, v12.b}[0], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[1], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[2], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[3], [RGB], 3
-        .elseif \size == 2
-            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
-            st3  {v10.b, v11.b, v12.b}[5], [RGB], 3
-        .elseif \size == 1
-            st3  {v10.b, v11.b, v12.b}[6], [RGB], 3
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            st4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
-        .elseif \size == 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
-        .elseif \size == 2
-            st4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
-            st4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
-        .elseif \size == 1
-            st4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp==16
-        .if \size == 8
-            st1  {v25.8h}, [RGB],16
-        .elseif \size == 4
-            st1  {v25.4h}, [RGB],8
-        .elseif \size == 2
-            st1  {v25.h}[4], [RGB],2
-            st1  {v25.h}[5], [RGB],2
-        .elseif \size == 1
-            st1  {v25.h}[6], [RGB],2
-        .else
-            .error unsupported macroblock size
-        .endif
-     .else
-        .error unsupported bpp
-    .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
+                                           g_offs, gsize, b_offs, bsize, \
+                                           defsize, fast_st3
 
 /*
  * 2-stage pipelined YCbCr->RGB conversion
  */
 
 .macro do_yuv_to_rgb_stage1
-    uaddw        v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
-    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb_stage2
-    rshrn        v20.4h, v20.4s, #15
-    rshrn2       v20.8h, v22.4s, #15
-    rshrn        v24.4h, v24.4s, #14
-    rshrn2       v24.8h, v26.4s, #14
-    rshrn        v28.4h, v28.4s, #14
-    rshrn2       v28.8h, v30.4s, #14
-    uaddw        v20.8h, v20.8h, v0.8b
-    uaddw        v24.8h, v24.8h, v0.8b
-    uaddw        v28.8h, v28.8h, v0.8b
-.if \bpp != 16
-    sqxtun       v1\g_offs\defsize, v20.8h
-    sqxtun       v1\r_offs\defsize, v24.8h
-    sqxtun       v1\b_offs\defsize, v28.8h
-.else
-    sqshlu       v21.8h, v20.8h, #8
-    sqshlu       v25.8h, v24.8h, #8
-    sqshlu       v29.8h, v28.8h, #8
-    sri          v25.8h, v21.8h, #5
-    sri          v25.8h, v29.8h, #11
-.endif
-
+    rshrn           v20.4h, v20.4s, #15     /* narrow stage1 products */
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    uaddw           v20.8h, v20.8h, v0.8b   /* g = y + (u, v) term */
+    uaddw           v24.8h, v24.8h, v0.8b   /* r = y + v term */
+    uaddw           v28.8h, v28.8h, v0.8b   /* b = y + u term */
+  .if \bpp != 16                            /* saturate to 8-bit channels */
+    sqxtun          v1\g_offs\defsize, v20.8h
+    sqxtun          v1\r_offs\defsize, v24.8h
+    sqxtun          v1\b_offs\defsize, v28.8h
+  .else                                     /* pack 5/6/5 bits into v25 */
+    sqshlu          v21.8h, v20.8h, #8      /* g: unsigned saturate, << 8 */
+    sqshlu          v25.8h, v24.8h, #8      /* r: unsigned saturate, << 8 */
+    sqshlu          v29.8h, v28.8h, #8      /* b: unsigned saturate, << 8 */
+    sri             v25.8h, v21.8h, #5      /* keep bits 15:11, insert g */
+    sri             v25.8h, v29.8h, #11     /* keep bits 15:5, insert b */
+  .endif
 .endm
 
-.macro do_yuv_to_rgb_stage2_store_load_stage1
-    rshrn        v20.4h, v20.4s, #15
-    rshrn        v24.4h, v24.4s, #14
-    rshrn        v28.4h, v28.4s, #14
-    ld1          {v4.8b}, [U], 8
-    rshrn2       v20.8h, v22.4s, #15
-    rshrn2       v24.8h, v26.4s, #14
-    rshrn2       v28.8h, v30.4s, #14
-    ld1          {v5.8b}, [V], 8
-    uaddw        v20.8h, v20.8h, v0.8b
-    uaddw        v24.8h, v24.8h, v0.8b
-    uaddw        v28.8h, v28.8h, v0.8b
-.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
-    sqxtun       v1\g_offs\defsize, v20.8h
-    ld1          {v0.8b}, [Y], 8
-    sqxtun       v1\r_offs\defsize, v24.8h
-    prfm         PLDL1KEEP, [U, #64]
-    prfm         PLDL1KEEP, [V, #64]
-    prfm         PLDL1KEEP, [Y, #64]
-    sqxtun       v1\b_offs\defsize, v28.8h
-    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
-    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
-.else /**************************** rgb565 ***********************************/
-    sqshlu       v21.8h, v20.8h, #8
-    sqshlu       v25.8h, v24.8h, #8
-    sqshlu       v29.8h, v28.8h, #8
-    uaddw        v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
-    uaddw        v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
-    ld1          {v0.8b}, [Y], 8
-    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
-    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
-    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
-    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
-    sri          v25.8h, v21.8h, #5
-    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
-    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
-    prfm         PLDL1KEEP, [U, #64]
-    prfm         PLDL1KEEP, [V, #64]
-    prfm         PLDL1KEEP, [Y, #64]
-    sri          v25.8h, v29.8h, #11
-.endif
-    do_store     \bpp, 8
-    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
-    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
+.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
+    rshrn           v20.4h, v20.4s, #15
+    rshrn           v24.4h, v24.4s, #14
+    rshrn           v28.4h, v28.4s, #14
+    ld1             {v4.8b}, [U], 8         /* U for the next 8 pixels */
+    rshrn2          v20.8h, v22.4s, #15
+    rshrn2          v24.8h, v26.4s, #14
+    rshrn2          v28.8h, v30.4s, #14
+    ld1             {v5.8b}, [V], 8         /* V for the next 8 pixels */
+    uaddw           v20.8h, v20.8h, v0.8b
+    uaddw           v24.8h, v24.8h, v0.8b
+    uaddw           v28.8h, v28.8h, v0.8b
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
+    sqxtun          v1\g_offs\defsize, v20.8h
+    ld1             {v0.8b}, [Y], 8         /* Y for the next 8 pixels */
+    sqxtun          v1\r_offs\defsize, v24.8h
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sqxtun          v1\b_offs\defsize, v28.8h
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+  .else  /**************************** rgb565 ********************************/
+    sqshlu          v21.8h, v20.8h, #8
+    sqshlu          v25.8h, v24.8h, #8
+    sqshlu          v29.8h, v28.8h, #8
+    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
+    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
+    ld1             {v0.8b}, [Y], 8         /* Y for the next 8 pixels */
+    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
+    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
+    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
+    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
+    sri             v25.8h, v21.8h, #5
+    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
+    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
+    prfm            pldl1keep, [U, #64]
+    prfm            pldl1keep, [V, #64]
+    prfm            pldl1keep, [Y, #64]
+    sri             v25.8h, v29.8h, #11
+  .endif
+    do_store        \bpp, 8, \fast_st3      /* write the finished 8 pixels */
+    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
+    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
+.endm
 
 .macro do_yuv_to_rgb
@@ -1702,13 +1673,21 @@
  */
 
 .balign 16
-jsimd_ycc_\colorid\()_neon_consts:
-    .short          0,      0,     0,      0
-    .short          22971, -11277, -23401, 29033
-    .short          -128,  -128,   -128,   -128
-    .short          -128,  -128,   -128,   -128
+.if \fast_st3 == 1
+Ljsimd_ycc_\colorid\()_neon_consts:
+.else
+Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
+.endif
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
 
+.if \fast_st3 == 1
 asm_function jsimd_ycc_\colorid\()_convert_neon
+.else
+asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+.endif
     OUTPUT_WIDTH    .req x0
     INPUT_BUF       .req x1
     INPUT_ROW       .req x2
@@ -1717,7 +1696,7 @@
 
     INPUT_BUF0      .req x5
     INPUT_BUF1      .req x6
-    INPUT_BUF2      .req INPUT_BUF
+    INPUT_BUF2      .req x1
 
     RGB             .req x7
     Y               .req x8
@@ -1727,17 +1706,23 @@
 
     sub             sp, sp, 336
     str             x15, [sp], 16
+
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
-    adr             x15, jsimd_ycc_\colorid\()_neon_consts
+    .if \fast_st3 == 1
+      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
+    .else
+      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
+    .endif
+
     /* Save NEON registers */
-    st1             {v0.8b - v3.8b}, [sp], 32
-    st1             {v4.8b - v7.8b}, [sp], 32
-    st1             {v8.8b - v11.8b}, [sp], 32
-    st1             {v12.8b - v15.8b}, [sp], 32
-    st1             {v16.8b - v19.8b}, [sp], 32
-    st1             {v20.8b - v23.8b}, [sp], 32
-    st1             {v24.8b - v27.8b}, [sp], 32
-    st1             {v28.8b - v31.8b}, [sp], 32
+    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     ld1             {v0.4h, v1.4h}, [x15], 16
     ld1             {v2.8h}, [x15]
 
@@ -1748,8 +1733,8 @@
     stp             x8, x9, [sp], 16
     stp             x10, x30, [sp], 16
     ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, 8]
-    ldr             INPUT_BUF2, [INPUT_BUF, 16]
+    ldr             INPUT_BUF1, [INPUT_BUF, #8]
+    ldr             INPUT_BUF2, [INPUT_BUF, #16]
     .unreq          INPUT_BUF
 
     /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
@@ -1758,7 +1743,7 @@
 
     /* Outer loop over scanlines */
     cmp             NUM_ROWS, #1
-    blt             9f
+    b.lt            9f
 0:
     lsl             x16, INPUT_ROW, #3
     ldr             Y, [INPUT_BUF0, x16]
@@ -1770,60 +1755,60 @@
 
     /* Inner loop over pixels */
     subs            N, N, #8
-    blt             3f
+    b.lt            3f
     do_load         8
     do_yuv_to_rgb_stage1
     subs            N, N, #8
-    blt             2f
+    b.lt            2f
 1:
-    do_yuv_to_rgb_stage2_store_load_stage1
+    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
     subs            N, N, #8
-    bge             1b
+    b.ge            1b
 2:
     do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
+    do_store        \bpp, 8, \fast_st3
     tst             N, #7
-    beq             8f
+    b.eq            8f
 3:
     tst             N, #4
-    beq             3f
+    b.eq            3f
     do_load         4
 3:
     tst             N, #2
-    beq             4f
+    b.eq            4f
     do_load         2
 4:
     tst             N, #1
-    beq             5f
+    b.eq            5f
     do_load         1
 5:
     do_yuv_to_rgb
     tst             N, #4
-    beq             6f
-    do_store        \bpp, 4
+    b.eq            6f
+    do_store        \bpp, 4, \fast_st3
 6:
     tst             N, #2
-    beq             7f
-    do_store        \bpp, 2
+    b.eq            7f
+    do_store        \bpp, 2, \fast_st3
 7:
     tst             N, #1
-    beq             8f
-    do_store        \bpp, 1
+    b.eq            8f
+    do_store        \bpp, 1, \fast_st3
 8:
     subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
+    b.gt            0b
 9:
     /* Restore all registers and return */
     sub             sp, sp, #336
     ldr             x15, [sp], 16
-    ld1             {v0.8b - v3.8b}, [sp], 32
-    ld1             {v4.8b - v7.8b}, [sp], 32
-    ld1             {v8.8b - v11.8b}, [sp], 32
-    ld1             {v12.8b - v15.8b}, [sp], 32
-    ld1             {v16.8b - v19.8b}, [sp], 32
-    ld1             {v20.8b - v23.8b}, [sp], 32
-    ld1             {v24.8b - v27.8b}, [sp], 32
-    ld1             {v28.8b - v31.8b}, [sp], 32
+    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
     /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
     ldp             x4, x5, [sp], 16
     ldp             x6, x7, [sp], 16
@@ -1847,15 +1832,1622 @@
 .purgem do_yuv_to_rgb_stage1
 .purgem do_yuv_to_rgb_stage2
 .purgem do_yuv_to_rgb_stage2_store_load_stage1
+
 .endm
 
-/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
-generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b
+/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
+
+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
+
 .purgem do_load
 .purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+  .if \size == 8                     /* 8 pixels: full 64-bit stores */
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+  .elseif \size == 4                 /* 4 pixels from byte lanes 0-3 */
+    st1             {v20.b}[0], [Y], #1
+    st1             {v20.b}[1], [Y], #1
+    st1             {v20.b}[2], [Y], #1
+    st1             {v20.b}[3], [Y], #1
+    st1             {v21.b}[0], [U], #1
+    st1             {v21.b}[1], [U], #1
+    st1             {v21.b}[2], [U], #1
+    st1             {v21.b}[3], [U], #1
+    st1             {v22.b}[0], [V], #1
+    st1             {v22.b}[1], [V], #1
+    st1             {v22.b}[2], [V], #1
+    st1             {v22.b}[3], [V], #1
+  .elseif \size == 2                 /* 2 pixels from byte lanes 4-5 */
+    st1             {v20.b}[4], [Y], #1
+    st1             {v20.b}[5], [Y], #1
+    st1             {v21.b}[4], [U], #1
+    st1             {v21.b}[5], [U], #1
+    st1             {v22.b}[4], [V], #1
+    st1             {v22.b}[5], [V], #1
+  .elseif \size == 1                 /* 1 pixel from byte lane 6 */
+    st1             {v20.b}[6], [Y], #1
+    st1             {v21.b}[6], [U], #1
+    st1             {v22.b}[6], [V], #1
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm  /* do_store: v20 -> Y, v21 -> U, v22 -> V, pointers post-incremented */
+
+.macro do_load bpp, size, fast_ld3
+  .if \bpp == 24                     /* 3 bytes/pixel into v10, v11, v12 */
+    .if \size == 8
+      .if \fast_ld3 == 1             /* one de-interleaving 24-byte ld3 */
+        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
+      .else                          /* same result via 24 one-byte ld1 */
+        ld1         {v10.b}[0], [RGB], #1
+        ld1         {v11.b}[0], [RGB], #1
+        ld1         {v12.b}[0], [RGB], #1
+
+        ld1         {v10.b}[1], [RGB], #1
+        ld1         {v11.b}[1], [RGB], #1
+        ld1         {v12.b}[1], [RGB], #1
+
+        ld1         {v10.b}[2], [RGB], #1
+        ld1         {v11.b}[2], [RGB], #1
+        ld1         {v12.b}[2], [RGB], #1
+
+        ld1         {v10.b}[3], [RGB], #1
+        ld1         {v11.b}[3], [RGB], #1
+        ld1         {v12.b}[3], [RGB], #1
+
+        ld1         {v10.b}[4], [RGB], #1
+        ld1         {v11.b}[4], [RGB], #1
+        ld1         {v12.b}[4], [RGB], #1
+
+        ld1         {v10.b}[5], [RGB], #1
+        ld1         {v11.b}[5], [RGB], #1
+        ld1         {v12.b}[5], [RGB], #1
+
+        ld1         {v10.b}[6], [RGB], #1
+        ld1         {v11.b}[6], [RGB], #1
+        ld1         {v12.b}[6], [RGB], #1
+
+        ld1         {v10.b}[7], [RGB], #1
+        ld1         {v11.b}[7], [RGB], #1
+        ld1         {v12.b}[7], [RGB], #1
+      .endif
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4               /* 4 pixels -> byte lanes 0-3 */
+      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
+    .elseif \size == 2               /* 2 pixels -> byte lanes 4-5 */
+      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
+      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
+    .elseif \size == 1               /* 1 pixel  -> byte lane 6 */
+      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32                 /* 4 bytes/pixel into v10-v13 */
+    .if \size == 8
+      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+      prfm          pldl1keep, [RGB, #128]
+    .elseif \size == 4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
+    .elseif \size == 2
+      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
+      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
+    .elseif \size == 1
+      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
+                                           b_offs, fast_ld3
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
+    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
+    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
+    rev64           v18.4s, v1.4s           /* u, v acc = (128 << 16) + 32767 */
+    rev64           v26.4s, v1.4s
+    rev64           v28.4s, v1.4s
+    rev64           v30.4s, v1.4s
+    umull           v14.4s, v4.4h, v0.h[0]  /* y  = r * 19595 */
+    umull2          v16.4s, v4.8h, v0.h[0]
+    umlsl           v18.4s, v4.4h, v0.h[3]  /* u -= r * 11059 */
+    umlsl2          v26.4s, v4.8h, v0.h[3]
+    umlal           v28.4s, v4.4h, v0.h[5]  /* v += r * 32768 */
+    umlal2          v30.4s, v4.8h, v0.h[5]
+    umlal           v14.4s, v6.4h, v0.h[1]  /* y += g * 38470 */
+    umlal2          v16.4s, v6.8h, v0.h[1]
+    umlsl           v18.4s, v6.4h, v0.h[4]  /* u -= g * 21709 */
+    umlsl2          v26.4s, v6.8h, v0.h[4]
+    umlsl           v28.4s, v6.4h, v0.h[6]  /* v -= g * 27439 */
+    umlsl2          v30.4s, v6.8h, v0.h[6]
+    umlal           v14.4s, v8.4h, v0.h[2]  /* y += b * 7471 */
+    umlal2          v16.4s, v8.8h, v0.h[2]
+    umlal           v18.4s, v8.4h, v0.h[5]  /* u += b * 32768 */
+    umlal2          v26.4s, v8.8h, v0.h[5]
+    umlsl           v28.4s, v8.4h, v0.h[7]  /* v -= b * 5329 */
+    umlsl2          v30.4s, v8.8h, v0.h[7]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    rshrn           v20.4h, v14.4s, #16     /* y: rounding >> 16 */
+    shrn            v22.4h, v18.4s, #16     /* u: truncating >> 16 */
+    shrn            v24.4h, v28.4s, #16     /* v: truncating >> 16 */
+    rshrn2          v20.8h, v16.4s, #16
+    shrn2           v22.8h, v26.4s, #16
+    shrn2           v24.8h, v30.4s, #16
+    xtn             v20.8b, v20.8h       /* v20 = y */
+    xtn             v21.8b, v22.8h       /* v21 = u */
+    xtn             v22.8b, v24.8h       /* v22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+/* TODO: expand macros and interleave instructions if some in-order
+ *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
+.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
+    do_rgb_to_yuv_stage2
+    do_load         \bpp, 8, \fast_ld3
+    st1             {v20.8b}, [Y], #8
+    st1             {v21.8b}, [U], #8
+    st1             {v22.8b}, [V], #8
+    do_rgb_to_yuv_stage1
+.endm
+
+.balign 16
+.if \fast_ld3 == 1
+Ljsimd_\colorid\()_ycc_neon_consts:
+.else
+Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
+.endif
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+.if \fast_ld3 == 1
+asm_function jsimd_\colorid\()_ycc_convert_neon
+.else
+asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+.endif
+    OUTPUT_WIDTH    .req w0  /* pixels per row (32-bit argument) */
+    INPUT_BUF       .req x1  /* array of packed-pixel row pointers */
+    OUTPUT_BUF      .req x2  /* array of 3 per-plane row-pointer arrays */
+    OUTPUT_ROW      .req w3  /* 32-bit argument: upper half of x3 unspecified */
+    NUM_ROWS        .req w4  /* 32-bit argument: upper half of x4 unspecified */
+
+    OUTPUT_BUF0     .req x5
+    OUTPUT_BUF1     .req x6
+    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w12
+
+    /* Load constants to v0 (coefficients) and v1 (U/V bias + rounding) */
+    .if \fast_ld3 == 1
+      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
+    .else
+      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
+    .endif
+    ld1             {v0.8h, v1.8h}, [x13]
+
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
+    .unreq          OUTPUT_BUF
+
+    /* Save v8-v15 (low 64 bits); NOTE(review): saved data sits below sp */
+    sub             sp, sp, #64
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         \bpp, 8, \fast_ld3
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    b.eq            8f
+3:
+    tbz             N, #2, 3f               /* tail: low 3 bits of N = 0..7 */
+    do_load         \bpp, 4, \fast_ld3
+3:
+    tbz             N, #1, 4f
+    do_load         \bpp, 2, \fast_ld3
+4:
+    tbz             N, #0, 5f
+    do_load         \bpp, 1, \fast_ld3
+5:
+    do_rgb_to_yuv
+    tbz             N, #2, 6f
+    do_store        4
+6:
+    tbz             N, #1, 7f
+    do_store        2
+7:
+    tbz             N, #0, 8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    sub             sp, sp, #64
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    br              x30
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1  /* columns: colour id, bits per pixel, R/G/B positions within a pixel, fast_ld3 flag */
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
+
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0  /* 24-bit layouts again with fast_ld3 = 0; presumably the variants that use the *_slowld3_consts tables - confirm against the macro's .if */
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
+
+.purgem do_load  /* every converter has been generated: remove the helper macros so the names can be reused */
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * Load data into workspace, applying unsigned->signed conversion
+ * (eight rows of eight unsigned bytes become 64 16-bit values, each sample - 128)
+ * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
+ *       rid of VST1.16 instructions
+ */
+
+asm_function jsimd_convsamp_neon
+    SAMPLE_DATA     .req x0  /* array of row pointers; eight consecutive entries are read */
+    START_COL       .req x1  /* byte offset added to every row pointer; NOTE(review): consumed as a full 64-bit value - confirm the C argument type guarantees the upper half of x1 */
+    WORKSPACE       .req x2  /* destination: 128 bytes (64 halfwords) are written */
+    TMP1            .req x9  /* TMP1..TMP8 hold the eight row addresses */
+    TMP2            .req x10
+    TMP3            .req x11
+    TMP4            .req x12
+    TMP5            .req x13
+    TMP6            .req x14
+    TMP7            .req x15
+    TMP8            .req x4
+    TMPDUP          .req w3  /* scalar source for the broadcast bias */
+
+    mov             TMPDUP, #128  /* bias subtracted from every sample */
+    ldp             TMP1, TMP2, [SAMPLE_DATA], 16  /* row pointers 0 and 1 */
+    ldp             TMP3, TMP4, [SAMPLE_DATA], 16  /* row pointers 2 and 3 */
+    dup             v0.8b, TMPDUP  /* v0 = 128 in all eight byte lanes */
+    add             TMP1, TMP1, START_COL  /* advance each row pointer to the requested column */
+    add             TMP2, TMP2, START_COL
+    ldp             TMP5, TMP6, [SAMPLE_DATA], 16  /* row pointers 4 and 5 */
+    add             TMP3, TMP3, START_COL
+    add             TMP4, TMP4, START_COL
+    ldp             TMP7, TMP8, [SAMPLE_DATA], 16  /* row pointers 6 and 7 */
+    add             TMP5, TMP5, START_COL
+    add             TMP6, TMP6, START_COL
+    ld1             {v16.8b}, [TMP1]  /* loads are interleaved with the widening subtracts below */
+    add             TMP7, TMP7, START_COL
+    add             TMP8, TMP8, START_COL
+    ld1             {v17.8b}, [TMP2]
+    usubl           v16.8h, v16.8b, v0.8b  /* row 0: widen to 16 bits while subtracting 128 */
+    ld1             {v18.8b}, [TMP3]
+    usubl           v17.8h, v17.8b, v0.8b
+    ld1             {v19.8b}, [TMP4]
+    usubl           v18.8h, v18.8b, v0.8b
+    ld1             {v20.8b}, [TMP5]
+    usubl           v19.8h, v19.8b, v0.8b
+    ld1             {v21.8b}, [TMP6]
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64  /* rows 0-3 -> workspace[0..31] */
+    usubl           v20.8h, v20.8b, v0.8b
+    ld1             {v22.8b}, [TMP7]
+    usubl           v21.8h, v21.8b, v0.8b
+    ld1             {v23.8b}, [TMP8]
+    usubl           v22.8h, v22.8b, v0.8b
+    usubl           v23.8h, v23.8b, v0.8b
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64  /* rows 4-7 -> workspace[32..63] */
+
+    br              x30  /* return to the caller (x30 is the link register) */
+
+    .unreq          SAMPLE_DATA
+    .unreq          START_COL
+    .unreq          WORKSPACE
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
+    .unreq          TMPDUP
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This function contains a slow-but-accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). The following code is based
+ * directly on the IJG's original jfdctint.c; see the jfdctint.c for
+ * more details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of VLD1.16 instructions
+ */
+
+#define CONST_BITS 13  /* fractional bits of the fixed-point constants below */
+#define PASS1_BITS 2  /* extra precision carried between pass 1 and pass 2 */
+
+#define DESCALE_P1 (CONST_BITS-PASS1_BITS)  /* narrowing shift used in pass 1 */
+#define DESCALE_P2 (CONST_BITS+PASS1_BITS)  /* narrowing shift used in pass 2 */
+
+#define F_0_298  2446  /* FIX(0.298631336) */
+#define F_0_390  3196  /* FIX(0.390180644) */
+#define F_0_541  4433  /* FIX(0.541196100) */
+#define F_0_765  6270  /* FIX(0.765366865) */
+#define F_0_899  7373  /* FIX(0.899976223) */
+#define F_1_175  9633  /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:  /* 16 halfwords, loaded into v0 (first 8) and v1 (last 8) */
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+#define XFIX_P_0_298 v0.h[0]  /* lane names follow the table order above; _P_ = positive, _N_ = negated constant */
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
+
+asm_function jsimd_fdct_islow_neon
+
+    DATA            .req x0  /* 64 halfwords, transformed in place */
+    TMP             .req x9  /* scratch: constant-table address, then save-area cursor */
+
+    /* Load constants */
+    adr             TMP, Ljsimd_fdct_islow_neon_consts
+    ld1             {v0.8h, v1.8h}, [TMP]
+
+    /* Save NEON registers (low halves of v8-v15) in a 64-byte frame that stays at or above sp */
+    sub             sp, sp, #64
+    mov             TMP, sp  /* store through a scratch pointer so sp keeps covering the saved data */
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [TMP], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [TMP], 32
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17    | v16.8h
+     *   1 | d18     | d19    | v17.8h
+     *   2 | d20     | d21    | v18.8h
+     *   3 | d22     | d23    | v19.8h
+     *   4 | d24     | d25    | v20.8h
+     *   5 | d26     | d27    | v21.8h
+     *   6 | d28     | d29    | v22.8h
+     *   7 | d30     | d31    | v23.8h
+     */
+
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    sub             DATA, DATA, #64  /* rewind DATA to the start of the block for the final store */
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+    /* 1-D FDCT */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b  /* second copy of z1 (lo) for the dataptr[6] accumulation */
+    mov             v25.16b, v24.16b  /* second copy of z1 (hi) */
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P1
+    rshrn           v22.4h, v22.4s, #DESCALE_P1
+    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+
+    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175  /* z5 hi = z3 hi * XFIX_P_1_175 */
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298  /* high halves of the tmp4..tmp7 products go to v24..v27 */
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899  /* high halves of the z1..z4 products go to v12..v15 */
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P1
+    rshrn           v21.4h, v29.4s, #DESCALE_P1
+    rshrn           v19.4h, v30.4s, #DESCALE_P1
+    rshrn           v17.4h, v31.4s, #DESCALE_P1
+    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+
+    /* 1-D FDCT */
+    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov             v22.16b, v18.16b  /* second copy of z1 (lo) for the dataptr[6] accumulation */
+    mov             v25.16b, v24.16b  /* second copy of z1 (hi) */
+
+    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn           v18.4h, v18.4s, #DESCALE_P2
+    rshrn           v22.4h, v22.4s, #DESCALE_P2
+    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
+
+    /* Odd part */
+    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
+    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
+    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
+    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
+
+    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2          v5.4s, v10.8h, XFIX_P_1_175  /* z5 hi = z3 hi * XFIX_P_1_175 */
+    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2          v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2          v24.4s, v28.8h, XFIX_P_0_298  /* high halves of the tmp4..tmp7 products go to v24..v27 */
+    smull2          v25.4s, v29.8h, XFIX_P_2_053
+    smull2          v26.4s, v30.8h, XFIX_P_3_072
+    smull2          v27.4s, v31.8h, XFIX_P_1_501
+    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2          v12.4s, v8.8h, XFIX_N_0_899  /* high halves of the z1..z4 products go to v12..v15 */
+    smull2          v13.4s, v9.8h, XFIX_N_2_562
+    smull2          v14.4s, v10.8h, XFIX_N_1_961
+    smull2          v15.4s, v11.8h, XFIX_N_0_390
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+
+    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add             v14.4s, v14.4s, v5.4s
+    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add             v15.4s, v15.4s, v5.4s
+
+    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
+    add             v24.4s, v24.4s, v12.4s
+    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
+    add             v25.4s, v25.4s, v13.4s
+    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add             v26.4s, v26.4s, v14.4s
+    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add             v27.4s, v27.4s, v15.4s
+
+    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add             v24.4s, v24.4s, v14.4s
+    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add             v25.4s, v25.4s, v15.4s
+    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
+    add             v26.4s, v26.4s, v13.4s
+    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
+    add             v27.4s, v27.4s, v12.4s
+
+    rshrn           v23.4h, v28.4s, #DESCALE_P2
+    rshrn           v21.4h, v29.4s, #DESCALE_P2
+    rshrn           v19.4h, v30.4s, #DESCALE_P2
+    rshrn           v17.4h, v31.4s, #DESCALE_P2
+    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
+
+    /* store results */
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    /* Restore NEON registers; the two post-increments release the 64-byte frame */
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    br              x30  /* return to the caller */
+
+    .unreq          DATA
+    .unreq          TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the forward DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
+ * function from jfdctfst.c
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of VLD1.16 instructions
+ */
+
+#undef XFIX_0_541196100  /* drop any earlier definition before the name is redefined below */
+#define XFIX_0_382683433 v0.h[0]  /* constants are addressed as lanes of v0, in table order */
+#define XFIX_0_541196100 v0.h[1]
+#define XFIX_0_707106781 v0.h[2]
+#define XFIX_1_306562965 v0.h[3]
+
+.balign 16
+Ljsimd_fdct_ifast_neon_consts:  /* four halfwords; used with sqdmulh, so c * 128 yields (x * c) >> 8 */
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
+
+asm_function jsimd_fdct_ifast_neon
+
+    DATA            .req x0  /* 64 halfwords, transformed in place */
+    TMP             .req x9  /* constant-table address, then pass counter */
+
+    /* Load constants */
+    adr             TMP, Ljsimd_fdct_ifast_neon_consts
+    ld1             {v0.4h}, [TMP]
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17    | v16.8h
+     *   1 | d18     | d19    | v17.8h
+     *   2 | d20     | d21    | v18.8h
+     *   3 | d22     | d23    | v19.8h
+     *   4 | d24     | d25    | v20.8h
+     *   5 | d26     | d27    | v21.8h
+     *   6 | d28     | d29    | v22.8h
+     *   7 | d30     | d31    | v23.8h
+     */
+
+    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    mov             TMP, #2  /* two passes: transpose + 1-D FDCT each time */
+    sub             DATA, DATA, #64  /* rewind DATA to the start of the block for the final store */
+1:
+    /* Transpose */
+    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
+    subs            TMP, TMP, #1  /* flags set here are consumed by the b.ne at the end of the pass */
+    /* 1-D FDCT */
+    add             v4.8h, v19.8h, v20.8h  /* tmp3 = d3 + d4 */
+    sub             v20.8h, v19.8h, v20.8h  /* tmp4 = d3 - d4 */
+    sub             v28.8h, v18.8h, v21.8h  /* tmp5 = d2 - d5 */
+    add             v18.8h, v18.8h, v21.8h  /* tmp2 = d2 + d5 */
+    sub             v29.8h, v17.8h, v22.8h  /* tmp6 = d1 - d6 */
+    add             v17.8h, v17.8h, v22.8h  /* tmp1 = d1 + d6 */
+    sub             v21.8h, v16.8h, v23.8h  /* tmp7 = d0 - d7 */
+    add             v16.8h, v16.8h, v23.8h  /* tmp0 = d0 + d7 */
+    sub             v6.8h, v17.8h, v18.8h  /* tmp12 = tmp1 - tmp2 */
+    sub             v7.8h, v16.8h, v4.8h  /* tmp13 = tmp0 - tmp3 */
+    add             v5.8h, v17.8h, v18.8h  /* tmp11 = tmp1 + tmp2 */
+    add             v6.8h, v6.8h, v7.8h  /* tmp12 + tmp13 */
+    add             v4.8h, v16.8h, v4.8h  /* tmp10 = tmp0 + tmp3 */
+    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781  /* z1 = (tmp12 + tmp13) * 0.707 */
+    add             v19.8h, v20.8h, v28.8h  /* odd part: tmp10 = tmp4 + tmp5 */
+    add             v16.8h, v4.8h, v5.8h  /* out[0] = tmp10 + tmp11 */
+    sub             v20.8h, v4.8h, v5.8h  /* out[4] = tmp10 - tmp11 */
+    add             v5.8h, v28.8h, v29.8h  /* odd part: tmp11 = tmp5 + tmp6 */
+    add             v29.8h, v29.8h, v21.8h  /* odd part: tmp12 = tmp6 + tmp7 */
+    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781  /* z3 = tmp11 * 0.707 */
+    sub             v28.8h, v19.8h, v29.8h  /* tmp10 - tmp12 */
+    add             v18.8h, v7.8h, v6.8h  /* out[2] = tmp13 + z1 */
+    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433  /* z5 = (tmp10 - tmp12) * 0.382 */
+    sub             v22.8h, v7.8h, v6.8h  /* out[6] = tmp13 - z1 */
+    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100  /* tmp10 * 0.541 */
+    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965  /* tmp12 * 0.306 (the table stores the constant minus 1.0) */
+    add             v6.8h, v21.8h, v5.8h  /* z11 = tmp7 + z3 */
+    sub             v5.8h, v21.8h, v5.8h  /* z13 = tmp7 - z3 */
+    add             v29.8h, v29.8h, v28.8h  /* tmp12 + z5: supplies the 1.0 * tmp12 part of z4 */
+    add             v19.8h, v19.8h, v28.8h  /* z2 = tmp10 * 0.541 + z5 */
+    add             v29.8h, v29.8h, v7.8h  /* z4 = tmp12 * 1.306 + z5 */
+    add             v21.8h, v5.8h, v19.8h  /* out[5] = z13 + z2 */
+    sub             v19.8h, v5.8h, v19.8h  /* out[3] = z13 - z2 */
+    add             v17.8h, v6.8h, v29.8h  /* out[1] = z11 + z4 */
+    sub             v23.8h, v6.8h, v29.8h  /* out[7] = z11 - z4 */
+
+    b.ne            1b  /* run the second pass while the counter is non-zero */
+
+    /* store results */
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    br              x30  /* return to the caller */
+
+    .unreq          DATA
+    .unreq          TMP
+#undef XFIX_0_382683433
+#undef XFIX_0_541196100
+#undef XFIX_0_707106781
+#undef XFIX_1_306562965
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
+ *                      DCTELEM *workspace);
+ * Per coefficient: out = sign(x) * ((((|x| + correction) * reciprocal) >> 16) >> shift), for 64 coefficients.
+ */
+asm_function jsimd_quantize_neon
+
+    COEF_BLOCK      .req x0  /* output: 64 halfwords */
+    DIVISORS        .req x1  /* base of the divisor tables */
+    WORKSPACE       .req x2  /* input: 64 halfwords */
+
+    RECIPROCAL      .req DIVISORS  /* reciprocal table starts at offset 0 of divisors */
+    CORRECTION      .req x9
+    SHIFT           .req x10
+    LOOP_COUNT      .req x11
+
+    mov             LOOP_COUNT, #2  /* two iterations of 32 coefficients (4 registers x 8 lanes) */
+    add             CORRECTION, DIVISORS, #(64 * 2)  /* correction table: byte offset 128 */
+    add             SHIFT, DIVISORS, #(64 * 6)  /* shift table: byte offset 384 */
+1:
+    subs            LOOP_COUNT, LOOP_COUNT, #1  /* flags are consumed by the b.ne at the bottom of the loop */
+    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
+    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
+    abs             v20.8h, v0.8h  /* work on magnitudes; the sign is re-applied at the end */
+    abs             v21.8h, v1.8h
+    abs             v22.8h, v2.8h
+    abs             v23.8h, v3.8h
+    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
+    add             v20.8h, v20.8h, v4.8h  /* add correction */
+    add             v21.8h, v21.8h, v5.8h
+    add             v22.8h, v22.8h, v6.8h
+    add             v23.8h, v23.8h, v7.8h
+    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
+    umull2          v16.4s, v20.8h, v28.8h
+    umull           v5.4s, v21.4h, v29.4h
+    umull2          v17.4s, v21.8h, v29.8h
+    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
+    umull2          v18.4s, v22.8h, v30.8h
+    umull           v7.4s, v23.4h, v31.4h
+    umull2          v19.4s, v23.8h, v31.8h
+    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
+    shrn            v4.4h, v4.4s, #16  /* keep the high 16 bits of each 32-bit product */
+    shrn            v5.4h, v5.4s, #16
+    shrn            v6.4h, v6.4s, #16
+    shrn            v7.4h, v7.4s, #16
+    shrn2           v4.8h, v16.4s, #16
+    shrn2           v5.8h, v17.4s, #16
+    shrn2           v6.8h, v18.4s, #16
+    shrn2           v7.8h, v19.4s, #16
+    neg             v24.8h, v24.8h  /* negate the counts: ushl shifts right for negative amounts */
+    neg             v25.8h, v25.8h
+    neg             v26.8h, v26.8h
+    neg             v27.8h, v27.8h
+    sshr            v0.8h, v0.8h, #15  /* extract sign */
+    sshr            v1.8h, v1.8h, #15
+    sshr            v2.8h, v2.8h, #15
+    sshr            v3.8h, v3.8h, #15
+    ushl            v4.8h, v4.8h, v24.8h  /* shift */
+    ushl            v5.8h, v5.8h, v25.8h
+    ushl            v6.8h, v6.8h, v26.8h
+    ushl            v7.8h, v7.8h, v27.8h
+
+    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
+    eor             v5.16b, v5.16b, v1.16b
+    eor             v6.16b, v6.16b, v2.16b
+    eor             v7.16b, v7.16b, v3.16b
+    sub             v4.8h, v4.8h, v0.8h  /* (q ^ s) - s negates q when the sign mask s is all ones */
+    sub             v5.8h, v5.8h, v1.8h
+    sub             v6.8h, v6.8h, v2.8h
+    sub             v7.8h, v7.8h, v3.8h
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
+
+    b.ne            1b
+
+    br              x30  /* return */
+
+    .unreq          COEF_BLOCK
+    .unreq          DIVISORS
+    .unreq          WORKSPACE
+    .unreq          RECIPROCAL
+    .unreq          CORRECTION
+    .unreq          SHIFT
+    .unreq          LOOP_COUNT
+
+
+/*****************************************************************************/
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ * Each output byte is (a + b + bias) >> 1 for a pair of adjacent input bytes; bias alternates 0, 1 along the row.
+ * GLOBAL(void)
+ * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
+ *                             JDIMENSION v_samp_factor,
+ *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
+ *                             JSAMPARRAY output_data);
+ */
+
+.balign 16
+Ljsimd_h2_downsample_neon_consts:  /* TBL index vectors: row "diff N" maps lane i to min(i, 15 - N) */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
+        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
+        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
+        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
+  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
+  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
+
+asm_function jsimd_h2v1_downsample_neon
+    IMAGE_WIDTH     .req x0  /* NOTE(review): x0, x2 and x3 are used as 64-bit values; if JDIMENSION is a 32-bit type their upper halves are not guaranteed by the ABI - confirm */
+    MAX_V_SAMP      .req x1  /* not read by this function */
+    V_SAMP          .req x2  /* row counter, decremented to zero; assumes it is >= 1 on entry */
+    BLOCK_WIDTH     .req x3  /* 16-byte input groups per row; the last one gets the edge expansion */
+    INPUT_DATA      .req x4  /* array of input row pointers */
+    OUTPUT_DATA     .req x5  /* array of output row pointers */
+    OUTPTR          .req x9
+    INPTR           .req x10
+    TMP1            .req x11
+    TMP2            .req x12
+    TMP3            .req x13
+    TMPDUP          .req w15
+
+    mov             TMPDUP, #0x10000  /* 32-bit lane 0x00010000 = 16-bit lanes {0, 1}: the alternating bias */
+    lsl             TMP2, BLOCK_WIDTH, #4  /* BLOCK_WIDTH * 16 */
+    sub             TMP2, TMP2, IMAGE_WIDTH  /* diff = BLOCK_WIDTH * 16 - IMAGE_WIDTH */
+    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    add             TMP3, TMP3, TMP2, lsl #4  /* select the 16-byte "diff N" row; assumes 0 <= diff <= 15 (not checked here) */
+    dup             v16.4s, TMPDUP  /* v16 = bias pattern, copied into the accumulator for every group */
+    ld1             {v18.16b}, [TMP3]  /* v18 = TBL indices used for the last group of each row */
+
+1:  /* row loop */
+    ldr             INPTR, [INPUT_DATA], #8
+    ldr             OUTPTR, [OUTPUT_DATA], #8
+    subs            TMP1, BLOCK_WIDTH, #1  /* groups handled without expansion; none if BLOCK_WIDTH == 1 */
+    b.eq            3f
+2:  /* columns */
+    ld1             {v0.16b}, [INPTR], #16
+    mov             v4.16b, v16.16b  /* start from the bias */
+    subs            TMP1, TMP1, #1
+    uadalp          v4.8h, v0.16b  /* add each adjacent byte pair into a 16-bit lane */
+    shrn            v6.8b, v4.8h, #1  /* divide by two and narrow to bytes */
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            2b
+3:  /* last columns */
+    ld1             {v0.16b}, [INPTR]
+    mov             v4.16b, v16.16b
+    subs            V_SAMP, V_SAMP, #1  /* flags are consumed by the b.ne below */
+    /* expand right */
+    tbl             v2.16b, {v0.16b}, v18.16b  /* lanes past index 15 - diff take the byte at that index */
+    uadalp          v4.8h, v2.16b
+    shrn            v6.8b, v4.8h, #1
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            1b
+
+    br              x30  /* return to the caller */
+
+    .unreq          IMAGE_WIDTH
+    .unreq          MAX_V_SAMP
+    .unreq          V_SAMP
+    .unreq          BLOCK_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMPDUP
+
+
+/*****************************************************************************/
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ *
+ * GLOBAL(void)
+ * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
+ *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+ */
+
+.balign 16
+asm_function jsimd_h2v2_downsample_neon
+    IMAGE_WIDTH     .req x0  /* image_width argument */
+    MAX_V_SAMP      .req x1  /* max_v_samp_factor argument; not referenced below */
+    V_SAMP          .req x2  /* v_samp_factor argument; counts output rows down to 0 */
+    BLOCK_WIDTH     .req x3  /* width_blocks argument */
+    INPUT_DATA      .req x4  /* walks the array of input row pointers */
+    OUTPUT_DATA     .req x5  /* walks the array of output row pointers */
+    OUTPTR          .req x9  /* current output row */
+    INPTR0          .req x10  /* first input row of the current pair */
+    INPTR1          .req x14  /* second input row of the current pair */
+    TMP1            .req x11  /* column loop counter */
+    TMP2            .req x12  /* BLOCK_WIDTH * 16 - IMAGE_WIDTH */
+    TMP3            .req x13  /* address of the tbl index vector */
+    TMPDUP          .req w15  /* bias pattern replicated into v16 */
+
+    mov             TMPDUP, #1
+    lsl             TMP2, BLOCK_WIDTH, #4  /* TMP2 = BLOCK_WIDTH * 16 */
+    lsl             TMPDUP, TMPDUP, #17
+    sub             TMP2, TMP2, IMAGE_WIDTH  /* TMP2 = BLOCK_WIDTH * 16 - IMAGE_WIDTH */
+    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    orr             TMPDUP, TMPDUP, #1  /* TMPDUP = 0x00020001 */
+    add             TMP3, TMP3, TMP2, lsl #4  /* pick the 16-byte table entry number TMP2 */
+    dup             v16.4s, TMPDUP  /* v16 halfwords = 1, 2, 1, 2, ...: bias added before >> 2 */
+    ld1             {v18.16b}, [TMP3]  /* v18 = tbl indices used for the last columns */
+
+1:  /* row loop */
+    ldr             INPTR0, [INPUT_DATA], #8  /* fetch row pointer, step to the next one */
+    ldr             OUTPTR, [OUTPUT_DATA], #8
+    ldr             INPTR1, [INPUT_DATA], #8  /* two input rows feed one output row */
+    subs            TMP1, BLOCK_WIDTH, #1  /* all groups but the last go through loop 2 */
+    b.eq            3f  /* BLOCK_WIDTH == 1: only the last-columns step runs */
+2:  /* columns */
+    ld1             {v0.16b}, [INPTR0], #16  /* 16 samples from the first row */
+    ld1             {v1.16b}, [INPTR1], #16  /* 16 samples from the second row */
+    mov             v4.16b, v16.16b  /* accumulator starts at the bias */
+    subs            TMP1, TMP1, #1  /* sets the flags tested by b.ne below */
+    uadalp          v4.8h, v0.16b  /* add adjacent byte pairs into 8 halfwords */
+    uadalp          v4.8h, v1.16b
+    shrn            v6.8b, v4.8h, #2  /* (4 samples + bias) >> 2, narrowed to bytes */
+    st1             {v6.8b}, [OUTPTR], #8  /* 8 output samples */
+    b.ne            2b
+3:  /* last columns */
+    ld1             {v0.16b}, [INPTR0], #16
+    ld1             {v1.16b}, [INPTR1], #16
+    mov             v4.16b, v16.16b
+    subs            V_SAMP, V_SAMP, #1  /* one output row done; flags tested by b.ne below */
+    /* expand right */
+    tbl             v2.16b, {v0.16b}, v18.16b  /* permute the loaded bytes with the v18 indices */
+    tbl             v3.16b, {v1.16b}, v18.16b
+    uadalp          v4.8h, v2.16b
+    uadalp          v4.8h, v3.16b
+    shrn            v6.8b, v4.8h, #2
+    st1             {v6.8b}, [OUTPTR], #8
+    b.ne            1b
+
+    br              x30  /* return to the caller */
+
+    .unreq          IMAGE_WIDTH
+    .unreq          MAX_V_SAMP
+    .unreq          V_SAMP
+    .unreq          BLOCK_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA
+    .unreq          OUTPTR
+    .unreq          INPTR0
+    .unreq          INPTR1
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMPDUP
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
+ *                              JCOEFPTR block, int last_dc_val,
+ *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+    BUFFER          .req x1  /* output byte pointer (the buffer argument) */
+    PUT_BUFFER      .req x6  /* bit accumulator; the encoder loads it from [x0, #0x10] */
+    PUT_BITS        .req x7  /* number of pending bits held in PUT_BUFFER */
+    PUT_BITSw       .req w7  /* 32-bit view of PUT_BITS; loaded from [x0, #0x18] */
+
+.macro emit_byte
+    sub             PUT_BITS, PUT_BITS, #0x8  /* take the oldest 8 pending bits */
+    lsr             x19, PUT_BUFFER, PUT_BITS  /* bring them down to bits 7:0 (clobbers x19) */
+    uxtb            w19, w19  /* keep only that byte */
+    strb            w19, [BUFFER, #1]!  /* pre-increment BUFFER, then store the byte */
+    cmp             w19, #0xff  /* clobbers the condition flags */
+    b.ne            14f
+    strb            wzr, [BUFFER, #1]!  /* a 0xff byte is followed by an extra 0x00 byte */
+14:
+.endm
+.macro put_bits CODE, SIZE
+    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE  /* make room for SIZE new bits */
+    add             PUT_BITS, PUT_BITS, \SIZE  /* SIZE more bits are now pending */
+    orr             PUT_BUFFER, PUT_BUFFER, \CODE  /* append CODE; assumes it fits in SIZE bits */
+.endm
+.macro checkbuf31
+    cmp             PUT_BITS, #0x20  /* 32 or more bits pending? */
+    b.lt            31f  /* no: skip; yes: flush 4 bytes with emit_byte */
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+31:
+.endm
+.macro checkbuf47
+    cmp             PUT_BITS, #0x30  /* 48 or more bits pending? */
+    b.lt            47f  /* no: skip; yes: flush 6 bytes with emit_byte */
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+    emit_byte
+47:
+.endm
+
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+/* fast_tbl == 1: zigzag reorder via tbl/tbx index tables; otherwise via per-lane ld1. */
+.balign 16
+.if \fast_tbl == 1
+Ljsimd_huff_encode_one_block_neon_consts:
+.else
+Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
+.endif
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+.if \fast_tbl == 1
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+.endif
+
+.if \fast_tbl == 1
+asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
+    sub             sp, sp, 272             /* frame: 16 bytes x19/x20 + 256 bytes t1/t2 */
+    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
+    /* Save ARM registers */
+    stp             x19, x20, [sp]          /* kept at [sp]: nothing may live below sp */
+.if \fast_tbl == 1
+    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
+.else
+    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
+.endif
+    ldr             PUT_BUFFER, [x0, #0x10]
+    ldr             PUT_BITSw, [x0, #0x18]
+    ldrsh           w12, [x2]               /* load DC coeff in w12 */
+    /* prepare data */
+.if \fast_tbl == 1
+    ld1             {v23.16b}, [x15], #16
+    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
+    /* ZigZag 8x8 */
+    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+    ins             v0.h[0], w12
+    tbx             v1.16b, {v28.16b}, v16.16b
+    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
+    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
+    tbx             v6.16b, {v31.16b}, v19.16b
+.else
+      add             x13, x2, #0x22
+      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
+    ld1             {v23.16b}, [x15]
+      add             x14, x2, #0x18
+      add             x3, x2, #0x36
+    ins             v0.h[0], w12
+      add             x9, x2, #0x2
+    ld1             {v1.h}[0], [x13]
+      add             x15, x2, #0x30
+    ld1             {v2.h}[0], [x14]
+      add             x19, x2, #0x26
+    ld1             {v3.h}[0], [x3]
+      add             x20, x2, #0x28
+    ld1             {v0.h}[1], [x9]
+      add             x12, x2, #0x10
+    ld1             {v1.h}[1], [x15]
+      add             x13, x2, #0x40
+    ld1             {v2.h}[1], [x19]
+      add             x14, x2, #0x34
+    ld1             {v3.h}[1], [x20]
+      add             x3, x2, #0x1a
+    ld1             {v0.h}[2], [x12]
+      add             x9, x2, #0x20
+    ld1             {v1.h}[2], [x13]
+      add             x15, x2, #0x32
+    ld1             {v2.h}[2], [x14]
+      add             x19, x2, #0x42
+    ld1             {v3.h}[2], [x3]
+      add             x20, x2, #0xc
+    ld1             {v0.h}[3], [x9]
+      add             x12, x2, #0x12
+    ld1             {v1.h}[3], [x15]
+      add             x13, x2, #0x24
+    ld1             {v2.h}[3], [x19]
+      add             x14, x2, #0x50
+    ld1             {v3.h}[3], [x20]
+      add             x3, x2, #0xe
+    ld1             {v0.h}[4], [x12]
+      add             x9, x2, #0x4
+    ld1             {v1.h}[4], [x13]
+      add             x15, x2, #0x16
+    ld1             {v2.h}[4], [x14]
+      add             x19, x2, #0x60
+    ld1             {v3.h}[4], [x3]
+      add             x20, x2, #0x1c
+    ld1             {v0.h}[5], [x9]
+      add             x12, x2, #0x6
+    ld1             {v1.h}[5], [x15]
+      add             x13, x2, #0x8
+    ld1             {v2.h}[5], [x19]
+      add             x14, x2, #0x52
+    ld1             {v3.h}[5], [x20]
+      add             x3, x2, #0x2a
+    ld1             {v0.h}[6], [x12]
+      add             x9, x2, #0x14
+    ld1             {v1.h}[6], [x13]
+      add             x15, x2, #0xa
+    ld1             {v2.h}[6], [x14]
+      add             x19, x2, #0x44
+    ld1             {v3.h}[6], [x3]
+      add             x20, x2, #0x38
+    ld1             {v0.h}[7], [x9]
+      add             x12, x2, #0x46
+    ld1             {v1.h}[7], [x15]
+      add             x13, x2, #0x3a
+    ld1             {v2.h}[7], [x19]
+      add             x14, x2, #0x74
+    ld1             {v3.h}[7], [x20]
+      add             x3, x2, #0x6a
+    ld1             {v4.h}[0], [x12]
+      add             x9, x2, #0x54
+    ld1             {v5.h}[0], [x13]
+      add             x15, x2, #0x2c
+    ld1             {v6.h}[0], [x14]
+      add             x19, x2, #0x76
+    ld1             {v7.h}[0], [x3]
+      add             x20, x2, #0x78
+    ld1             {v4.h}[1], [x9]
+      add             x12, x2, #0x62
+    ld1             {v5.h}[1], [x15]
+      add             x13, x2, #0x1e
+    ld1             {v6.h}[1], [x19]
+      add             x14, x2, #0x68
+    ld1             {v7.h}[1], [x20]
+      add             x3, x2, #0x7a
+    ld1             {v4.h}[2], [x12]
+      add             x9, x2, #0x70
+    ld1             {v5.h}[2], [x13]
+      add             x15, x2, #0x2e
+    ld1             {v6.h}[2], [x14]
+      add             x19, x2, #0x5a
+    ld1             {v7.h}[2], [x3]
+      add             x20, x2, #0x6c
+    ld1             {v4.h}[3], [x9]
+      add             x12, x2, #0x72
+    ld1             {v5.h}[3], [x15]
+      add             x13, x2, #0x3c
+    ld1             {v6.h}[3], [x19]
+      add             x14, x2, #0x4c
+    ld1             {v7.h}[3], [x20]
+      add             x3, x2, #0x5e
+    ld1             {v4.h}[4], [x12]
+      add             x9, x2, #0x64
+    ld1             {v5.h}[4], [x13]
+      add             x15, x2, #0x4a
+    ld1             {v6.h}[4], [x14]
+      add             x19, x2, #0x3e
+    ld1             {v7.h}[4], [x3]
+      add             x20, x2, #0x6e
+    ld1             {v4.h}[5], [x9]
+      add             x12, x2, #0x56
+    ld1             {v5.h}[5], [x15]
+      add             x13, x2, #0x58
+    ld1             {v6.h}[5], [x19]
+      add             x14, x2, #0x4e
+    ld1             {v7.h}[5], [x20]
+      add             x3, x2, #0x7c
+    ld1             {v4.h}[6], [x12]
+      add             x9, x2, #0x48
+    ld1             {v5.h}[6], [x13]
+      add             x15, x2, #0x66
+    ld1             {v6.h}[6], [x14]
+      add             x19, x2, #0x5c
+    ld1             {v7.h}[6], [x3]
+      add             x20, x2, #0x7e
+    ld1             {v4.h}[7], [x9]
+    ld1             {v5.h}[7], [x15]
+    ld1             {v6.h}[7], [x19]
+    ld1             {v7.h}[7], [x20]
+.endif
+    cmlt            v24.8h, v0.8h, #0       /* v24-v31 = all-ones lanes where negative */
+    cmlt            v25.8h, v1.8h, #0
+    cmlt            v26.8h, v2.8h, #0
+    cmlt            v27.8h, v3.8h, #0
+    cmlt            v28.8h, v4.8h, #0
+    cmlt            v29.8h, v5.8h, #0
+    cmlt            v30.8h, v6.8h, #0
+    cmlt            v31.8h, v7.8h, #0
+    abs             v0.8h, v0.8h            /* v0-v7 = absolute values */
+    abs             v1.8h, v1.8h
+    abs             v2.8h, v2.8h
+    abs             v3.8h, v3.8h
+    abs             v4.8h, v4.8h
+    abs             v5.8h, v5.8h
+    abs             v6.8h, v6.8h
+    abs             v7.8h, v7.8h
+    eor             v24.16b, v24.16b, v0.16b  /* v24-v31 = abs, inverted where negative */
+    eor             v25.16b, v25.16b, v1.16b
+    eor             v26.16b, v26.16b, v2.16b
+    eor             v27.16b, v27.16b, v3.16b
+    eor             v28.16b, v28.16b, v4.16b
+    eor             v29.16b, v29.16b, v5.16b
+    eor             v30.16b, v30.16b, v6.16b
+    eor             v31.16b, v31.16b, v7.16b
+    cmeq            v16.8h, v0.8h, #0       /* masks of zero coefficients */
+    cmeq            v17.8h, v1.8h, #0
+    cmeq            v18.8h, v2.8h, #0
+    cmeq            v19.8h, v3.8h, #0
+    cmeq            v20.8h, v4.8h, #0
+    cmeq            v21.8h, v5.8h, #0
+    cmeq            v22.8h, v6.8h, #0
+    xtn             v16.8b, v16.8h
+    xtn             v18.8b, v18.8h
+    xtn             v20.8b, v20.8h
+    xtn             v22.8b, v22.8h
+      umov            w14, v0.h[0]
+    xtn2            v16.16b, v17.8h
+      umov            w13, v24.h[0]
+    xtn2            v18.16b, v19.8h
+      clz             w14, w14
+    xtn2            v20.16b, v21.8h
+      lsl             w13, w13, w14
+    cmeq            v17.8h, v7.8h, #0
+      sub             w12, w14, #32
+    xtn2            v22.16b, v17.8h
+      lsr             w13, w13, w14
+    and             v16.16b, v16.16b, v23.16b
+      neg             w12, w12                 /* w12 = 32 - clz = bit length of the DC value */
+    and             v18.16b, v18.16b, v23.16b
+      add             x3, x4, #0x400           /* x3 = dctbl->ehufsi */
+    and             v20.16b, v20.16b, v23.16b
+      add             x15, sp, #0x90           /* x15 = t2 */
+    and             v22.16b, v22.16b, v23.16b
+      ldr             w10, [x4, x12, lsl #2]
+    addp            v16.16b, v16.16b, v18.16b
+      ldrb            w11, [x3, x12]
+    addp            v20.16b, v20.16b, v22.16b
+      checkbuf47
+    addp            v16.16b, v16.16b, v20.16b
+      put_bits        x10, x11
+    addp            v16.16b, v16.16b, v18.16b
+      checkbuf47
+    umov            x9,v16.D[0]
+      put_bits        x13, x12
+    cnt             v17.8b, v16.8b
+      mvn             x9, x9
+    addv            B18, v17.8b
+      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
+    umov            w12, v18.b[0]      /* w12 = number of zero coefficients */
+      lsr             x9, x9, #0x1     /* drop the DC coefficient bit */
+    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
+    rbit            x9, x9             /* x9 = index0 */
+    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
+    cmp             w12, #(64-8)
+    add             x11, sp, #0x10     /* x11 = t1, just above the saved x19/x20 */
+    b.lt            4f                 /* fewer than 56 zeros: vector bit-length path */
+    cbz             x9, 6f
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9                   /* x2 = run of zero coefficients */
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w20, [x15, #-126]        /* t1 entry of the next nonzero coefficient */
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    clz             w20, w20
+    ldrh            w3, [x15, #2]!
+    sub             w11, w20, #32
+    lsl             w3, w3, w20
+    neg             w11, w11                 /* w11 = 32 - clz = bit length */
+    lsr             w3, w3, w20
+    add             x2, x11, x2, lsl #4      /* table index = (run << 4) + bit length */
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+    b               6f
+4:
+    movi            v21.8h, #0x0010
+    clz             v0.8h, v0.8h
+    clz             v1.8h, v1.8h
+    clz             v2.8h, v2.8h
+    clz             v3.8h, v3.8h
+    clz             v4.8h, v4.8h
+    clz             v5.8h, v5.8h
+    clz             v6.8h, v6.8h
+    clz             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    neg             v0.8h, v0.8h
+    neg             v1.8h, v1.8h
+    neg             v2.8h, v2.8h
+    neg             v3.8h, v3.8h
+    neg             v4.8h, v4.8h
+    neg             v5.8h, v5.8h
+    neg             v6.8h, v6.8h
+    neg             v7.8h, v7.8h
+    ushl            v24.8h, v24.8h, v0.8h
+    ushl            v25.8h, v25.8h, v1.8h
+    ushl            v26.8h, v26.8h, v2.8h
+    ushl            v27.8h, v27.8h, v3.8h
+    ushl            v28.8h, v28.8h, v4.8h
+    ushl            v29.8h, v29.8h, v5.8h
+    ushl            v30.8h, v30.8h, v6.8h
+    ushl            v31.8h, v31.8h, v7.8h
+    add             v0.8h, v21.8h, v0.8h     /* v0-v7 = 16 - clz = bit length per lane */
+    add             v1.8h, v21.8h, v1.8h
+    add             v2.8h, v21.8h, v2.8h
+    add             v3.8h, v21.8h, v3.8h
+    add             v4.8h, v21.8h, v4.8h
+    add             v5.8h, v21.8h, v5.8h
+    add             v6.8h, v21.8h, v6.8h
+    add             v7.8h, v21.8h, v7.8h
+    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+    clz             x2, x9
+    add             x15, x15, x2, lsl #1
+    lsl             x9, x9, x2
+    ldrh            w11, [x15, #-126]        /* precomputed bit length from t1 */
+2:
+    cmp             x2, #0x10
+    b.lt            3f
+    sub             x2, x2, #0x10
+    checkbuf47
+    put_bits        x13, x14
+    b               2b
+3:
+    ldrh            w3, [x15, #2]!
+    add             x2, x11, x2, lsl #4
+    lsl             x9, x9, #0x1
+    ldr             w12, [x5, x2, lsl #2]
+    ldrb            w10, [x4, x2]
+    checkbuf31
+    put_bits        x12, x10
+    put_bits        x3, x11
+    cbnz            x9, 1b
+6:
+    add             x13, sp, #0x10e          /* address of the last t2 entry (t2 + 126) */
+    cmp             x15, x13
+    b.hs            1f                       /* last coefficient was coded: no ehufco[0] code */
+    ldr             w12, [x5]
+    ldrb            w14, [x4]
+    checkbuf47
+    put_bits        x12, x14
+1:
+    /* Write back the bit buffer state, restore ARM registers, release the frame */
+    str             PUT_BUFFER, [x0, #0x10]
+    str             PUT_BITSw, [x0, #0x18]
+    ldp             x19, x20, [sp]
+    add             x0, BUFFER, #0x1
+    add             sp, sp, 272
+    br              x30
+
+.endm
+
+generate_jsimd_huff_encode_one_block 1  /* defines jsimd_huff_encode_one_block_neon */
+generate_jsimd_huff_encode_one_block 0  /* defines jsimd_huff_encode_one_block_neon_slowtbl */
+
+    .unreq          BUFFER  /* release the register aliases used by the encoder */
+    .unreq          PUT_BUFFER
+    .unreq          PUT_BITS
+    .unreq          PUT_BITSw
+
+.purgem emit_byte  /* remove the helper macros now that both encoders are expanded */
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 7e8e134..568768f 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -4,7 +4,10 @@
  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
  * All rights reserved.
  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014 Siarhei Siamashka.  All Rights Reserved.
  * Copyright (C) 2014 Linaro Limited.  All Rights Reserved.
+ * Copyright (C) 2015 D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016 Matthieu Darbois.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -24,7 +27,7 @@
  */
 
 #if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
 .text
@@ -32,6 +35,7 @@
 .arch armv7a
 .object_arch armv4
 .arm
+.syntax unified
 
 
 #define RESPECT_STRICT_ALIGNMENT 1
@@ -56,10 +60,10 @@
 
 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
 .macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16 \x0, \x1
-    vtrn.16 \x2, \x3
-    vtrn.32 \x0, \x2
-    vtrn.32 \x1, \x3
+    vtrn.16         \x0, \x1
+    vtrn.16         \x2, \x3
+    vtrn.32         \x0, \x2
+    vtrn.32         \x1, \x3
 .endm
 
 
@@ -71,22 +75,22 @@
  * Perform dequantization and inverse DCT on one block of coefficients.
  *
  * GLOBAL(void)
- * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
+ * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
-#define FIX_0_298631336  (2446)
-#define FIX_0_390180644  (3196)
-#define FIX_0_541196100  (4433)
-#define FIX_0_765366865  (6270)
-#define FIX_0_899976223  (7373)
-#define FIX_1_175875602  (9633)
-#define FIX_1_501321110  (12299)
-#define FIX_1_847759065  (15137)
-#define FIX_1_961570560  (16069)
-#define FIX_2_053119869  (16819)
-#define FIX_2_562915447  (20995)
-#define FIX_3_072711026  (25172)
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
 
 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
@@ -104,8 +108,8 @@
 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
 {                                                                             \
     DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
-    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
-    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
+    JLONG   q1, q2, q3, q4, q5, q6, q7;                                       \
+    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
                                                                               \
     /* 1-D iDCT input data */                                                 \
     row0 = xrow0;                                                             \
@@ -126,7 +130,7 @@
     q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
          MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
     q4 = q6;                                                                  \
-    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
+    q3 = ((JLONG) row0 - (JLONG) row4) << 13;                                 \
     q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
           MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
     /* now we can use q1 (reloadable constants have been used up) */          \
@@ -153,7 +157,7 @@
     /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
     tmp11_minus_tmp2 = q1;                                                    \
                                                                               \
-    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
+    q1 = ((JLONG) row0 + (JLONG) row4) << 13;                                 \
     q2 = q1 + q6;                                                             \
     q1 = q1 - q6;                                                             \
                                                                               \
@@ -168,34 +172,34 @@
     tmp13 = q1;                                                               \
 }
 
-#define XFIX_0_899976223                    d0[0]
-#define XFIX_0_541196100                    d0[1]
-#define XFIX_2_562915447                    d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
-#define XFIX_1_175875602                    d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+#define XFIX_0_899976223                   d0[0]
+#define XFIX_0_541196100                   d0[1]
+#define XFIX_2_562915447                   d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865  d1[2]
+#define XFIX_1_175875602                   d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
 
 .balign 16
 jsimd_idct_islow_neon_consts:
-    .short FIX_0_899976223                    /* d0[0] */
-    .short FIX_0_541196100                    /* d0[1] */
-    .short FIX_2_562915447                    /* d0[2] */
-    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-    .short FIX_1_175875602                    /* d1[3] */
-    /* reloadable constants */
-    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
 
 asm_function jsimd_idct_islow_neon
 
@@ -254,140 +258,141 @@
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
     vmul.s16        q14, q14, q2
     vmul.s16        q13, q13, q1
-    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
     add             ip, ip, #16
     vmul.s16        q15, q15, q3
-    vpush           {d8-d15} /* save NEON registers */
+    vpush           {d8-d15}                      /* save NEON registers */
     /* 1-D IDCT, pass 1, left 4x8 half */
-    vadd.s16        d4,    ROW7L, ROW3L
-    vadd.s16        d5,    ROW5L, ROW1L
-    vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    d5,    XFIX_1_175875602
-    vmull.s16       q7,    d4,    XFIX_1_175875602
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
       /* Check for the zero coefficients in the right 4x8 half */
       push            {r4, r5}
-    vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3,    ROW0L, ROW4L
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
-      orr             r0,    r4,    r5
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3,    q3,    #13
-      orr             r0,    r0,    r4
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-      orr             r0,    r0,    r5
-    vadd.s32        q1,    q3,    q2
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-      orr             r0,    r0,    r4
-    vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
-      orr             r0,    r0,    r5
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1L, q1,    #11
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
-      orr             r0,    r0,    r4
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-      orr             r0,    r0,    r5
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    vmlal.s16       q6,    ROW6L, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-      orr             r0,    r0,    r4
-    vrshrn.s32      ROW6L, q1,    #11
-      orr             r0,    r0,    r5
-    vadd.s32        q1,    q3,    q5
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0L, ROW4L
-      orr             r0,    r0,    r4
-    vrshrn.s32      ROW2L, q1,    #11
-      orr             r0,    r0,    r5
-    vrshrn.s32      ROW5L, q3,    #11
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
-      orr             r0,    r0,    r4
-    vadd.s32        q2,    q5,    q6
-      orrs            r0,    r0,    r5
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-      ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-      orr             r0,    r4,    r5
-    vsub.s32        q3,    q1,    q4
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5
+    vsub.s32        q3, q1, q4
       pop             {r4, r5}
-    vrshrn.s32      ROW7L, q2,    #11
-    vrshrn.s32      ROW3L, q5,    #11
-    vrshrn.s32      ROW0L, q6,    #11
-    vrshrn.s32      ROW4L, q3,    #11
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
 
-      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
 
     /* 1-D IDCT, pass 1, right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vadd.s16        d10,   ROW7R, ROW3R
-    vadd.s16        d8,    ROW5R, ROW1R
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
       /* Transpose left 4x8 half */
       vtrn.16         ROW6L, ROW7L
-    vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    d8,    XFIX_1_175875602
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
       vtrn.16         ROW2L, ROW3L
-    vmull.s16       q7,    d10,   XFIX_1_175875602
-    vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
       vtrn.16         ROW0L, ROW1L
-    vsubl.s16       q3,    ROW0R, ROW4R
-    vmull.s16       q2,    ROW2R, XFIX_0_541196100
-    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
       vtrn.16         ROW4L, ROW5L
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
-    vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
       vtrn.32         ROW1L, ROW3L
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
       vtrn.32         ROW4L, ROW6L
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
       vtrn.32         ROW0L, ROW2L
-    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
-    vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1R, q1,    #11
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
       vtrn.32         ROW5L, ROW7L
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-    vrshrn.s32      ROW6R, q1,    #11
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0R, ROW4R
-    vrshrn.s32      ROW2R, q1,    #11
-    vrshrn.s32      ROW5R, q3,    #11
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vrshrn.s32      ROW7R, q2,    #11
-    vrshrn.s32      ROW3R, q5,    #11
-    vrshrn.s32      ROW0R, q6,    #11
-    vrshrn.s32      ROW4R, q3,    #11
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
     /* Transpose right 4x8 half */
     vtrn.16         ROW6R, ROW7R
     vtrn.16         ROW2R, ROW3R
@@ -399,122 +404,122 @@
     vtrn.32         ROW5R, ROW7R
 
 1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW1L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
-    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW3L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
-    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-    vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32       ROW1L, q1,    #16
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW2L, q1,    #16
-    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5,    #16
-    vshrn.s32       ROW0L, q6,    #16
-    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2, right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW5R, XFIX_1_175875602
-    vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
-    vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
-    vmull.s16       q7,    ROW7R, XFIX_1_175875602
-    vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
-    vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
-    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
-    vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
-    vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
-    vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
-    vmov            q4,    q6
-    vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
-    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
-    vshl.s32        q3,    q3,    #13
-    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vadd.s32        q1,    q1,    q6
-    vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
-    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
-    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
-    vsub.s32        q1,    q1,    q6
-    vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
-    vmlal.s16       q6,    ROW6R, XFIX_0_541196100
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW6R, q1,    #16
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3,    #16
-    vshl.s32        q5,    q5,    #13
-    vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW7R, q2,    #16
-    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3,    #16
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
 
 2:  /* Descale to 8-bit and range limit */
-    vqrshrn.s16     d16,   q8,    #2
-    vqrshrn.s16     d17,   q9,    #2
-    vqrshrn.s16     d18,   q10,   #2
-    vqrshrn.s16     d19,   q11,   #2
-    vpop            {d8-d15} /* restore NEON registers */
-    vqrshrn.s16     d20,   q12,   #2
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8-d15}                      /* restore NEON registers */
+    vqrshrn.s16     d20, q12, #2
       /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-      vtrn.16         q8,    q9
-    vqrshrn.s16     d21,   q13,   #2
-    vqrshrn.s16     d22,   q14,   #2
-      vmov.u8         q0,    #(CENTERJSAMPLE)
-    vqrshrn.s16     d23,   q15,   #2
-      vtrn.8          d16,   d17
-      vtrn.8          d18,   d19
-      vadd.u8         q8,    q8,    q0
-      vadd.u8         q9,    q9,    q0
-      vtrn.16         q10,   q11
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
         /* Store results to the output buffer */
         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
         add             TMP1, TMP1, OUTPUT_COL
@@ -526,7 +531,7 @@
         add             TMP1, TMP1, OUTPUT_COL
         add             TMP2, TMP2, OUTPUT_COL
         vst1.8          {d18}, [TMP1]
-      vadd.u8         q10,   q10,   q0
+      vadd.u8         q10, q10, q0
         vst1.8          {d19}, [TMP2]
         ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
         add             TMP1, TMP1, OUTPUT_COL
@@ -535,7 +540,7 @@
         add             TMP4, TMP4, OUTPUT_COL
       vtrn.8          d22, d23
         vst1.8          {d20}, [TMP1]
-      vadd.u8         q11,   q11,   q0
+      vadd.u8         q11, q11, q0
         vst1.8          {d21}, [TMP2]
         vst1.8          {d22}, [TMP3]
         vst1.8          {d23}, [TMP4]
@@ -548,14 +553,15 @@
     vtrn.16         ROW2L, ROW3L
     vtrn.16         ROW0L, ROW1L
     vtrn.16         ROW4L, ROW5L
-    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
     vtrn.32         ROW1L, ROW3L
     vtrn.32         ROW4L, ROW6L
     vtrn.32         ROW0L, ROW2L
     vtrn.32         ROW5L, ROW7L
 
     cmp             r0, #0
-    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
 
     /* Only row 0 is non-zero for the right 4x8 half  */
     vdup.s16        ROW1R, ROW0R[1]
@@ -566,83 +572,83 @@
     vdup.s16        ROW6R, ROW0R[2]
     vdup.s16        ROW7R, ROW0R[3]
     vdup.s16        ROW0R, ROW0R[0]
-    b               1b /* Go to 'normal' second pass */
+    b               1b  /* Go to 'normal' second pass */
 
 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW1L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW3L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2,    ROW2L, XFIX_0_541196100
-    vshll.s16       q3,    ROW0L, #13
-    vmov            q4,    q6
-    vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1,    q1,    q6
-    vadd.s32        q6,    q6,    q6
-    vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
-    vshrn.s32       ROW1L, q1,    #16
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vshll.s16       q5,    ROW0L, #13
-    vshrn.s32       ROW2L, q1,    #16
-    vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5,    #16
-    vshrn.s32       ROW0L, q6,    #16
-    vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
     /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    vld1.s16        {d2},  [ip, :64]    /* reload constants */
-    vmull.s16       q6,    ROW5L, XFIX_1_175875602
-    vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7,    ROW7L, XFIX_1_175875602
-    vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2,    ROW6L, XFIX_0_541196100
-    vshll.s16       q3,    ROW4L, #13
-    vmov            q4,    q6
-    vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
-    vadd.s32        q1,    q3,    q2
-    vmov            q5,    q7
-    vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1,    q1,    q6
-    vadd.s32        q6,    q6,    q6
-    vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
-    vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
-    vsub.s32        q1,    q1,    q6
-    vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3,    q3,    q2
-    vshrn.s32       ROW6R, q1,    #16
-    vadd.s32        q1,    q3,    q5
-    vsub.s32        q3,    q3,    q5
-    vshll.s16       q5,    ROW4L, #13
-    vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3,    #16
-    vadd.s32        q2,    q5,    q6
-    vsub.s32        q1,    q5,    q6
-    vadd.s32        q6,    q2,    q7
-    vsub.s32        q2,    q2,    q7
-    vadd.s32        q5,    q1,    q4
-    vsub.s32        q3,    q1,    q4
-    vshrn.s32       ROW7R, q2,    #16
-    vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3,    #16
-    b               2b /* Go to epilogue */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
 
     .unreq          DCT_TABLE
     .unreq          COEF_BLOCK
@@ -696,10 +702,10 @@
 
 .balign 16
 jsimd_idct_ifast_neon_consts:
-    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
-    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
-    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
-    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
 
 asm_function jsimd_idct_ifast_neon
 
@@ -729,9 +735,9 @@
     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8,  q8,  q0
+    vmul.s16        q8, q8, q0
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9,  q9,  q1
+    vmul.s16        q9, q9, q1
     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
     vmul.s16        q10, q10, q2
     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
@@ -741,124 +747,124 @@
     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
     vmul.s16        q14, q14, q2
     vmul.s16        q13, q13, q1
-    vld1.16         {d0}, [ip, :64] /* load constants */
+    vld1.16         {d0}, [ip, :64]  /* load constants */
     vmul.s16        q15, q15, q3
-    vpush           {d8-d13}        /* save NEON registers */
+    vpush           {d8-d13}         /* save NEON registers */
     /* 1-D IDCT, pass 1 */
-    vsub.s16        q2,  q10, q14
+    vsub.s16        q2, q10, q14
     vadd.s16        q14, q10, q14
-    vsub.s16        q1,  q11, q13
+    vsub.s16        q1, q11, q13
     vadd.s16        q13, q11, q13
-    vsub.s16        q5,  q9,  q15
-    vadd.s16        q15, q9,  q15
-    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
-    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
-    vadd.s16        q3,  q1,  q1
-    vsub.s16        q1,  q5,  q1
-    vadd.s16        q10, q2,  q4
-    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
-    vsub.s16        q2,  q15, q13
-    vadd.s16        q3,  q3,  q6
-    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
-    vadd.s16        q1,  q1,  q4
-    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
     vsub.s16        q10, q10, q14
-    vadd.s16        q2,  q2,  q6
-    vsub.s16        q6,  q8,  q12
-    vadd.s16        q12, q8,  q12
-    vadd.s16        q9,  q5,  q4
-    vadd.s16        q5,  q6,  q10
-    vsub.s16        q10, q6,  q10
-    vadd.s16        q6,  q15, q13
-    vadd.s16        q8,  q12, q14
-    vsub.s16        q3,  q6,  q3
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
     vsub.s16        q12, q12, q14
-    vsub.s16        q3,  q3,  q1
-    vsub.s16        q1,  q9,  q1
-    vadd.s16        q2,  q3,  q2
-    vsub.s16        q15, q8,  q6
-    vadd.s16        q1,  q1,  q2
-    vadd.s16        q8,  q8,  q6
-    vadd.s16        q14, q5,  q3
-    vsub.s16        q9,  q5,  q3
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
     vsub.s16        q13, q10, q2
     vadd.s16        q10, q10, q2
       /* Transpose */
-      vtrn.16         q8,  q9
+      vtrn.16         q8, q9
     vsub.s16        q11, q12, q1
       vtrn.16         q14, q15
     vadd.s16        q12, q12, q1
       vtrn.16         q10, q11
       vtrn.16         q12, q13
-      vtrn.32         q9,  q11
+      vtrn.32         q9, q11
       vtrn.32         q12, q14
-      vtrn.32         q8,  q10
+      vtrn.32         q8, q10
       vtrn.32         q13, q15
       vswp            d28, d21
       vswp            d26, d19
     /* 1-D IDCT, pass 2 */
-    vsub.s16        q2,  q10, q14
+    vsub.s16        q2, q10, q14
       vswp            d30, d23
     vadd.s16        q14, q10, q14
       vswp            d24, d17
-    vsub.s16        q1,  q11, q13
+    vsub.s16        q1, q11, q13
     vadd.s16        q13, q11, q13
-    vsub.s16        q5,  q9,  q15
-    vadd.s16        q15, q9,  q15
-    vqdmulh.s16     q4,  q2,  XFIX_1_414213562
-    vqdmulh.s16     q6,  q1,  XFIX_2_613125930
-    vadd.s16        q3,  q1,  q1
-    vsub.s16        q1,  q5,  q1
-    vadd.s16        q10, q2,  q4
-    vqdmulh.s16     q4,  q1,  XFIX_1_847759065
-    vsub.s16        q2,  q15, q13
-    vadd.s16        q3,  q3,  q6
-    vqdmulh.s16     q6,  q2,  XFIX_1_414213562
-    vadd.s16        q1,  q1,  q4
-    vqdmulh.s16     q4,  q5,  XFIX_1_082392200
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
     vsub.s16        q10, q10, q14
-    vadd.s16        q2,  q2,  q6
-    vsub.s16        q6,  q8,  q12
-    vadd.s16        q12, q8,  q12
-    vadd.s16        q9,  q5,  q4
-    vadd.s16        q5,  q6,  q10
-    vsub.s16        q10, q6,  q10
-    vadd.s16        q6,  q15, q13
-    vadd.s16        q8,  q12, q14
-    vsub.s16        q3,  q6,  q3
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
     vsub.s16        q12, q12, q14
-    vsub.s16        q3,  q3,  q1
-    vsub.s16        q1,  q9,  q1
-    vadd.s16        q2,  q3,  q2
-    vsub.s16        q15, q8,  q6
-    vadd.s16        q1,  q1,  q2
-    vadd.s16        q8,  q8,  q6
-    vadd.s16        q14, q5,  q3
-    vsub.s16        q9,  q5,  q3
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
     vsub.s16        q13, q10, q2
-    vpop            {d8-d13}        /* restore NEON registers */
+    vpop            {d8-d13}      /* restore NEON registers */
     vadd.s16        q10, q10, q2
     vsub.s16        q11, q12, q1
     vadd.s16        q12, q12, q1
     /* Descale to 8-bit and range limit */
-    vmov.u8         q0,  #0x80
-    vqshrn.s16      d16, q8,  #5
-    vqshrn.s16      d17, q9,  #5
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
     vqshrn.s16      d18, q10, #5
     vqshrn.s16      d19, q11, #5
     vqshrn.s16      d20, q12, #5
     vqshrn.s16      d21, q13, #5
     vqshrn.s16      d22, q14, #5
     vqshrn.s16      d23, q15, #5
-    vadd.u8         q8,  q8,  q0
-    vadd.u8         q9,  q9,  q0
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
     vadd.u8         q10, q10, q0
     vadd.u8         q11, q11, q0
     /* Transpose the final 8-bit samples */
-    vtrn.16         q8,  q9
+    vtrn.16         q8, q9
     vtrn.16         q10, q11
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
     vtrn.8          d16, d17
     vtrn.8          d18, d19
       /* Store results to the output buffer */
@@ -917,81 +923,80 @@
 
 #define CONST_BITS  13
 
-#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
 
 .balign 16
 jsimd_idct_4x4_neon_consts:
-    .short     FIX_1_847759065     /* d0[0] */
-    .short     -FIX_0_765366865    /* d0[1] */
-    .short     -FIX_0_211164243    /* d0[2] */
-    .short     FIX_1_451774981     /* d0[3] */
-    .short     -FIX_2_172734803    /* d1[0] */
-    .short     FIX_1_061594337     /* d1[1] */
-    .short     -FIX_0_509795579    /* d1[2] */
-    .short     -FIX_0_601344887    /* d1[3] */
-    .short     FIX_0_899976223     /* d2[0] */
-    .short     FIX_2_562915447     /* d2[1] */
-    .short     1 << (CONST_BITS+1) /* d2[2] */
-    .short     0                   /* d2[3] */
+  .short FIX_1_847759065      /* d0[0] */
+  .short -FIX_0_765366865     /* d0[1] */
+  .short -FIX_0_211164243     /* d0[2] */
+  .short FIX_1_451774981      /* d0[3] */
+  .short -FIX_2_172734803     /* d1[0] */
+  .short FIX_1_061594337      /* d1[1] */
+  .short -FIX_0_509795579     /* d1[2] */
+  .short -FIX_0_601344887     /* d1[3] */
+  .short FIX_0_899976223      /* d2[0] */
+  .short FIX_2_562915447      /* d2[1] */
+  .short 1 << (CONST_BITS+1)  /* d2[2] */
+  .short 0                    /* d2[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    vmull.s16       q14, \x4,  d2[2]
-    vmlal.s16       q14, \x8,  d0[0]
+    vmull.s16       q14, \x4, d2[2]
+    vmlal.s16       q14, \x8, d0[0]
     vmlal.s16       q14, \x14, d0[1]
 
     vmull.s16       q13, \x16, d1[2]
     vmlal.s16       q13, \x12, d1[3]
     vmlal.s16       q13, \x10, d2[0]
-    vmlal.s16       q13, \x6,  d2[1]
+    vmlal.s16       q13, \x6, d2[1]
 
-    vmull.s16       q15, \x4,  d2[2]
-    vmlsl.s16       q15, \x8,  d0[0]
+    vmull.s16       q15, \x4, d2[2]
+    vmlsl.s16       q15, \x8, d0[0]
     vmlsl.s16       q15, \x14, d0[1]
 
     vmull.s16       q12, \x16, d0[2]
     vmlal.s16       q12, \x12, d0[3]
     vmlal.s16       q12, \x10, d1[0]
-    vmlal.s16       q12, \x6,  d1[1]
+    vmlal.s16       q12, \x6, d1[1]
 
     vadd.s32        q10, q14, q13
     vsub.s32        q14, q14, q13
 
-.if \shift > 16
-    vrshr.s32       q10,  q10, #\shift
-    vrshr.s32       q14,  q14, #\shift
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q14, q14, #\shift
     vmovn.s32       \y26, q10
     vmovn.s32       \y29, q14
-.else
+  .else
     vrshrn.s32      \y26, q10, #\shift
     vrshrn.s32      \y29, q14, #\shift
-.endif
+  .endif
 
     vadd.s32        q10, q15, q12
     vsub.s32        q15, q15, q12
 
-.if \shift > 16
-    vrshr.s32       q10,  q10, #\shift
-    vrshr.s32       q15,  q15, #\shift
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q15, q15, #\shift
     vmovn.s32       \y27, q10
     vmovn.s32       \y28, q15
-.else
+  .else
     vrshrn.s32      \y27, q10, #\shift
     vrshrn.s32      \y28, q15, #\shift
-.endif
-
+  .endif
 .endm
 
 asm_function jsimd_idct_4x4_neon
@@ -1127,31 +1132,30 @@
 
 .balign 8
 jsimd_idct_2x2_neon_consts:
-    .short     -FIX_0_720959822    /* d0[0] */
-    .short     FIX_0_850430095     /* d0[1] */
-    .short     -FIX_1_272758580    /* d0[2] */
-    .short     FIX_3_624509785     /* d0[3] */
+  .short -FIX_0_720959822  /* d0[0] */
+  .short FIX_0_850430095   /* d0[1] */
+  .short -FIX_1_272758580  /* d0[2] */
+  .short FIX_3_624509785   /* d0[3] */
 
 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    vshll.s16  q14,  \x4,  #15
-    vmull.s16  q13,  \x6,  d0[3]
-    vmlal.s16  q13,  \x10, d0[2]
-    vmlal.s16  q13,  \x12, d0[1]
-    vmlal.s16  q13,  \x16, d0[0]
+    vshll.s16       q14, \x4, #15
+    vmull.s16       q13, \x6, d0[3]
+    vmlal.s16       q13, \x10, d0[2]
+    vmlal.s16       q13, \x12, d0[1]
+    vmlal.s16       q13, \x16, d0[0]
 
-    vadd.s32   q10,  q14,  q13
-    vsub.s32   q14,  q14,  q13
+    vadd.s32        q10, q14, q13
+    vsub.s32        q14, q14, q13
 
-.if \shift > 16
-    vrshr.s32  q10,  q10,  #\shift
-    vrshr.s32  q14,  q14,  #\shift
-    vmovn.s32  \y26, q10
-    vmovn.s32  \y27, q14
-.else
-    vrshrn.s32 \y26, q10,  #\shift
-    vrshrn.s32 \y27, q14,  #\shift
-.endif
-
+  .if \shift > 16
+    vrshr.s32       q10, q10, #\shift
+    vrshr.s32       q14, q14, #\shift
+    vmovn.s32       \y26, q10
+    vmovn.s32       \y27, q14
+  .else
+    vrshrn.s32      \y26, q10, #\shift
+    vrshrn.s32      \y27, q14, #\shift
+  .endif
 .endm
 
 asm_function jsimd_idct_2x2_neon
@@ -1205,30 +1209,30 @@
     /* Pass 1 */
 #if 0
     idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
-    transpose_4x4   d4, d6, d8,  d10
+    transpose_4x4   d4, d6, d8, d10
     idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
-    transpose_4x4   d5, d7, d9,  d11
+    transpose_4x4   d5, d7, d9, d11
 #else
-    vmull.s16       q13, d6,  d0[3]
+    vmull.s16       q13, d6, d0[3]
     vmlal.s16       q13, d10, d0[2]
     vmlal.s16       q13, d12, d0[1]
     vmlal.s16       q13, d16, d0[0]
-    vmull.s16       q12, d7,  d0[3]
+    vmull.s16       q12, d7, d0[3]
     vmlal.s16       q12, d11, d0[2]
     vmlal.s16       q12, d13, d0[1]
     vmlal.s16       q12, d17, d0[0]
-    vshll.s16       q14, d4,  #15
-    vshll.s16       q15, d5,  #15
+    vshll.s16       q14, d4, #15
+    vshll.s16       q15, d5, #15
     vadd.s32        q10, q14, q13
     vsub.s32        q14, q14, q13
-    vrshrn.s32      d4,  q10, #13
-    vrshrn.s32      d6,  q14, #13
+    vrshrn.s32      d4, q10, #13
+    vrshrn.s32      d6, q14, #13
     vadd.s32        q10, q15, q12
     vsub.s32        q14, q15, q12
-    vrshrn.s32      d5,  q10, #13
-    vrshrn.s32      d7,  q14, #13
-    vtrn.16         q2,  q3
-    vtrn.32         q3,  q5
+    vrshrn.s32      d5, q10, #13
+    vrshrn.s32      d7, q14, #13
+    vtrn.16         q2, q3
+    vtrn.32         q3, q5
 #endif
 
     /* Pass 2 */
@@ -1278,110 +1282,110 @@
 
 
 .macro do_load size
-    .if \size == 8
-        vld1.8  {d4}, [U, :64]!
-        vld1.8  {d5}, [V, :64]!
-        vld1.8  {d0}, [Y, :64]!
-        pld     [U, #64]
-        pld     [V, #64]
-        pld     [Y, #64]
-    .elseif \size == 4
-        vld1.8  {d4[0]}, [U]!
-        vld1.8  {d4[1]}, [U]!
-        vld1.8  {d4[2]}, [U]!
-        vld1.8  {d4[3]}, [U]!
-        vld1.8  {d5[0]}, [V]!
-        vld1.8  {d5[1]}, [V]!
-        vld1.8  {d5[2]}, [V]!
-        vld1.8  {d5[3]}, [V]!
-        vld1.8  {d0[0]}, [Y]!
-        vld1.8  {d0[1]}, [Y]!
-        vld1.8  {d0[2]}, [Y]!
-        vld1.8  {d0[3]}, [Y]!
-    .elseif \size == 2
-        vld1.8  {d4[4]}, [U]!
-        vld1.8  {d4[5]}, [U]!
-        vld1.8  {d5[4]}, [V]!
-        vld1.8  {d5[5]}, [V]!
-        vld1.8  {d0[4]}, [Y]!
-        vld1.8  {d0[5]}, [Y]!
-    .elseif \size == 1
-        vld1.8  {d4[6]}, [U]!
-        vld1.8  {d5[6]}, [V]!
-        vld1.8  {d0[6]}, [Y]!
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    vld1.8          {d4}, [U, :64]!
+    vld1.8          {d5}, [V, :64]!
+    vld1.8          {d0}, [Y, :64]!
+    pld             [U, #64]
+    pld             [V, #64]
+    pld             [Y, #64]
+  .elseif \size == 4
+    vld1.8          {d4[0]}, [U]!
+    vld1.8          {d4[1]}, [U]!
+    vld1.8          {d4[2]}, [U]!
+    vld1.8          {d4[3]}, [U]!
+    vld1.8          {d5[0]}, [V]!
+    vld1.8          {d5[1]}, [V]!
+    vld1.8          {d5[2]}, [V]!
+    vld1.8          {d5[3]}, [V]!
+    vld1.8          {d0[0]}, [Y]!
+    vld1.8          {d0[1]}, [Y]!
+    vld1.8          {d0[2]}, [Y]!
+    vld1.8          {d0[3]}, [Y]!
+  .elseif \size == 2
+    vld1.8          {d4[4]}, [U]!
+    vld1.8          {d4[5]}, [U]!
+    vld1.8          {d5[4]}, [V]!
+    vld1.8          {d5[5]}, [V]!
+    vld1.8          {d0[4]}, [Y]!
+    vld1.8          {d0[5]}, [Y]!
+  .elseif \size == 1
+    vld1.8          {d4[6]}, [U]!
+    vld1.8          {d5[6]}, [V]!
+    vld1.8          {d0[6]}, [Y]!
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 .macro do_store bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            vst3.8  {d10, d11, d12}, [RGB]!
-        .elseif \size == 4
-            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
-            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
-            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
-            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
-        .elseif \size == 2
-            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
-            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
-        .elseif \size == 1
-            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            vst4.8  {d10, d11, d12, d13}, [RGB]!
-        .elseif \size == 4
-            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-        .elseif \size == 2
-            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-        .elseif \size == 1
-            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 16
-        .if \size == 8
-            vst1.16  {q15}, [RGB]!
-        .elseif \size == 4
-            vst1.16  {d30}, [RGB]!
-        .elseif \size == 2
-            vst1.16  {d31[0]}, [RGB]!
-            vst1.16  {d31[1]}, [RGB]!
-        .elseif \size == 1
-            vst1.16  {d31[2]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
+  .if \bpp == 24
+    .if \size == 8
+      vst3.8        {d10, d11, d12}, [RGB]!
+    .elseif \size == 4
+      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
     .else
-        .error unsupported bpp
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vst4.8        {d10, d11, d12, d13}, [RGB]!
+    .elseif \size == 4
+      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 16
+    .if \size == 8
+      vst1.16       {q15}, [RGB]!
+    .elseif \size == 4
+      vst1.16       {d30}, [RGB]!
+    .elseif \size == 2
+      vst1.16       {d31[0]}, [RGB]!
+      vst1.16       {d31[1]}, [RGB]!
+    .elseif \size == 1
+      vst1.16       {d31[2]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 
 /*
- * 2 stage pipelined YCbCr->RGB conversion
+ * 2-stage pipelined YCbCr->RGB conversion
  */
 
 .macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5     /* q2 = v - 128 */
-    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
-    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
-    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
+    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
+    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
+    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
+    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
+    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
+    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
+    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
+    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
 .endm
 
 .macro do_yuv_to_rgb_stage2
@@ -1394,68 +1398,68 @@
     vaddw.u8        q11, q10, d0
     vaddw.u8        q12, q12, d0
     vaddw.u8        q14, q14, d0
-.if \bpp != 16
+  .if \bpp != 16
     vqmovun.s16     d1\g_offs, q11
     vqmovun.s16     d1\r_offs, q12
     vqmovun.s16     d1\b_offs, q14
-.else /* rgb565 */
+  .else  /* rgb565 */
     vqshlu.s16      q13, q11, #8
     vqshlu.s16      q15, q12, #8
     vqshlu.s16      q14, q14, #8
     vsri.u16        q15, q13, #5
     vsri.u16        q15, q14, #11
-.endif
+  .endif
 .endm
 
 .macro do_yuv_to_rgb_stage2_store_load_stage1
-      /* "do_yuv_to_rgb_stage2" and "store" */
-      vrshrn.s32      d20, q10, #15
+                                       /* "do_yuv_to_rgb_stage2" and "store" */
+                                       vrshrn.s32      d20, q10, #15
     /* "load" and "do_yuv_to_rgb_stage1" */
     pld             [U, #64]
-      vrshrn.s32      d21, q11, #15
+                                       vrshrn.s32      d21, q11, #15
     pld             [V, #64]
-      vrshrn.s32      d24, q12, #14
-      vrshrn.s32      d25, q13, #14
+                                       vrshrn.s32      d24, q12, #14
+                                       vrshrn.s32      d25, q13, #14
     vld1.8          {d4}, [U, :64]!
-      vrshrn.s32      d28, q14, #14
+                                       vrshrn.s32      d28, q14, #14
     vld1.8          {d5}, [V, :64]!
-      vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5     /* q2 = v - 128 */
-      vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
-      vaddw.u8        q12, q12, d0
-      vaddw.u8        q14, q14, d0
-.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
-      vqmovun.s16     d1\g_offs, q11
+                                       vrshrn.s32      d29, q15, #14
+    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
+    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
+                                       vaddw.u8        q11, q10, d0
+    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
+                                       vaddw.u8        q12, q12, d0
+                                       vaddw.u8        q14, q14, d0
+  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
+                                       vqmovun.s16     d1\g_offs, q11
     pld             [Y, #64]
-      vqmovun.s16     d1\r_offs, q12
+                                       vqmovun.s16     d1\r_offs, q12
     vld1.8          {d0}, [Y, :64]!
-      vqmovun.s16     d1\b_offs, q14
-    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
-      do_store        \bpp, 8
-    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
-.else /**************************** rgb565 ***********************************/
-      vqshlu.s16      q13, q11, #8
+                                       vqmovun.s16     d1\b_offs, q14
+    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
+    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
+                                       do_store        \bpp, 8
+    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
+    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
+    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
+    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
+  .else  /**************************** rgb565 ********************************/
+                                       vqshlu.s16      q13, q11, #8
     pld             [Y, #64]
-      vqshlu.s16      q15, q12, #8
-      vqshlu.s16      q14, q14, #8
+                                       vqshlu.s16      q15, q12, #8
+                                       vqshlu.s16      q14, q14, #8
     vld1.8          {d0}, [Y, :64]!
     vmull.s16       q11, d7, d1[1]
     vmlal.s16       q11, d9, d1[2]
-      vsri.u16        q15, q13, #5
+                                       vsri.u16        q15, q13, #5
     vmull.s16       q12, d8, d1[0]
-      vsri.u16        q15, q14, #11
+                                       vsri.u16        q15, q14, #11
     vmull.s16       q13, d9, d1[0]
     vmull.s16       q14, d6, d1[3]
-      do_store        \bpp, 8
+                                       do_store        \bpp, 8
     vmull.s16       q15, d7, d1[3]
-.endif
+  .endif
 .endm
 
 .macro do_yuv_to_rgb
@@ -1469,10 +1473,10 @@
 
 .balign 16
 jsimd_ycc_\colorid\()_neon_consts:
-    .short          0,      0,     0,      0
-    .short          22971, -11277, -23401, 29033
-    .short          -128,  -128,   -128,   -128
-    .short          -128,  -128,   -128,   -128
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
 
 asm_function jsimd_ycc_\colorid\()_convert_neon
     OUTPUT_WIDTH    .req r0
@@ -1617,123 +1621,123 @@
  */
 
 .macro do_store size
-    .if \size == 8
-        vst1.8  {d20}, [Y]!
-        vst1.8  {d21}, [U]!
-        vst1.8  {d22}, [V]!
-    .elseif \size == 4
-        vst1.8  {d20[0]}, [Y]!
-        vst1.8  {d20[1]}, [Y]!
-        vst1.8  {d20[2]}, [Y]!
-        vst1.8  {d20[3]}, [Y]!
-        vst1.8  {d21[0]}, [U]!
-        vst1.8  {d21[1]}, [U]!
-        vst1.8  {d21[2]}, [U]!
-        vst1.8  {d21[3]}, [U]!
-        vst1.8  {d22[0]}, [V]!
-        vst1.8  {d22[1]}, [V]!
-        vst1.8  {d22[2]}, [V]!
-        vst1.8  {d22[3]}, [V]!
-    .elseif \size == 2
-        vst1.8  {d20[4]}, [Y]!
-        vst1.8  {d20[5]}, [Y]!
-        vst1.8  {d21[4]}, [U]!
-        vst1.8  {d21[5]}, [U]!
-        vst1.8  {d22[4]}, [V]!
-        vst1.8  {d22[5]}, [V]!
-    .elseif \size == 1
-        vst1.8  {d20[6]}, [Y]!
-        vst1.8  {d21[6]}, [U]!
-        vst1.8  {d22[6]}, [V]!
-    .else
-        .error unsupported macroblock size
-    .endif
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
 .endm
 
 .macro do_load bpp, size
-    .if \bpp == 24
-        .if \size == 8
-            vld3.8  {d10, d11, d12}, [RGB]!
-            pld     [RGB, #128]
-        .elseif \size == 4
-            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
-            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
-            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
-            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
-        .elseif \size == 2
-            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
-            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
-        .elseif \size == 1
-            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
-    .elseif \bpp == 32
-        .if \size == 8
-            vld4.8  {d10, d11, d12, d13}, [RGB]!
-            pld     [RGB, #128]
-        .elseif \size == 4
-            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-        .elseif \size == 2
-            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-        .elseif \size == 1
-            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-        .else
-            .error unsupported macroblock size
-        .endif
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
     .else
-        .error unsupported bpp
+      .error unsupported macroblock size
     .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
 .endm
 
 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
 
 /*
- * 2 stage pipelined RGB->YCbCr conversion
+ * 2-stage pipelined RGB->YCbCr conversion
  */
 
 .macro do_rgb_to_yuv_stage1
-    vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
-    vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
-    vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
-    vmull.u16   q7, d4, d0[0]
-    vmlal.u16   q7, d6, d0[1]
-    vmlal.u16   q7, d8, d0[2]
-    vmull.u16   q8, d5, d0[0]
-    vmlal.u16   q8, d7, d0[1]
-    vmlal.u16   q8, d9, d0[2]
-    vrev64.32   q9,  q1
-    vrev64.32   q13, q1
-    vmlsl.u16   q9,  d4, d0[3]
-    vmlsl.u16   q9,  d6, d1[0]
-    vmlal.u16   q9,  d8, d1[1]
-    vmlsl.u16   q13, d5, d0[3]
-    vmlsl.u16   q13, d7, d1[0]
-    vmlal.u16   q13, d9, d1[1]
-    vrev64.32   q14, q1
-    vrev64.32   q15, q1
-    vmlal.u16   q14, d4, d1[1]
-    vmlsl.u16   q14, d6, d1[2]
-    vmlsl.u16   q14, d8, d1[3]
-    vmlal.u16   q15, d5, d1[1]
-    vmlsl.u16   q15, d7, d1[2]
-    vmlsl.u16   q15, d9, d1[3]
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
 .endm
 
 .macro do_rgb_to_yuv_stage2
-    vrshrn.u32  d20, q7,  #16
-    vrshrn.u32  d21, q8,  #16
-    vshrn.u32   d22, q9,  #16
-    vshrn.u32   d23, q13, #16
-    vshrn.u32   d24, q14, #16
-    vshrn.u32   d25, q15, #16
-    vmovn.u16   d20, q10      /* d20 = y */
-    vmovn.u16   d21, q11      /* d21 = u */
-    vmovn.u16   d22, q12      /* d22 = v */
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
 .endm
 
 .macro do_rgb_to_yuv
@@ -1742,52 +1746,52 @@
 .endm
 
 .macro do_rgb_to_yuv_stage2_store_load_stage1
-      vrshrn.u32  d20, q7,  #16
-      vrshrn.u32  d21, q8,  #16
-      vshrn.u32   d22, q9,  #16
-    vrev64.32   q9,  q1
-      vshrn.u32   d23, q13, #16
-    vrev64.32   q13, q1
-      vshrn.u32   d24, q14, #16
-      vshrn.u32   d25, q15, #16
-    do_load     \bpp, 8
-      vmovn.u16   d20, q10      /* d20 = y */
-    vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
-      vmovn.u16   d21, q11      /* d21 = u */
-    vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
-      vmovn.u16   d22, q12      /* d22 = v */
-    vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
-    vmull.u16   q7, d4, d0[0]
-    vmlal.u16   q7, d6, d0[1]
-    vmlal.u16   q7, d8, d0[2]
-      vst1.8      {d20}, [Y]!
-    vmull.u16   q8, d5, d0[0]
-    vmlal.u16   q8, d7, d0[1]
-    vmlal.u16   q8, d9, d0[2]
-    vmlsl.u16   q9,  d4, d0[3]
-    vmlsl.u16   q9,  d6, d1[0]
-    vmlal.u16   q9,  d8, d1[1]
-      vst1.8      {d21}, [U]!
-    vmlsl.u16   q13, d5, d0[3]
-    vmlsl.u16   q13, d7, d1[0]
-    vmlal.u16   q13, d9, d1[1]
-    vrev64.32   q14, q1
-    vrev64.32   q15, q1
-    vmlal.u16   q14, d4, d1[1]
-    vmlsl.u16   q14, d6, d1[2]
-    vmlsl.u16   q14, d8, d1[3]
-      vst1.8      {d22}, [V]!
-    vmlal.u16   q15, d5, d1[1]
-    vmlsl.u16   q15, d7, d1[2]
-    vmlsl.u16   q15, d9, d1[3]
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
 .endm
 
 .balign 16
 jsimd_\colorid\()_ycc_neon_consts:
-    .short          19595, 38470, 7471,  11059
-    .short          21709, 32768, 27439, 5329
-    .short          32767, 128,   32767, 128
-    .short          32767, 128,   32767, 128
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
 
 asm_function jsimd_\colorid\()_ycc_convert_neon
     OUTPUT_WIDTH    .req r0
@@ -1997,10 +2001,10 @@
 
 .balign 16
 jsimd_fdct_ifast_neon_consts:
-    .short (98 * 128)              /* XFIX_0_382683433 */
-    .short (139 * 128)             /* XFIX_0_541196100 */
-    .short (181 * 128)             /* XFIX_0_707106781 */
-    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
 
 asm_function jsimd_fdct_ifast_neon
 
@@ -2037,52 +2041,52 @@
     /* Transpose */
     vtrn.16         q12, q13
     vtrn.16         q10, q11
-    vtrn.16         q8,  q9
+    vtrn.16         q8, q9
     vtrn.16         q14, q15
-    vtrn.32         q9,  q11
+    vtrn.32         q9, q11
     vtrn.32         q13, q15
-    vtrn.32         q8,  q10
+    vtrn.32         q8, q10
     vtrn.32         q12, q14
     vswp            d30, d23
     vswp            d24, d17
     vswp            d26, d19
       /* 1-D FDCT */
-      vadd.s16        q2,  q11, q12
+      vadd.s16        q2, q11, q12
     vswp            d28, d21
       vsub.s16        q12, q11, q12
-      vsub.s16        q6,  q10, q13
+      vsub.s16        q6, q10, q13
       vadd.s16        q10, q10, q13
-      vsub.s16        q7,  q9,  q14
-      vadd.s16        q9,  q9,  q14
-      vsub.s16        q1,  q8,  q15
-      vadd.s16        q8,  q8,  q15
-      vsub.s16        q4,  q9,  q10
-      vsub.s16        q5,  q8,  q2
-      vadd.s16        q3,  q9,  q10
-      vadd.s16        q4,  q4,  q5
-      vadd.s16        q2,  q8,  q2
-      vqdmulh.s16     q4,  q4,  XFIX_0_707106781
+      vsub.s16        q7, q9, q14
+      vadd.s16        q9, q9, q14
+      vsub.s16        q1, q8, q15
+      vadd.s16        q8, q8, q15
+      vsub.s16        q4, q9, q10
+      vsub.s16        q5, q8, q2
+      vadd.s16        q3, q9, q10
+      vadd.s16        q4, q4, q5
+      vadd.s16        q2, q8, q2
+      vqdmulh.s16     q4, q4, XFIX_0_707106781
       vadd.s16        q11, q12, q6
-      vadd.s16        q8,  q2,  q3
-      vsub.s16        q12, q2,  q3
-      vadd.s16        q3,  q6,  q7
-      vadd.s16        q7,  q7,  q1
-      vqdmulh.s16     q3,  q3,  XFIX_0_707106781
-      vsub.s16        q6,  q11, q7
-      vadd.s16        q10, q5,  q4
-      vqdmulh.s16     q6,  q6,  XFIX_0_382683433
-      vsub.s16        q14, q5,  q4
+      vadd.s16        q8, q2, q3
+      vsub.s16        q12, q2, q3
+      vadd.s16        q3, q6, q7
+      vadd.s16        q7, q7, q1
+      vqdmulh.s16     q3, q3, XFIX_0_707106781
+      vsub.s16        q6, q11, q7
+      vadd.s16        q10, q5, q4
+      vqdmulh.s16     q6, q6, XFIX_0_382683433
+      vsub.s16        q14, q5, q4
       vqdmulh.s16     q11, q11, XFIX_0_541196100
-      vqdmulh.s16     q5,  q7,  XFIX_1_306562965
-      vadd.s16        q4,  q1,  q3
-      vsub.s16        q3,  q1,  q3
-      vadd.s16        q7,  q7,  q6
+      vqdmulh.s16     q5, q7, XFIX_1_306562965
+      vadd.s16        q4, q1, q3
+      vsub.s16        q3, q1, q3
+      vadd.s16        q7, q7, q6
       vadd.s16        q11, q11, q6
-      vadd.s16        q7,  q7,  q5
-      vadd.s16        q13, q3,  q11
-      vsub.s16        q11, q3,  q11
-      vadd.s16        q9,  q4,  q7
-      vsub.s16        q15, q4,  q7
+      vadd.s16        q7, q7, q5
+      vadd.s16        q13, q3, q11
+      vsub.s16        q11, q3, q11
+      vadd.s16        q9, q4, q7
+      vsub.s16        q15, q4, q7
     subs            TMP, TMP, #1
     bne             1b
 
@@ -2103,8 +2107,8 @@
 
 /*
  * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
- *                      DCTELEM * workspace);
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
+ *                      DCTELEM *workspace);
  *
  * Note: the code uses 2 stage pipelining in order to improve instructions
  *       scheduling and eliminate stalls (this provides ~15% better
@@ -2131,22 +2135,22 @@
     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
     vabs.s16        q13, q1
     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10 /* add correction */
+    vadd.u16        q12, q12, q10  /* add correction */
     vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
     vmull.u16       q11, d25, d17
-    vmull.u16       q8,  d26, d18
-    vmull.u16       q9,  d27, d19
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
     vshrn.u32       d20, q10, #16
     vshrn.u32       d21, q11, #16
-    vshrn.u32       d22, q8,  #16
-    vshrn.u32       d23, q9,  #16
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
     vneg.s16        q12, q12
     vneg.s16        q13, q13
-    vshr.s16        q2,  q0,  #15 /* extract sign */
-    vshr.s16        q3,  q1,  #15
-    vshl.u16        q14, q10, q12 /* shift */
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
     vshl.u16        q15, q11, q13
 
     push            {r4, r5}
@@ -2159,25 +2163,25 @@
     vabs.s16        q13, q1
       veor.u16        q15, q15, q3
     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10 /* add correction */
+    vadd.u16        q12, q12, q10  /* add correction */
     vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
+    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
     vmull.u16       q11, d25, d17
-    vmull.u16       q8,  d26, d18
-    vmull.u16       q9,  d27, d19
+    vmull.u16       q8, d26, d18
+    vmull.u16       q9, d27, d19
       vsub.u16        q14, q14, q2
     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
       vsub.u16        q15, q15, q3
     vshrn.u32       d20, q10, #16
     vshrn.u32       d21, q11, #16
       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-    vshrn.u32       d22, q8,  #16
-    vshrn.u32       d23, q9,  #16
+    vshrn.u32       d22, q8, #16
+    vshrn.u32       d23, q9, #16
     vneg.s16        q12, q12
     vneg.s16        q13, q13
-    vshr.s16        q2,  q0,  #15 /* extract sign */
-    vshr.s16        q3,  q1,  #15
-    vshl.u16        q14, q10, q12 /* shift */
+    vshr.s16        q2, q0, #15    /* extract sign */
+    vshr.s16        q3, q1, #15
+    vshl.u16        q14, q10, q12  /* shift */
     vshl.u16        q15, q11, q13
     subs            LOOP_COUNT, LOOP_COUNT, #1
     bne             1b
@@ -2189,7 +2193,7 @@
       vsub.u16        q15, q15, q3
       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
 
-    bx              lr /* return */
+    bx              lr  /* return */
 
     .unreq          COEF_BLOCK
     .unreq          DIVISORS
@@ -2204,10 +2208,10 @@
 
 /*
  * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
- *                                 JDIMENSION   downsampled_width,
- *                                 JSAMPARRAY   input_data,
- *                                 JSAMPARRAY * output_data_ptr);
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ *                                 JDIMENSION downsampled_width,
+ *                                 JSAMPARRAY input_data,
+ *                                 JSAMPARRAY *output_data_ptr);
  *
  * Note: the use of unaligned writes is the main remaining bottleneck in
  *       this code, which can be potentially solved to get up to tens
@@ -2221,22 +2225,22 @@
  * Register d28 is used for multiplication by 3. Register q15 is used
  * for adding +1 bias.
  */
-.macro upsample16   OUTPTR, INPTR
+.macro upsample16 OUTPTR, INPTR
     vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-    vmov            q1,  q0       /* backup source pixels to q1 */
-    vrshrn.u16      d6,  q8,  #2
-    vrshrn.u16      d7,  q9,  #2
-    vshrn.u16       d8,  q10, #2
-    vshrn.u16       d9,  q11, #2
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vmov            q1, q0        /* backup source pixels to q1 */
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
 .endm
 
@@ -2247,39 +2251,39 @@
  * Also this unrolling allows to reorder loads and stores to compensate
  * multiplication latency and reduce stalls.
  */
-.macro upsample32   OUTPTR, INPTR
+.macro upsample32 OUTPTR, INPTR
     /* even 16 pixels group */
     vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-        /* odd 16 pixels group */
-        vld1.8          {q1}, [\INPTR]!
-    vrshrn.u16      d6,  q8,  #2
-    vrshrn.u16      d7,  q9,  #2
-    vshrn.u16       d8,  q10, #2
-    vshrn.u16       d9,  q11, #2
-        vmovl.u8        q8,  d2
-        vext.8          q2,  q0,  q1, #15
-        vmovl.u8        q9,  d3
-        vaddw.u8        q10, q15, d4
-        vaddw.u8        q11, q15, d5
-        vmlal.u8        q8,  d4,  d28
-        vmlal.u8        q9,  d5,  d28
-        vmlal.u8        q10, d2,  d28
-        vmlal.u8        q11, d3,  d28
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+      /* odd 16 pixels group */
+      vld1.8          {q1}, [\INPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+      vmovl.u8        q8, d2
+      vext.8          q2, q0, q1, #15
+      vmovl.u8        q9, d3
+      vaddw.u8        q10, q15, d4
+      vaddw.u8        q11, q15, d5
+      vmlal.u8        q8, d4, d28
+      vmlal.u8        q9, d5, d28
+      vmlal.u8        q10, d2, d28
+      vmlal.u8        q11, d3, d28
     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-        vrshrn.u16      d6,  q8,  #2
-        vrshrn.u16      d7,  q9,  #2
-        vshrn.u16       d8,  q10, #2
-        vshrn.u16       d9,  q11, #2
-        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+      vrshrn.u16      d6, q8, #2
+      vrshrn.u16      d7, q9, #2
+      vshrn.u16       d8, q10, #2
+      vshrn.u16       d9, q11, #2
+      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
 .endm
 
 /*
@@ -2340,21 +2344,21 @@
 2:
     tst             \WIDTH, #8
     beq             2f
-    vmov            d1,  d0
+    vmov            d1, d0
     sub             \INPTR, \INPTR, #8
     vld1.8          {d0}, [\INPTR]
 2:  /* upsample the remaining pixels */
-    vmovl.u8        q8,  d0
-    vext.8          q2,  q1,  q0, #15
-    vmovl.u8        q9,  d1
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
     vaddw.u8        q10, q15, d4
     vaddw.u8        q11, q15, d5
-    vmlal.u8        q8,  d4,  d28
-    vmlal.u8        q9,  d5,  d28
-    vmlal.u8        q10, d0,  d28
-    vmlal.u8        q11, d1,  d28
-    vrshrn.u16      d10, q8,  #2
-    vrshrn.u16      d12, q9,  #2
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vrshrn.u16      d10, q8, #2
+    vrshrn.u16      d12, q9, #2
     vshrn.u16       d11, q10, #2
     vshrn.u16       d13, q11, #2
     vzip.8          d10, d11
@@ -2363,12 +2367,12 @@
     tst             \WIDTH, #8
     beq             2f
     vst1.8          {d10, d11}, [\OUTPTR]!
-    vmov            q5,  q6
+    vmov            q5, q6
 2:
     tst             \WIDTH, #4
     beq             2f
     vst1.8          {d10}, [\OUTPTR]!
-    vmov            d10,  d11
+    vmov            d10, d11
 2:
     tst             \WIDTH, #2
     beq             2f
@@ -2432,7 +2436,443 @@
     .unreq          WIDTH
     .unreq          TMP
 
-
 .purgem upsample16
 .purgem upsample32
 .purgem upsample_row
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block_neon (working_state *state, JOCTET *buffer,
+ *                                   JCOEFPTR block, int last_dc_val,
+ *                                   c_derived_tbl *dctbl,
+ *                                   c_derived_tbl *actbl)
+ */
+
+.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+    sub             \PUT_BITS, \PUT_BITS, #0x8    /* PUT_BITS -= 8 */
+    lsr             \TMP, \PUT_BUFFER, \PUT_BITS  /* shift next byte down */
+    uxtb            \TMP, \TMP                    /* keep low 8 bits */
+    strb            \TMP, [\BUFFER, #1]!          /* store at ++BUFFER */
+    cmp             \TMP, #0xff
+    /*it eq*/
+    strbeq          \ZERO, [\BUFFER, #1]!         /* if 0xff, ZERO at ++BUFFER */
+.endm
+
+.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
+    /* PUT_BITS += SIZE */
+    add             \PUT_BITS, \SIZE
+    /* PUT_BUFFER = (PUT_BUFFER << SIZE) | CODE, fused into a single orr */
+    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+.endm
+
+.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+  cmp               \PUT_BITS, #0x10     /* PUT_BITS < 16 (signed)? */
+  blt               15f                  /* yes: skip the two emit_byte */
+    eor               \ZERO, \ZERO, \ZERO  /* ZERO = 0 */
+    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+15:
+.endm
+
+.balign 16
+jsimd_huff_encode_one_block_neon_consts:
+  .byte 0x01  /* byte i = 1 << i; the 8 bytes are loaded into d26 below */
+  .byte 0x02
+  .byte 0x04
+  .byte 0x08
+  .byte 0x10
+  .byte 0x20
+  .byte 0x40
+  .byte 0x80
+
+asm_function jsimd_huff_encode_one_block_neon
+    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+    add             r7, sp, #0x1c
+    sub             r4, sp, #0x40
+    bfc             r4, #0, #5
+    mov             sp, r4           /* align sp on 32 bytes */
+    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
+    vst1.64         {d12, d13, d14, d15}, [r4, :128]
+    sub             sp, #0x140       /* reserve 320 bytes */
+    str             r0, [sp, #0x18]  /* working state > sp + 0x18 */
+    add             r4, sp, #0x20    /* r4 = t1 */
+    ldr             lr, [r7, #0x8]   /* lr = dctbl */
+    sub             r10, r1, #0x1    /* r10 = buffer - 1 */
+    ldrsh           r1, [r2]
+    mov             r9, #0x10
+    mov             r8, #0x1
+    adr             r5, jsimd_huff_encode_one_block_neon_consts
+    /* prepare data */
+    vld1.8          {d26}, [r5, :64]
+    veor            q8, q8, q8
+    veor            q9, q9, q9
+    vdup.16         q14, r9
+    vdup.16         q15, r8
+    veor            q10, q10, q10
+    veor            q11, q11, q11
+    sub             r1, r1, r3
+    add             r9, r2, #0x22
+    add             r8, r2, #0x18
+    add             r3, r2, #0x36
+    vmov.16         d0[0], r1
+    vld1.16         {d2[0]}, [r9, :16]
+    vld1.16         {d4[0]}, [r8, :16]
+    vld1.16         {d6[0]}, [r3, :16]
+    add             r1, r2, #0x2
+    add             r9, r2, #0x30
+    add             r8, r2, #0x26
+    add             r3, r2, #0x28
+    vld1.16         {d0[1]}, [r1, :16]
+    vld1.16         {d2[1]}, [r9, :16]
+    vld1.16         {d4[1]}, [r8, :16]
+    vld1.16         {d6[1]}, [r3, :16]
+    add             r1, r2, #0x10
+    add             r9, r2, #0x40
+    add             r8, r2, #0x34
+    add             r3, r2, #0x1a
+    vld1.16         {d0[2]}, [r1, :16]
+    vld1.16         {d2[2]}, [r9, :16]
+    vld1.16         {d4[2]}, [r8, :16]
+    vld1.16         {d6[2]}, [r3, :16]
+    add             r1, r2, #0x20
+    add             r9, r2, #0x32
+    add             r8, r2, #0x42
+    add             r3, r2, #0xc
+    vld1.16         {d0[3]}, [r1, :16]
+    vld1.16         {d2[3]}, [r9, :16]
+    vld1.16         {d4[3]}, [r8, :16]
+    vld1.16         {d6[3]}, [r3, :16]
+    add             r1, r2, #0x12
+    add             r9, r2, #0x24
+    add             r8, r2, #0x50
+    add             r3, r2, #0xe
+    vld1.16         {d1[0]}, [r1, :16]
+    vld1.16         {d3[0]}, [r9, :16]
+    vld1.16         {d5[0]}, [r8, :16]
+    vld1.16         {d7[0]}, [r3, :16]
+    add             r1, r2, #0x4
+    add             r9, r2, #0x16
+    add             r8, r2, #0x60
+    add             r3, r2, #0x1c
+    vld1.16         {d1[1]}, [r1, :16]
+    vld1.16         {d3[1]}, [r9, :16]
+    vld1.16         {d5[1]}, [r8, :16]
+    vld1.16         {d7[1]}, [r3, :16]
+    add             r1, r2, #0x6
+    add             r9, r2, #0x8
+    add             r8, r2, #0x52
+    add             r3, r2, #0x2a
+    vld1.16         {d1[2]}, [r1, :16]
+    vld1.16         {d3[2]}, [r9, :16]
+    vld1.16         {d5[2]}, [r8, :16]
+    vld1.16         {d7[2]}, [r3, :16]
+    add             r1, r2, #0x14
+    add             r9, r2, #0xa
+    add             r8, r2, #0x44
+    add             r3, r2, #0x38
+    vld1.16         {d1[3]}, [r1, :16]
+    vld1.16         {d3[3]}, [r9, :16]
+    vld1.16         {d5[3]}, [r8, :16]
+    vld1.16         {d7[3]}, [r3, :16]
+    vcgt.s16        q8, q8, q0
+    vcgt.s16        q9, q9, q1
+    vcgt.s16        q10, q10, q2
+    vcgt.s16        q11, q11, q3
+    vabs.s16        q0, q0
+    vabs.s16        q1, q1
+    vabs.s16        q2, q2
+    vabs.s16        q3, q3
+    veor            q8, q8, q0
+    veor            q9, q9, q1
+    veor            q10, q10, q2
+    veor            q11, q11, q3
+    add             r9, r4, #0x20
+    add             r8, r4, #0x80
+    add             r3, r4, #0xa0
+    vclz.i16        q0, q0
+    vclz.i16        q1, q1
+    vclz.i16        q2, q2
+    vclz.i16        q3, q3
+    vsub.i16        q0, q14, q0
+    vsub.i16        q1, q14, q1
+    vsub.i16        q2, q14, q2
+    vsub.i16        q3, q14, q3
+    vst1.16         {d0, d1, d2, d3}, [r4, :256]
+    vst1.16         {d4, d5, d6, d7}, [r9, :256]
+    vshl.s16        q0, q15, q0
+    vshl.s16        q1, q15, q1
+    vshl.s16        q2, q15, q2
+    vshl.s16        q3, q15, q3
+    vsub.i16        q0, q0, q15
+    vsub.i16        q1, q1, q15
+    vsub.i16        q2, q2, q15
+    vsub.i16        q3, q3, q15
+    vand            q8, q8, q0
+    vand            q9, q9, q1
+    vand            q10, q10, q2
+    vand            q11, q11, q3
+    vst1.16         {d16, d17, d18, d19}, [r8, :256]
+    vst1.16         {d20, d21, d22, d23}, [r3, :256]
+    add             r1, r2, #0x46
+    add             r9, r2, #0x3a
+    add             r8, r2, #0x74
+    add             r3, r2, #0x6a
+    vld1.16         {d8[0]}, [r1, :16]
+    vld1.16         {d10[0]}, [r9, :16]
+    vld1.16         {d12[0]}, [r8, :16]
+    vld1.16         {d14[0]}, [r3, :16]
+    veor            q8, q8, q8
+    veor            q9, q9, q9
+    veor            q10, q10, q10
+    veor            q11, q11, q11
+    add             r1, r2, #0x54
+    add             r9, r2, #0x2c
+    add             r8, r2, #0x76
+    add             r3, r2, #0x78
+    vld1.16         {d8[1]}, [r1, :16]
+    vld1.16         {d10[1]}, [r9, :16]
+    vld1.16         {d12[1]}, [r8, :16]
+    vld1.16         {d14[1]}, [r3, :16]
+    add             r1, r2, #0x62
+    add             r9, r2, #0x1e
+    add             r8, r2, #0x68
+    add             r3, r2, #0x7a
+    vld1.16         {d8[2]}, [r1, :16]
+    vld1.16         {d10[2]}, [r9, :16]
+    vld1.16         {d12[2]}, [r8, :16]
+    vld1.16         {d14[2]}, [r3, :16]
+    add             r1, r2, #0x70
+    add             r9, r2, #0x2e
+    add             r8, r2, #0x5a
+    add             r3, r2, #0x6c
+    vld1.16         {d8[3]}, [r1, :16]
+    vld1.16         {d10[3]}, [r9, :16]
+    vld1.16         {d12[3]}, [r8, :16]
+    vld1.16         {d14[3]}, [r3, :16]
+    add             r1, r2, #0x72
+    add             r9, r2, #0x3c
+    add             r8, r2, #0x4c
+    add             r3, r2, #0x5e
+    vld1.16         {d9[0]}, [r1, :16]
+    vld1.16         {d11[0]}, [r9, :16]
+    vld1.16         {d13[0]}, [r8, :16]
+    vld1.16         {d15[0]}, [r3, :16]
+    add             r1, r2, #0x64
+    add             r9, r2, #0x4a
+    add             r8, r2, #0x3e
+    add             r3, r2, #0x6e
+    vld1.16         {d9[1]}, [r1, :16]
+    vld1.16         {d11[1]}, [r9, :16]
+    vld1.16         {d13[1]}, [r8, :16]
+    vld1.16         {d15[1]}, [r3, :16]
+    add             r1, r2, #0x56
+    add             r9, r2, #0x58
+    add             r8, r2, #0x4e
+    add             r3, r2, #0x7c
+    vld1.16         {d9[2]}, [r1, :16]
+    vld1.16         {d11[2]}, [r9, :16]
+    vld1.16         {d13[2]}, [r8, :16]
+    vld1.16         {d15[2]}, [r3, :16]
+    add             r1, r2, #0x48
+    add             r9, r2, #0x66
+    add             r8, r2, #0x5c
+    add             r3, r2, #0x7e
+    vld1.16         {d9[3]}, [r1, :16]
+    vld1.16         {d11[3]}, [r9, :16]
+    vld1.16         {d13[3]}, [r8, :16]
+    vld1.16         {d15[3]}, [r3, :16]
+    vcgt.s16        q8, q8, q4
+    vcgt.s16        q9, q9, q5
+    vcgt.s16        q10, q10, q6
+    vcgt.s16        q11, q11, q7
+    vabs.s16        q4, q4
+    vabs.s16        q5, q5
+    vabs.s16        q6, q6
+    vabs.s16        q7, q7
+    veor            q8, q8, q4
+    veor            q9, q9, q5
+    veor            q10, q10, q6
+    veor            q11, q11, q7
+    add             r1, r4, #0x40
+    add             r9, r4, #0x60
+    add             r8, r4, #0xc0
+    add             r3, r4, #0xe0
+    vclz.i16        q4, q4
+    vclz.i16        q5, q5
+    vclz.i16        q6, q6
+    vclz.i16        q7, q7
+    vsub.i16        q4, q14, q4
+    vsub.i16        q5, q14, q5
+    vsub.i16        q6, q14, q6
+    vsub.i16        q7, q14, q7
+    vst1.16         {d8, d9, d10, d11}, [r1, :256]
+    vst1.16         {d12, d13, d14, d15}, [r9, :256]
+    vshl.s16        q4, q15, q4
+    vshl.s16        q5, q15, q5
+    vshl.s16        q6, q15, q6
+    vshl.s16        q7, q15, q7
+    vsub.i16        q4, q4, q15
+    vsub.i16        q5, q5, q15
+    vsub.i16        q6, q6, q15
+    vsub.i16        q7, q7, q15
+    vand            q8, q8, q4
+    vand            q9, q9, q5
+    vand            q10, q10, q6
+    vand            q11, q11, q7
+    vst1.16         {d16, d17, d18, d19}, [r8, :256]
+    vst1.16         {d20, d21, d22, d23}, [r3, :256]
+    ldr             r12, [r7, #0xc]       /* r12 = actbl */
+    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
+    mov             r9, r12               /* r9 = actbl */
+    add             r6, r4, #0x80         /* r6 = t2 */
+    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
+    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
+    ldrh            r2, [r6, #-128]       /* r2  = nbits */
+    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
+    ldr             r0, [lr, r2, lsl #2]
+    ldrb            r5, [r1, r2]
+    put_bits        r11, r4, r0, r5
+    checkbuf15      r10, r11, r4, r5, r0
+    put_bits        r11, r4, r3, r2
+    checkbuf15      r10, r11, r4, r5, r0
+    mov             lr, r6                /* lr = t2 */
+    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
+    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
+    veor            q8, q8, q8
+    vceq.i16        q0, q0, q8
+    vceq.i16        q1, q1, q8
+    vceq.i16        q2, q2, q8
+    vceq.i16        q3, q3, q8
+    vceq.i16        q4, q4, q8
+    vceq.i16        q5, q5, q8
+    vceq.i16        q6, q6, q8
+    vceq.i16        q7, q7, q8
+    vmovn.i16       d0, q0
+    vmovn.i16       d2, q1
+    vmovn.i16       d4, q2
+    vmovn.i16       d6, q3
+    vmovn.i16       d8, q4
+    vmovn.i16       d10, q5
+    vmovn.i16       d12, q6
+    vmovn.i16       d14, q7
+    vand            d0, d0, d26
+    vand            d2, d2, d26
+    vand            d4, d4, d26
+    vand            d6, d6, d26
+    vand            d8, d8, d26
+    vand            d10, d10, d26
+    vand            d12, d12, d26
+    vand            d14, d14, d26
+    vpadd.i8        d0, d0, d2
+    vpadd.i8        d4, d4, d6
+    vpadd.i8        d8, d8, d10
+    vpadd.i8        d12, d12, d14
+    vpadd.i8        d0, d0, d4
+    vpadd.i8        d8, d8, d12
+    vpadd.i8        d0, d0, d8
+    vmov.32         r1, d0[1]
+    vmov.32         r8, d0[0]
+    mvn             r1, r1
+    mvn             r8, r8
+    lsrs            r1, r1, #0x1
+    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
+    rbit            r1, r1            /* r1 = index1 */
+    rbit            r8, r8            /* r8 = index0 */
+    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
+    str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
+    cmp             r8, #0x0
+    beq             6f
+1:
+    clz             r2, r8
+    add             lr, lr, r2, lsl #1
+    lsl             r8, r8, r2
+    ldrh            r1, [lr, #-126]
+2:
+    cmp             r2, #0x10
+    blt             3f
+    sub             r2, r2, #0x10
+    put_bits        r11, r4, r0, r6
+    cmp             r4, #0x10
+    blt             2b
+    eor             r3, r3, r3
+    emit_byte       r10, r11, r4, r3, r12
+    emit_byte       r10, r11, r4, r3, r12
+    b               2b
+3:
+    add             r2, r1, r2, lsl #4
+    ldrh            r3, [lr, #2]!
+    ldr             r12, [r9, r2, lsl #2]
+    ldrb            r2, [r5, r2]
+    put_bits        r11, r4, r12, r2
+    checkbuf15      r10, r11, r4, r2, r12
+    put_bits        r11, r4, r3, r1
+    checkbuf15      r10, r11, r4, r2, r12
+    lsls            r8, r8, #0x1
+    bne             1b
+6:
+    add             r12, sp, #0x20   /* r12 = t1 */
+    ldr             r8, [sp, #0x14]  /* r8 = index1 */
+    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
+    cmp             r8, #0x0
+    beq             6f
+    clz             r2, r8
+    sub             r12, r12, lr
+    lsl             r8, r8, r2
+    add             r2, r2, r12, lsr #1
+    add             lr, lr, r2, lsl #1
+    b               7f
+1:
+    clz             r2, r8
+    add             lr, lr, r2, lsl #1
+    lsl             r8, r8, r2
+7:
+    ldrh            r1, [lr, #-126]
+2:
+    cmp             r2, #0x10
+    blt             3f
+    sub             r2, r2, #0x10
+    put_bits        r11, r4, r0, r6
+    cmp             r4, #0x10
+    blt             2b
+    eor             r3, r3, r3
+    emit_byte       r10, r11, r4, r3, r12
+    emit_byte       r10, r11, r4, r3, r12
+    b               2b
+3:
+    add             r2, r1, r2, lsl #4
+    ldrh            r3, [lr, #2]!
+    ldr             r12, [r9, r2, lsl #2]
+    ldrb            r2, [r5, r2]
+    put_bits        r11, r4, r12, r2
+    checkbuf15      r10, r11, r4, r2, r12
+    put_bits        r11, r4, r3, r1
+    checkbuf15      r10, r11, r4, r2, r12
+    lsls            r8, r8, #0x1
+    bne             1b
+6:
+    add             r0, sp, #0x20
+    add             r0, #0xfe
+    cmp             lr, r0
+    bhs             1f
+    ldr             r1, [r9]
+    ldrb            r0, [r5]
+    put_bits        r11, r4, r1, r0
+    checkbuf15      r10, r11, r4, r0, r1
+1:
+    ldr             r12, [sp, #0x18]
+    str             r11, [r12, #0x8]
+    str             r4, [r12, #0xc]
+    add             r0, r10, #0x1
+    add             r4, sp, #0x140
+    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
+    vld1.64         {d12, d13, d14, d15}, [r4, :128]
+    sub             r4, r7, #0x1c
+    mov             sp, r4
+    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf15
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index 47829e9..aef1ad4 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -2,8 +2,9 @@
  * jsimd_i386.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
- * 
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
+ *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
@@ -30,6 +31,7 @@
 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 
 static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
 
 /*
  * Check what SIMD accelerations are supported.
@@ -59,9 +61,14 @@
   env = getenv("JSIMD_FORCESSE2");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support &= JSIMD_SSE2;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
 }
 
-#ifndef JPEG_DECODE_ONLY
 GLOBAL(int)
 jsimd_can_rgb_ycc (void)
 {
@@ -83,7 +90,6 @@
 
   return 0;
 }
-#endif
 
 GLOBAL(int)
 jsimd_can_rgb_gray (void)
@@ -135,7 +141,6 @@
   return 0;
 }
 
-#ifndef JPEG_DECODE_ONLY
 GLOBAL(void)
 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -144,8 +149,7 @@
   void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
-  switch(cinfo->in_color_space)
-  {
+  switch(cinfo->in_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_extrgb_ycc_convert_sse2;
       mmxfct=jsimd_extrgb_ycc_convert_mmx;
@@ -182,19 +186,9 @@
 
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    sse2fct(cinfo->image_width, input_buf,
-        output_buf, output_row, num_rows);
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
   else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->image_width, input_buf,
-        output_buf, output_row, num_rows);
-}
-#endif
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -205,8 +199,7 @@
   void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
-  switch(cinfo->in_color_space)
-  {
+  switch(cinfo->in_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_extrgb_gray_convert_sse2;
       mmxfct=jsimd_extrgb_gray_convert_mmx;
@@ -243,11 +236,9 @@
 
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    sse2fct(cinfo->image_width, input_buf,
-        output_buf, output_row, num_rows);
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
   else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->image_width, input_buf,
-        output_buf, output_row, num_rows);
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -258,8 +249,7 @@
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
   void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
-  switch(cinfo->out_color_space)
-  {
+  switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_ycc_extrgb_convert_sse2;
       mmxfct=jsimd_ycc_extrgb_convert_mmx;
@@ -296,14 +286,18 @@
 
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    sse2fct(cinfo->output_width, input_buf,
-        input_row, output_buf, num_rows);
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
   else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf,
-        input_row, output_buf, num_rows);
+    mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
-#ifndef JPEG_DECODE_ONLY
+GLOBAL(void)  /* No-op stub: the body is empty, so nothing is converted. */
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
+                          JSAMPARRAY output_buf, int num_rows)
+{
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_downsample (void)
 {
@@ -343,33 +337,34 @@
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-        compptr->v_samp_factor, compptr->width_in_blocks,
-        input_data, output_data);
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
-        compptr->v_samp_factor, compptr->width_in_blocks,
-        input_data, output_data);
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-        compptr->v_samp_factor, compptr->width_in_blocks,
-        input_data, output_data);
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
-        compptr->v_samp_factor, compptr->width_in_blocks,
-        input_data, output_data);
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
 }
-#endif
 
 GLOBAL(int)
 jsimd_can_h2v2_upsample (void)
@@ -411,30 +406,30 @@
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_SSE2)
-    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
-        cinfo->output_width, input_data, output_data_ptr);
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
   else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
-        cinfo->output_width, input_data, output_data_ptr);
+    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
   if (simd_support & JSIMD_SSE2)
-    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
-        cinfo->output_width, input_data, output_data_ptr);
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
   else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
-        cinfo->output_width, input_data, output_data_ptr);
+    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
 }
 
 GLOBAL(int)
@@ -479,32 +474,36 @@
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
     jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-        compptr->downsampled_width, input_data, output_data_ptr);
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
-        compptr->downsampled_width, input_data, output_data_ptr);
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
     jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-        compptr->downsampled_width, input_data, output_data_ptr);
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
-        compptr->downsampled_width, input_data, output_data_ptr);
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
 }
 
 GLOBAL(int)
@@ -556,8 +555,7 @@
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
-  switch(cinfo->out_color_space)
-  {
+  switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
       mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
@@ -594,11 +592,9 @@
 
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    sse2fct(cinfo->output_width, input_buf,
-        in_row_group_ctr, output_buf);
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
   else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf,
-        in_row_group_ctr, output_buf);
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
@@ -610,8 +606,7 @@
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
-  switch(cinfo->out_color_space)
-  {
+  switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
       mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
@@ -648,14 +643,11 @@
 
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    sse2fct(cinfo->output_width, input_buf,
-        in_row_group_ctr, output_buf);
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
   else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf,
-        in_row_group_ctr, output_buf);
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
-#ifndef JPEG_DECODE_ONLY
 GLOBAL(int)
 jsimd_can_convsamp (void)
 {
@@ -706,7 +698,7 @@
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_sse2(sample_data, start_col, workspace);
@@ -716,7 +708,7 @@
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
@@ -784,7 +776,7 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_islow_sse2(data);
@@ -793,7 +785,7 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_ifast_sse2(data);
@@ -802,7 +794,7 @@
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
     jsimd_fdct_float_sse(data);
@@ -855,8 +847,8 @@
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_sse2(coef_block, divisors, workspace);
@@ -865,8 +857,8 @@
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_float_sse2(coef_block, divisors, workspace);
@@ -875,7 +867,6 @@
   else if (simd_support & JSIMD_3DNOW)
     jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
 }
-#endif
 
 GLOBAL(int)
 jsimd_can_idct_2x2 (void)
@@ -928,23 +919,25 @@
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
   else if (simd_support & JSIMD_MMX)
     jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
   else if (simd_support & JSIMD_MMX)
     jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 }
@@ -1030,39 +1023,69 @@
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
   else if (simd_support & JSIMD_MMX)
-    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
   else if (simd_support & JSIMD_MMX)
-    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    jsimd_idct_float_sse2(compptr->dct_table, coef_block,
-        output_buf, output_col);
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
   else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
-    jsimd_idct_float_sse(compptr->dct_table, coef_block,
-        output_buf, output_col);
+    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+                         output_col);
   else if (simd_support & JSIMD_3DNOW)
-    jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
-        output_buf, output_col);
+    jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(int)  /* Returns 1 if the SSE2 Huffman encoder may be used, else 0. */
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)  /* only DCTSIZE == 8 is accepted */
+    return 0;
+  if (sizeof(JCOEF) != 2)  /* only 2-byte coefficients are accepted */
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;  /* simd_huffman is 0 when JSIMD_NOHUFFENC=1 is set */
+
+  return 0;
+}
+
+GLOBAL(JOCTET*)  /* Forwards all arguments to the SSE2 implementation. */
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{  /* no simd_support test here; see jsimd_can_huff_encode_one_block() */
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
 }
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
new file mode 100644
index 0000000..bdd9912
--- /dev/null
+++ b/simd/jsimd_mips.c
@@ -0,0 +1,1130 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011, 2014 D. R. Commander
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California
+ * Copyright 2015 Matthieu Darbois
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+LOCAL(int)  /* Returns 1 if a /proc/cpuinfo line contains search_string. */
+parse_proc_cpuinfo(const char* search_string)
+{
+  const char* file_name = "/proc/cpuinfo";
+  char cpuinfo_line[256];
+  FILE* f = NULL;
+  simd_support = 0;  /* side effect: clears the global before scanning */
+
+  if ((f = fopen(file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
+        fclose(f);
+        simd_support |= JSIMD_MIPS_DSPR2;  /* a match enables DSPr2 */
+        return 1;
+      }
+    }
+    fclose(f);
+  }
+  /* No line matched, or /proc/cpuinfo could not be opened. */
+  return 0;
+}
+
+#endif
+
+/*
+ * Detect SIMD support once; ~0U in simd_support means "not yet probed".
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  if (simd_support != ~0U)  /* already initialised by an earlier call */
+    return;
+
+  simd_support = 0;
+
+#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_MIPS_DSPR2;
+#elif defined(__linux__)
+  /* The compile-time test above failed (e.g. no -mdspr2 passed to gcc), so
+   * fall back to runtime detection on Linux: parse_proc_cpuinfo() sets
+   * JSIMD_MIPS_DSPR2 itself if /proc/cpuinfo mentions "MIPS 74K". */
+  if (!parse_proc_cpuinfo("MIPS 74K"))
+    return;
+#endif
+}
+
+static const int mips_idct_ifast_coefs[4] = {  /* halfword repeated twice */
+  0x45404540,           // FIX( 1.082392200 / 2) =  17734 = 0x4546
+  0x5A805A80,           // FIX( 1.414213562 / 2) =  23170 = 0x5A82
+  0x76407640,           // FIX( 1.847759065 / 2) =  30274 = 0x7642
+  0xAC60AC60            // FIX(-2.613125930 / 4) = -21407 = 0xAC61
+};  /* NOTE(review): low bits differ from the FIX() values quoted above */
+
+/* The following types are borrowed from jdsample.c (see jsimd_int_upsample) */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JSAMPARRAY input_data,
+                               JSAMPARRAY *output_data_ptr);
+
+typedef struct {
+  struct jpeg_upsampler pub;
+  JSAMPARRAY color_buf[MAX_COMPONENTS];
+  upsample1_ptr methods[MAX_COMPONENTS];
+  int next_row_out;
+  JDIMENSION rows_to_go;
+  int rowgroup_height[MAX_COMPONENTS];
+  UINT8 h_expand[MAX_COMPONENTS];  /* indexed by compptr->component_index */
+  UINT8 v_expand[MAX_COMPONENTS];  /* indexed by compptr->component_index */
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;  /* cinfo->upsample is cast to this */
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_rgb_gray (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+  return 0;  /* unconditionally reports "not available" */
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_c_can_null_convert (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Picks a DSPr2 routine from in_color_space and runs it. */
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:  /* same routine as RGBX */
+      mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:  /* same routine as BGRX */
+      mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:  /* same routine as XBGR */
+      mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
+
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:  /* same routine as XRGB */
+      mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
+      break;
+    default:  /* every other color space uses the extrgb routine */
+      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise nothing is converted */
+    mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
+                 num_rows);
+}
+
+GLOBAL(void)  /* Picks a DSPr2 routine from in_color_space and runs it. */
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:  /* same routine as RGBX */
+      mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:  /* same routine as BGRX */
+      mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:  /* same routine as XBGR */
+      mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:  /* same routine as XRGB */
+      mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2;
+      break;
+    default:  /* every other color space uses the extrgb routine */
+      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise nothing is converted */
+    mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
+                 num_rows);
+}
+
+GLOBAL(void)  /* Picks a DSPr2 routine from out_color_space and runs it. */
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:  /* same routine as RGBX */
+      mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:  /* same routine as BGRX */
+      mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:  /* same routine as XBGR */
+      mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:  /* same routine as XRGB */
+      mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
+      break;
+  default:  /* every other color space uses the extrgb routine */
+      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
+      break;
+  }
+
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise nothing is converted */
+    mipsdspr2fct(cinfo->output_width, input_buf, input_row, output_buf,
+                 num_rows);
+}
+
+GLOBAL(void)  /* No-op stub: the body is empty, so nothing is converted. */
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
+                          JSAMPARRAY output_buf, int num_rows)
+{  /* jsimd_can_ycc_rgb565() in this file always returns 0 */
+}
+
+GLOBAL(void)  /* Calls the DSPr2 null-convert routine if DSPr2 is flagged. */
+jsimd_c_null_convert (j_compress_ptr cinfo,
+                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                      JDIMENSION output_row, int num_rows)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise this is a no-op */
+    jsimd_c_null_convert_mips_dspr2(cinfo->image_width, input_buf,
+                                    output_buf, output_row, num_rows,
+                                    cinfo->num_components);
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_h2v2_smooth_downsample (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if(DCTSIZE != 8)  /* extra requirement compared with the plain h2v2 check */
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the DSPr2 h2v2 downsampler if DSPr2 is flagged. */
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise this is a no-op */
+    jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width,
+                                     cinfo->max_v_samp_factor,
+                                     compptr->v_samp_factor,
+                                     compptr->width_in_blocks, input_data,
+                                     output_data);
+}
+
+GLOBAL(void)  /* Forwards to the DSPr2 smoothing h2v2 downsampler. */
+jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* NOTE: no simd_support check here, unlike jsimd_h2v2_downsample() */
+  jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data,
+                                          compptr->v_samp_factor,
+                                          cinfo->max_v_samp_factor,
+                                          cinfo->smoothing_factor,
+                                          compptr->width_in_blocks,
+                                          cinfo->image_width);
+}
+
+GLOBAL(void)  /* Calls the DSPr2 h2v1 downsampler if DSPr2 is flagged. */
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise this is a no-op */
+    jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width,
+                                     cinfo->max_v_samp_factor,
+                                     compptr->v_samp_factor,
+                                     compptr->width_in_blocks,
+                                     input_data, output_data);
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* Returns 1 iff the checks below pass and DSPr2 is flagged. */
+jsimd_can_int_upsample (void)
+{
+  init_simd();
+
+  /* The DSPr2 code is optimised for these compile-time values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)  /* Calls the DSPr2 h2v2 upsampler if DSPr2 is flagged. */
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info *compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise this is a no-op */
+    jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+                                   cinfo->output_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(void)  /* Calls the DSPr2 h2v1 upsampler if DSPr2 is flagged. */
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info *compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_MIPS_DSPR2)  /* otherwise this is a no-op */
+    jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+                                   cinfo->output_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(void)  /* Forwards to the DSPr2 integer upsampler (no flag check). */
+jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  /* The expansion factors are looked up per component in the upsampler. */
+  jsimd_int_upsample_mips_dspr2(upsample->h_expand[compptr->component_index],
+                                upsample->v_expand[compptr->component_index],
+                                input_data, output_data_ptr,
+                                cinfo->output_width,
+                                cinfo->max_v_samp_factor);
+}
+
+/*
+ * Availability check matching jsimd_h2v2_fancy_upsample() by name.  Runs
+ * init_simd(), then answers 1 only if samples are 8 bits wide, JDIMENSION
+ * occupies 4 bytes and simd_support has JSIMD_MIPS_DSPR2 set; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_h2v1_fancy_upsample() by name.  Runs
+ * init_simd(), then answers 1 only if samples are 8 bits wide, JDIMENSION
+ * occupies 4 bytes and simd_support has JSIMD_MIPS_DSPR2 set; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/* Forwards to the DSPr2 h2v2 fancy upsampler; does nothing without DSPr2. */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (!(simd_support & JSIMD_MIPS_DSPR2))
+    return;
+  jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+                                       compptr->downsampled_width,
+                                       input_data, output_data_ptr);
+}
+
+/* Forwards to the DSPr2 h2v1 fancy upsampler; does nothing without DSPr2. */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (!(simd_support & JSIMD_MIPS_DSPR2))
+    return;
+  jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
+                                       compptr->downsampled_width,
+                                       input_data, output_data_ptr);
+}
+
+/*
+ * Availability check matching jsimd_h2v2_merged_upsample() by name: after
+ * init_simd(), returns 1 only if samples are 8 bits, JDIMENSION is 4 bytes
+ * and JSIMD_MIPS_DSPR2 is set in simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();
+
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_h2v1_merged_upsample() by name: after
+ * init_simd(), returns 1 only if samples are 8 bits, JDIMENSION is 4 bytes
+ * and JSIMD_MIPS_DSPR2 is set in simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();
+
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Picks the DSPr2 h2v2 merged-upsample kernel for cinfo->out_color_space
+ * (JCS_EXT_RGB and every unlisted colour space get the extrgb kernel) and
+ * calls it.  simd_support is not re-checked here.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Starting choice; only the cases below replace it. */
+  void (*kernel)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *) =
+    jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      kernel = jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      kernel = jsimd_h2v2_extbgr_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      kernel = jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      kernel = jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      kernel = jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2;
+      break;
+    default:
+      break;
+  }
+
+  kernel(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+         cinfo->sample_range_limit);
+}
+
+/*
+ * Picks the DSPr2 h2v1 merged-upsample kernel for cinfo->out_color_space
+ * (JCS_EXT_RGB and every unlisted colour space get the extrgb kernel) and
+ * calls it.  simd_support is not re-checked here.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* Starting choice; only the cases below replace it. */
+  void (*kernel)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *) =
+    jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      kernel = jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_BGR:
+      kernel = jsimd_h2v1_extbgr_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      kernel = jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      kernel = jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      kernel = jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2;
+      break;
+    default:
+      break;
+  }
+
+  kernel(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+         cinfo->sample_range_limit);
+}
+
+/*
+ * Availability check matching jsimd_convsamp() by name.  Runs init_simd(),
+ * then returns 1 only when DCTSIZE is 8, samples are 8 bits, JDIMENSION is
+ * 4 bytes, DCTELEM is 2 bytes and simd_support has JSIMD_MIPS_DSPR2 set;
+ * returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_convsamp_float() by name.  After
+ * init_simd(), returns 1 only if DCTSIZE == 8, samples are 8 bits, JCOEF
+ * and ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and simd_support
+ * has JSIMD_MIPS_DSPR2 set.  NOTE(review): no FAST_FLOAT size test - confirm.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM *workspace)
+{ /* Forwards all arguments to the DSPr2 routine; no-op without DSPr2. */
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT *workspace)
+{ /* Forwards all arguments to the DSPr2 routine; no-op without DSPr2. */
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
+}
+
+/*
+ * Availability check matching jsimd_fdct_islow() by name.  Runs
+ * init_simd(), then answers 1 only if DCTSIZE is 8, DCTELEM occupies
+ * 2 bytes and simd_support has JSIMD_MIPS_DSPR2 set; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_fdct_ifast() by name.  Runs
+ * init_simd(), then answers 1 only if DCTSIZE is 8, DCTELEM occupies
+ * 2 bytes and simd_support has JSIMD_MIPS_DSPR2 set; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  init_simd();
+  /* Always answers 0; jsimd_fdct_float() in this file has an empty body. */
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM *data)
+{ /* Passes data to the DSPr2 routine; no-op without DSPr2. */
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_fdct_islow_mips_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM *data)
+{ /* Passes data to the DSPr2 routine; no-op without DSPr2. */
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_fdct_ifast_mips_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT *data)
+{ /* Empty body: data is left untouched (jsimd_can_fdct_float() returns 0). */
+}
+
+/*
+ * Availability check matching jsimd_quantize() by name.  Runs init_simd(),
+ * then answers 1 only if DCTSIZE is 8, JCOEF and DCTELEM both occupy
+ * 2 bytes and simd_support has JSIMD_MIPS_DSPR2 set; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_quantize_float() by name.  After
+ * init_simd(), returns 1 only if DCTSIZE == 8, samples are 8 bits, JCOEF
+ * and ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and simd_support
+ * has JSIMD_MIPS_DSPR2 set.  NOTE(review): no FAST_FLOAT size test - confirm.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/* Forwards all three arguments to the DSPr2 routine; no-op without DSPr2. */
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
+{ /* Forwards all arguments to the DSPr2 routine; no-op without DSPr2. */
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
+}
+
+/*
+ * Availability check matching jsimd_idct_2x2() by name.  After init_simd(),
+ * returns 1 only if DCTSIZE == 8, samples are 8 bits, JCOEF and
+ * ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and simd_support has
+ * JSIMD_MIPS_DSPR2 set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_idct_4x4() by name.  After init_simd(),
+ * returns 1 only if DCTSIZE == 8, samples are 8 bits, JCOEF and
+ * ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and simd_support has
+ * JSIMD_MIPS_DSPR2 set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_idct_6x6() by name.  After init_simd(),
+ * returns 1 only if DCTSIZE == 8, samples are 8 bits, JCOEF and
+ * ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and simd_support has
+ * JSIMD_MIPS_DSPR2 set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_6x6 (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_idct_12x12() by name.  After
+ * init_simd(), returns 1 only if samples are 8 bits, DCTSIZE == 8, JCOEF
+ * and ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and simd_support
+ * has JSIMD_MIPS_DSPR2 set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_12x12 (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{ /* Forwards to the DSPr2 2x2 IDCT; cinfo is unused; no-op without DSPr2. */
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block,
+                            output_buf, output_col);
+}
+
+/* Forwards to the DSPr2 4x4 IDCT, lending it a DCTSIZE*4-int work array. */
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  int scratch[DCTSIZE*4];  /* work area handed to the routine */
+  if (simd_support & JSIMD_MIPS_DSPR2)
+    jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block, output_buf,
+                              output_col, scratch);
+}
+
+GLOBAL(void)
+jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{ /* Forwards to the DSPr2 6x6 IDCT; cinfo is unused; no-op without DSPr2. */
+  if (!(simd_support & JSIMD_MIPS_DSPR2)) return;
+  jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block,
+                            output_buf, output_col);
+}
+
+/*
+ * 12x12 inverse DCT via the two DSPr2 passes.  Pass 1 is given coef_block,
+ * compptr->dct_table and a 96-int scratch buffer; pass 2 is given that
+ * buffer together with a table of the 12 destination row addresses
+ * (output_buf[row] + output_col, each cast to int).  Nothing happens when
+ * simd_support lacks JSIMD_MIPS_DSPR2.
+ */
+GLOBAL(void)
+jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block,
+                  JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  int scratch[96];
+  int row_addr[12];
+  int row;
+
+  if (!(simd_support & JSIMD_MIPS_DSPR2))
+    return;
+
+  /* Row pointers are handed to the assembly as plain ints. */
+  for (row = 0; row < 12; row++)
+    row_addr[row] = (int)(output_buf[row] + output_col);
+
+  jsimd_idct_12x12_pass1_mips_dspr2(coef_block, compptr->dct_table, scratch);
+  jsimd_idct_12x12_pass2_mips_dspr2(scratch, row_addr);
+}
+
+/*
+ * Availability check matching jsimd_idct_islow() by name.  After
+ * init_simd(), returns 1 only if DCTSIZE == 8, samples are 8 bits, JCOEF
+ * and ISLOW_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes and simd_support
+ * has JSIMD_MIPS_DSPR2 set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Availability check matching jsimd_idct_ifast() by name.  After
+ * init_simd(), returns 1 only if DCTSIZE == 8, samples are 8 bits, JCOEF
+ * and IFAST_MULT_TYPE are 2 bytes, JDIMENSION is 4 bytes, IFAST_SCALE_BITS
+ * is 2 and simd_support has JSIMD_MIPS_DSPR2 set; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* Compile-time properties the optimised code was written for. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  /* Last condition: the DSPr2 bit in simd_support. */
+  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  init_simd();
+  /* Always answers 0; jsimd_idct_float() in this file has an empty body. */
+  return 0;
+}
+
+/*
+ * Slow-integer inverse DCT through the DSPr2 routine.  The eight destination
+ * row addresses (output_buf[row] + output_col) are passed as ints, along
+ * with IDCT_range_limit(cinfo).  No-op without JSIMD_MIPS_DSPR2.
+ */
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  int row_addr[8];
+  int row;
+
+  if (!(simd_support & JSIMD_MIPS_DSPR2))
+    return;
+
+  for (row = 0; row < 8; row++)
+    row_addr[row] = (int)(output_buf[row] + output_col);
+  jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table, row_addr,
+                              IDCT_range_limit(cinfo));
+}
+
+/*
+ * Fast-integer inverse DCT split into two DSPr2 passes that communicate
+ * through a DCTSIZE2-element DCTELEM buffer.  compptr->dct_table is passed
+ * to the first pass as an IFAST_MULT_TYPE pointer, and both passes also
+ * receive mips_idct_ifast_coefs.  Does nothing when JSIMD_MIPS_DSPR2 is
+ * not set in simd_support.
+ */
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  DCTELEM between_passes[DCTSIZE2];
+
+  if (!(simd_support & JSIMD_MIPS_DSPR2))
+    return;
+
+  /* Column pass: coefficients and quantisation table in, work array out. */
+  jsimd_idct_ifast_cols_mips_dspr2(coef_block,
+                                   (IFAST_MULT_TYPE *) compptr->dct_table,
+                                   between_passes, mips_idct_ifast_coefs);
+
+  /* Row pass: work array in; output_buf and output_col name the target. */
+  jsimd_idct_ifast_rows_mips_dspr2(between_passes, output_buf, output_col,
+                                   mips_idct_ifast_coefs);
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{ /* Empty body: writes nothing (jsimd_can_idct_float() returns 0). */
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  return 0;  /* always 0; unlike its siblings this does not call init_simd() */
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return NULL;  /* stub: ignores every argument and always returns NULL */
+}
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
new file mode 100644
index 0000000..0eed1ce
--- /dev/null
+++ b/simd/jsimd_mips_dspr2.S
@@ -0,0 +1,4487 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_mips_dspr2_asm.h"
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width
+ * a1     - input_buf
+ * a2     - output_buf
+ * a3     - output_row
+ * 16(sp) - num_rows
+ * 20(sp) - cinfo->num_components
+ * Null conversion for compression: every num_components-th input byte is
+ * copied to output_buf[ci][output_row], for each component ci and each row.
+ */
+
+    SAVE_REGS_ON_STACK 8, s0, s1
+    // Stack args are read 8 bytes higher (presumably SAVE_REGS moves sp by 8).
+    lw        t9, 24(sp)   // t9 = num_rows
+    lw        s0, 28(sp)   // s0 = cinfo->num_components
+    andi      t0, a0, 3    // t0 = cinfo->image_width & 3
+    beqz      t0, 4f       // no residual
+     nop
+0:                         // row loop used when width is not a multiple of 4
+    addiu     t9, t9, -1   // --num_rows
+    bltz      t9, 7f       // no rows left
+     li       t1, 0        // t1 = ci = 0
+1:                         // per-component loop
+    sll       t3, t1, 2    // t3 = ci * 4
+    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
+    lw        t2, 0(a1)    // t2 = inptr = *input_buf
+    sll       t4, a3, 2    // t4 = output_row * 4
+    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
+    addu      t2, t2, t1   // inptr += ci
+    addu      s1, t5, a0   // s1 = outptr + image_width (end of row)
+    addu      t6, t5, t0   // t6 = outptr + (image_width & 3)
+2:                         // copy the first (width & 3) samples one at a time
+    lbu       t3, 0(t2)
+    addiu     t5, t5, 1
+    sb        t3, -1(t5)
+    bne       t6, t5, 2b
+     addu     t2, t2, s0   // inptr += num_components
+3:  // four samples per pass; NOTE(review): looks unsafe if width < 4 - confirm
+    lbu       t3, 0(t2)
+    addu      t4, t2, s0   // t4, t7, t8 = addresses of the next three samples
+    addu      t7, t4, s0
+    addu      t8, t7, s0
+    addu      t2, t8, s0   // inptr += 4 * num_components
+    lbu       t4, 0(t4)
+    lbu       t7, 0(t7)
+    lbu       t8, 0(t8)
+    addiu     t5, t5, 4
+    sb        t3, -4(t5)
+    sb        t4, -3(t5)
+    sb        t7, -2(t5)
+    bne       s1, t5, 3b   // until outptr reaches the end of the row
+     sb       t8, -1(t5)
+    addiu     t1, t1, 1    // ++ci
+    bne       t1, s0, 1b   // until ci == num_components
+     nop
+    addiu     a1, a1, 4    // ++input_buf
+    bgez      t9, 0b
+     addiu    a3, a3, 1    // ++output_row
+    b         7f
+     nop
+4:                         // row loop used when width is a multiple of 4
+    addiu     t9, t9, -1   // --num_rows
+    bltz      t9, 7f       // no rows left
+     li       t1, 0        // t1 = ci = 0
+5:                         // per-component loop
+    sll       t3, t1, 2    // t3 = ci * 4
+    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
+    lw        t2, 0(a1)    // t2 = inptr = *input_buf
+    sll       t4, a3, 2    // t4 = output_row * 4
+    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
+    addu      t2, t2, t1   // inptr += ci
+    addu      s1, t5, a0   // s1 = outptr + image_width (end of row)
+    addu      t6, t5, t0   // t6 is not read on this path (t0 is 0 here)
+6:  // four samples per pass; NOTE(review): looks unsafe if width == 0 - confirm
+    lbu       t3, 0(t2)
+    addu      t4, t2, s0   // t4, t7, t8 = addresses of the next three samples
+    addu      t7, t4, s0
+    addu      t8, t7, s0
+    addu      t2, t8, s0   // inptr += 4 * num_components
+    lbu       t4, 0(t4)
+    lbu       t7, 0(t7)
+    lbu       t8, 0(t8)
+    addiu     t5, t5, 4
+    sb        t3, -4(t5)
+    sb        t4, -3(t5)
+    sb        t7, -2(t5)
+    bne       s1, t5, 6b   // until outptr reaches the end of the row
+     sb       t8, -1(t5)
+    addiu     t1, t1, 1    // ++ci
+    bne       t1, s0, 5b   // until ci == num_components
+     nop
+    addiu     a1, a1, 4    // ++input_buf
+    bgez      t9, 4b
+     addiu    a3, a3, 1    // ++output_row
+7:                         // common exit
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j         ra
+     nop
+
+END(jsimd_c_null_convert_mips_dspr2)
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_mips_dspr2
+ * jsimd_extbgr_ycc_convert_mips_dspr2
+ * jsimd_extrgbx_ycc_convert_mips_dspr2
+ * jsimd_extbgrx_ycc_convert_mips_dspr2
+ * jsimd_extxbgr_ycc_convert_mips_dspr2
+ * jsimd_extxrgb_ycc_convert_mips_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
+// DO_RGB_TO_YCC: load one pixel into r, g, b and step inptr to the next pixel.
+.macro DO_RGB_TO_YCC r,    \
+                     g,    \
+                     b,    \
+                     inptr
+    lbu     \r, \r_offs(\inptr)
+    lbu     \g, \g_offs(\inptr)
+    lbu     \b, \b_offs(\inptr)
+    addiu   \inptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width (pixels converted per row)
+ * a1     - input_buf          (advanced by one row pointer per row)
+ * a2     - output_buf         (planes 0, 1, 2 receive Y, Cb, Cr)
+ * a3     - output_row         (incremented once per row)
+ * 16(sp) - num_rows           (read at 48(sp) after SAVE_REGS_ON_STACK 32)
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw      t7, 48(sp)        // t7 = num_rows
+    li      s0, 0x4c8b        // FIX(0.29900)
+    li      s1, 0x9646        // FIX(0.58700)
+    li      s2, 0x1d2f        // FIX(0.11400)
+    li      s3, 0xffffd4cd    // -FIX(0.16874)
+    li      s4, 0xffffab33    // -FIX(0.33126)
+    li      s5, 0x8000        // FIX(0.50000)
+    li      s6, 0xffff94d1    // -FIX(0.41869)
+    li      s7, 0xffffeb2f    // -FIX(0.08131)
+    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
+
+0:                            // row loop
+    addiu   t7, -1            // --num_rows
+    lw      t6, 0(a1)         // t6 = input_buf[0]
+    lw      t0, 0(a2)         // t0 = output_buf[0]
+    lw      t1, 4(a2)         // t1 = output_buf[1]
+    lw      t2, 8(a2)         // t2 = output_buf[2]
+    sll     t3, a3, 2         // t3 = output_row * 4
+    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
+    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
+    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
+
+    addu    t9, t2, a0        // t9 = end address
+    addiu   a3, 1             // ++output_row
+
+1:                            // pixel loop (body runs before the end test)
+    DO_RGB_TO_YCC t3, t4, t5, t6
+    // ac0 accumulates Y, ac1 accumulates Cb, ac2 accumulates Cr
+    mtlo    s5, $ac0          // seed with FIX(0.5) for rounding
+    mtlo    t8, $ac1          // seed with the Cb/Cr offset term
+    mtlo    t8, $ac2
+    maddu   $ac0, s2, t5      // Y  += FIX(0.11400) * b
+    maddu   $ac1, s5, t5      // Cb += FIX(0.50000) * b
+    maddu   $ac2, s5, t3      // Cr += FIX(0.50000) * r
+    maddu   $ac0, s0, t3      // Y  += FIX(0.29900) * r
+    maddu   $ac1, s3, t3      // Cb += -FIX(0.16874) * r
+    maddu   $ac2, s6, t4      // Cr += -FIX(0.41869) * g
+    maddu   $ac0, s1, t4      // Y  += FIX(0.58700) * g
+    maddu   $ac1, s4, t4      // Cb += -FIX(0.33126) * g
+    maddu   $ac2, s7, t5      // Cr += -FIX(0.08131) * b
+    extr.w  t3, $ac0, 16      // drop the 16 fraction bits
+    extr.w  t4, $ac1, 16
+    extr.w  t5, $ac2, 16
+    sb      t3, 0(t0)         // store Y
+    sb      t4, 0(t1)         // store Cb
+    sb      t5, 0(t2)         // store Cr
+    addiu   t0, 1
+    addiu   t2, 1
+    bne     t2, t9, 1b        // until the plane-2 pointer reaches the row end
+     addiu  t1, 1
+    bgtz    t7, 0b            // more rows remaining
+     addiu  a1, 4             // ++input_buf
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_mips_dspr2
+ * jsimd_ycc_extbgr_convert_mips_dspr2
+ * jsimd_ycc_extrgbx_convert_mips_dspr2
+ * jsimd_ycc_extbgrx_convert_mips_dspr2
+ * jsimd_ycc_extxbgr_convert_mips_dspr2
+ * jsimd_ycc_extxrgb_convert_mips_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
+// STORE_YCC_TO_RGB: write one pixel (plus 0xFF at a_offs for 4-byte pixels).
+.macro STORE_YCC_TO_RGB  scratch0 \
+                         scratch1 \
+                         scratch2 \
+                         outptr
+    sb       \scratch0, \r_offs(\outptr)
+    sb       \scratch1, \g_offs(\outptr)
+    sb       \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+    li       t0, 0xFF          // t0 is overwritten here, after the stores above
+    sb       t0, \a_offs(\outptr)
+.endif
+    addiu    \outptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width (pixels converted per row)
+ * a1     - input_buf          (planes 0, 1, 2 supply y, cb, cr)
+ * a2     - input_row          (incremented once per row)
+ * a3     - output_buf         (advanced by one row pointer per row)
+ * 16(sp) - num_rows           (read at 48(sp) after SAVE_REGS_ON_STACK 32)
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw         s1, 48(sp)      // s1 = num_rows
+    li         t3, 0x8000      // rounding term added to the green sum below
+    li         t4, 0x166e9     // FIX(1.40200)
+    li         t5, 0x1c5a2     // FIX(1.77200)
+    li         t6, 0xffff492e  // -FIX(0.71414)
+    li         t7, 0xffffa7e6  // -FIX(0.34414)
+    repl.ph    t8, 128         // t8 = 128 in both halfwords
+
+0:                             // row loop
+    lw         s0, 0(a3)       // s0 = outptr = *output_buf
+    lw         t0, 0(a1)       // t0 = input_buf[0]
+    lw         t1, 4(a1)       // t1 = input_buf[1]
+    lw         t2, 8(a1)       // t2 = input_buf[2]
+    sll        s5, a2, 2       // s5 = input_row * 4
+    addiu      s1, -1          // --num_rows
+    lwx        s2, s5(t0)      // s2 = input_buf[0][input_row] (y row)
+    lwx        s3, s5(t1)      // s3 = input_buf[1][input_row] (cb row)
+    lwx        s4, s5(t2)      // s4 = input_buf[2][input_row] (cr row)
+    addu       t9, s2, a0      // t9 = end of the y row
+    addiu      a2, 1           // ++input_row
+
+1:                             // pixel loop (body runs before the end test)
+    lbu        s7, 0(s4)       // cr
+    lbu        s6, 0(s3)       // cb
+    lbu        s5, 0(s2)       // y
+    addiu      s2, 1
+    addiu      s4, 1
+    addiu      s7, -128        // cr - 128
+    addiu      s6, -128        // cb - 128
+    mul        t2, t7, s6      // -FIX(0.34414) * (cb - 128)
+    mul        t0, t6, s7      // Crgtab[cr]
+    sll        s7, 15
+    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
+    sll        s6, 15
+    addu       t2, t3          // Cbgtab[cb]
+    addu       t2, t0          // green chroma term = Cbgtab[cb] + Crgtab[cr]
+
+    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
+    sra        t2, 16
+    addu       t1, s5          // add y (red)
+    addu       t2, s5          // add y
+    ins        t2, t1, 16, 16  // pack: red in bits 31..16, green in bits 15..0
+    subu.ph    t2, t2, t8
+    addu       t0, s5          // add y (blue)
+    shll_s.ph  t2, t2, 8
+    subu       t0, 128
+    shra.ph    t2, t2, 8
+    shll_s.w   t0, t0, 24
+    addu.ph    t2, t2, t8      // clip & store
+    sra        t0, t0, 24
+    sra        t1, t2, 16      // t1 = red halfword
+    addiu      t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne        s2, t9, 1b      // until the y pointer reaches the row end
+     addiu     s3, 1
+    bgtz       s1, 0b          // more rows remaining
+     addiu     a3, 4           // ++output_buf
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*------------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_mips_dspr2
+ * jsimd_extbgr_gray_convert_mips_dspr2
+ * jsimd_extrgbx_gray_convert_mips_dspr2
+ * jsimd_extbgrx_gray_convert_mips_dspr2
+ * jsimd_extxbgr_gray_convert_mips_dspr2
+ * jsimd_extxrgb_gray_convert_mips_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
+// DO_RGB_TO_GRAY: load one pixel into r, g, b and step inptr to the next pixel.
+.macro DO_RGB_TO_GRAY r,    \
+                      g,    \
+                      b,    \
+                      inptr
+    lbu     \r, \r_offs(\inptr)
+    lbu     \g, \g_offs(\inptr)
+    lbu     \b, \b_offs(\inptr)
+    addiu   \inptr, \pixel_size
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
+/*
+ * a0     - cinfo->image_width (pixels converted per row)
+ * a1     - input_buf          (advanced by one row pointer per row)
+ * a2     - output_buf         (only plane 0 is written)
+ * a3     - output_row         (incremented once per row)
+ * 16(sp) - num_rows           (read at 48(sp) after SAVE_REGS_ON_STACK 32)
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    li      s0, 0x4c8b             // s0 = FIX(0.29900)
+    li      s1, 0x9646             // s1 = FIX(0.58700)
+    li      s2, 0x1d2f             // s2 = FIX(0.11400)
+    li      s7, 0x8000             // s7 = FIX(0.50000)
+    lw      s6, 48(sp)             // s6 = num_rows
+    andi    t7, a0, 3              // t7 = image_width & 3 (residual pixels)
+
+0:                                 // row loop
+    addiu   s6, -1                 // s6 = num_rows
+    lw      t0, 0(a1)              // t0 = inptr = *input_buf
+    lw      t1, 0(a2)              // t1 = output_buf[0]
+    sll     t3, a3, 2              // t3 = output_row * 4
+    lwx     t1, t3(t1)             // t1 = outptr = output_buf[0][output_row]
+    addiu   a3, 1                  // ++output_row
+    addu    t9, t1, a0             // t9 = end of the output row
+    subu    t8, t9, t7             // t8 = end of the part done 4 at a time
+    beq     t1, t8, 2f             // fewer than 4 pixels: residual loop only
+     nop
+
+1:                                 // four pixels per pass, using ac0 and ac1
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo    s7, $ac0               // seed with FIX(0.5) for rounding
+    maddu   $ac0, s2, t5           // += FIX(0.11400) * b
+    maddu   $ac0, s1, t4           // += FIX(0.58700) * g
+    maddu   $ac0, s0, t3           // += FIX(0.29900) * r
+    mtlo    s7, $ac1
+    maddu   $ac1, s2, s5
+    maddu   $ac1, s1, s4
+    maddu   $ac1, s0, s3
+    extr.w  t6, $ac0, 16           // t6 = gray value of pixel 0
+
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo    s7, $ac0
+    maddu   $ac0, s2, t5
+    maddu   $ac0, s1, t4
+    extr.w  t2, $ac1, 16           // t2 = gray value of pixel 1
+    maddu   $ac0, s0, t3
+    mtlo    s7, $ac1
+    maddu   $ac1, s2, s5
+    maddu   $ac1, s1, s4
+    maddu   $ac1, s0, s3
+    extr.w  t5, $ac0, 16           // t5 = gray value of pixel 2
+    sb      t6, 0(t1)
+    sb      t2, 1(t1)
+    extr.w  t3, $ac1, 16           // t3 = gray value of pixel 3
+    addiu   t1, 4
+    sb      t5, -2(t1)
+    sb      t3, -1(t1)
+    bne     t1, t8, 1b
+     nop
+
+2:
+    beqz    t7, 4f                 // no residual pixels in this row
+     nop
+
+3:                                 // residual pixels, one per pass
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+
+    mtlo    s7, $ac0
+    maddu   $ac0, s2, t5
+    maddu   $ac0, s1, t4
+    maddu   $ac0, s0, t3
+    extr.w  t6, $ac0, 16
+    sb      t6, 0(t1)
+    addiu   t1, 1
+    bne     t1, t9, 3b             // until outptr reaches the row end
+     nop
+
+4:
+    bgtz    s6, 0b                 // more rows remaining
+     addiu  a1, 4                  // ++input_buf
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j ra
+     nop
+END(jsimd_\colorid\()_gray_convert_mips_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*------------------------------------------id --  pix R  G  B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_mips_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
+                                                pixel_size, \
+                                                r1_offs,    \
+                                                g1_offs,    \
+                                                b1_offs,    \
+                                                a1_offs,    \
+                                                r2_offs,    \
+                                                g2_offs,    \
+                                                b2_offs,    \
+                                                a2_offs
+
+.macro STORE_H2V2_2_PIXELS  scratch0 \
+                            scratch1 \
+                            scratch2 \
+                            scratch3 \
+                            scratch4 \
+                            scratch5 \
+                            outptr
+    sb       \scratch0, \r1_offs(\outptr)
+    sb       \scratch1, \g1_offs(\outptr)
+    sb       \scratch2, \b1_offs(\outptr)
+    sb       \scratch3, \r2_offs(\outptr)
+    sb       \scratch4, \g2_offs(\outptr)
+    sb       \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li       \scratch0, 0xFF
+    sb       \scratch0, \a1_offs(\outptr)
+    sb       \scratch0, \a2_offs(\outptr)
+.endif
+    addiu    \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V2_1_PIXEL  scratch0 \
+                           scratch1 \
+                           scratch2 \
+                           outptr
+    sb    \scratch0, \r1_offs(\outptr)
+    sb    \scratch1, \g1_offs(\outptr)
+    sb    \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+    li    t0, 0xFF
+    sb    t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
+/* One Cb/Cr pair colors two Y samples on each of two output rows.  Inputs:
+ * a0     - cinfo->output_width
+ * a1     - input_buf
+ * a2     - in_row_group_ctr
+ * a3     - output_buf
+ * 16(sp) - cinfo->sample_range_limit
+ */
+
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    lw           t9, 56(sp)        // cinfo->sample_range_limit (entry 16(sp), offset by the 40 given to SAVE_REGS_ON_STACK)
+    lw           v0, 0(a1)
+    lw           v1, 4(a1)
+    lw           t0, 8(a1)
+    sll          t1, a2, 3         // t1 = byte offset of row pointer [in_row_group_ctr*2]
+    addiu        t2, t1, 4         // t2 = byte offset of row pointer [in_row_group_ctr*2 + 1]
+    sll          t3, a2, 2         // t3 = byte offset of row pointer [in_row_group_ctr]
+    lw           t4, 0(a3)         // t4 = output_buf[0]
+    lwx          t1, t1(v0)        // t1 = input_buf[0][in_row_group_ctr*2]
+    lwx          t2, t2(v0)        // t2 = input_buf[0][in_row_group_ctr*2 + 1]
+    lwx          t5, t3(v1)        // t5 = input_buf[1][in_row_group_ctr]
+    lwx          t6, t3(t0)        // t6 = input_buf[2][in_row_group_ctr]
+    lw           t7, 4(a3)         // t7 = output_buf[1]
+    li           s1, 0xe6ea
+    addiu        t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
+    addiu        s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
+    addiu        s1, zero, 0xa7e6  // s1 = 0xffffa7e6 [-FIX(0.34414)]
+    xori         s2, s1, 0xeec8    // s2 = 0xffff492e [-FIX(0.71414)]
+    srl          t3, a0, 1         // t3 = output_width / 2 (number of full column pairs)
+    blez         t3, 2f
+     addu        t0, t5, t3        // t0 = end address
+ 1:
+    lbu          t3, 0(t5)
+    lbu          s3, 0(t6)
+    addiu        t5, t5, 1
+    addiu        t3, t3, -128      // (cb - 128)
+    addiu        s3, s3, -128      // (cr - 128)
+    mult         $ac1, s1, t3
+    madd         $ac1, s2, s3
+    sll          s3, s3, 15
+    sll          t3, t3, 15
+    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
+    extr_r.w     s5, $ac1, 16
+    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
+    lbu          v0, 0(t1)
+    addiu        t6, t6, 1
+    addiu        t1, t1, 2
+    addu         t3, v0, s4        // y+cred
+    addu         s3, v0, s5        // y+cgreen
+    addu         v1, v0, s6        // y+cblue
+    addu         t3, t9, t3        // y+cred
+    addu         s3, t9, s3        // y+cgreen
+    addu         v1, t9, v1        // y+cblue
+    lbu          AT, 0(t3)
+    lbu          s7, 0(s3)
+    lbu          ra, 0(v1)
+    lbu          v0, -1(t1)        // second y of the first row (t1 was already advanced by 2)
+    addu         t3, v0, s4        // y+cred
+    addu         s3, v0, s5        // y+cgreen
+    addu         v1, v0, s6        // y+cblue
+    addu         t3, t9, t3        // y+cred
+    addu         s3, t9, s3        // y+cgreen
+    addu         v1, t9, v1        // y+cblue
+    lbu          t3, 0(t3)
+    lbu          s3, 0(s3)
+    lbu          v1, 0(v1)
+    lbu          v0, 0(t2)         // first y of the second row, loaded before the stores below
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+    addu         t3, v0, s4        // y+cred
+    addu         s3, v0, s5        // y+cgreen
+    addu         v1, v0, s6        // y+cblue
+    addu         t3, t9, t3        // y+cred
+    addu         s3, t9, s3        // y+cgreen
+    addu         v1, t9, v1        // y+cblue
+    lbu          AT, 0(t3)
+    lbu          s7, 0(s3)
+    lbu          ra, 0(v1)
+    lbu          v0, 1(t2)
+    addiu        t2, t2, 2
+    addu         t3, v0, s4        // y+cred
+    addu         s3, v0, s5        // y+cgreen
+    addu         v1, v0, s6        // y+cblue
+    addu         t3, t9, t3        // y+cred
+    addu         s3, t9, s3        // y+cgreen
+    addu         v1, t9, v1        // y+cblue
+    lbu          t3, 0(t3)
+    lbu          s3, 0(s3)
+    lbu          v1, 0(v1)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+    bne          t0, t5, 1b
+     nop
+2:
+    andi         t0, a0, 1         // odd output_width leaves one more column to convert
+    beqz         t0, 4f
+     lbu          t3, 0(t5)
+    lbu          s3, 0(t6)
+    addiu        t3, t3, -128      // (cb - 128)
+    addiu        s3, s3, -128      // (cr - 128)
+    mult         $ac1, s1, t3
+    madd         $ac1, s2, s3
+    sll          s3, s3, 15
+    sll          t3, t3, 15
+    lbu          v0, 0(t1)
+    extr_r.w     s5, $ac1, 16
+    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
+    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
+    addu         t3, v0, s4        // y+cred
+    addu         s3, v0, s5        // y+cgreen
+    addu         v1, v0, s6        // y+cblue
+    addu         t3, t9, t3        // y+cred
+    addu         s3, t9, s3        // y+cgreen
+    addu         v1, t9, v1        // y+cblue
+    lbu          t3, 0(t3)
+    lbu          s3, 0(s3)
+    lbu          v1, 0(v1)
+    lbu          v0, 0(t2)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+    addu         t3, v0, s4        // y+cred
+    addu         s3, v0, s5        // y+cgreen
+    addu         v1, v0, s6        // y+cblue
+    addu         t3, t9, t3        // y+cred
+    addu         s3, t9, s3        // y+cgreen
+    addu         v1, t9, v1        // y+cblue
+    lbu          t3, 0(t3)
+    lbu          s3, 0(s3)
+    lbu          v1, 0(v1)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6 /* A1/A2 are only stored when pix == 8 */
+GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6 /* A1/A2 are only stored when pix == 8 */
+GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_mips_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
+                                                pixel_size, \
+                                                r1_offs,    \
+                                                g1_offs,    \
+                                                b1_offs,    \
+                                                a1_offs,    \
+                                                r2_offs,    \
+                                                g2_offs,    \
+                                                b2_offs,    \
+                                                a2_offs
+
+.macro STORE_H2V1_2_PIXELS  scratch0 \
+                            scratch1 \
+                            scratch2 \
+                            scratch3 \
+                            scratch4 \
+                            scratch5 \
+                            outptr
+    sb       \scratch0, \r1_offs(\outptr)   // first pixel: R, G, B
+    sb       \scratch1, \g1_offs(\outptr)
+    sb       \scratch2, \b1_offs(\outptr)
+    sb       \scratch3, \r2_offs(\outptr)   // second pixel: R, G, B
+    sb       \scratch4, \g2_offs(\outptr)
+    sb       \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li       t0, 0xFF                       // clobbers t0; the loop below reloads t0 on its next pass
+    sb       t0, \a1_offs(\outptr)
+    sb       t0, \a2_offs(\outptr)
+.endif
+    addiu    \outptr, \pixel_size           // step over both pixels
+.endm
+
+.macro STORE_H2V1_1_PIXEL  scratch0 \
+                           scratch1 \
+                           scratch2 \
+                           outptr
+    sb    \scratch0, \r1_offs(\outptr)
+    sb    \scratch1, \g1_offs(\outptr)
+    sb    \scratch2, \b1_offs(\outptr)      // outptr is not advanced by this macro
+.if (\pixel_size == 8)
+    li    t0, 0xFF
+    sb    t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
+/* One Cb/Cr pair colors two horizontally adjacent Y samples of one row.  Inputs:
+ * a0     - cinfo->output_width
+ * a1     - input_buf
+ * a2     - in_row_group_ctr
+ * a3     - output_buf
+ * 16(sp) - range_limit
+ */
+
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    li           t0, 0xe6ea
+    lw           t1, 0(a1)         // t1 = input_buf[0]
+    lw           t2, 4(a1)         // t2 = input_buf[1]
+    lw           t3, 8(a1)         // t3 = input_buf[2]
+    lw           t8, 56(sp)        // t8 = range_limit
+    addiu        s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
+    addiu        s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
+    addiu        s0, t0, 0x9916    // s0 = 0x8000 (added before the >> 16 in the odd-width tail)
+    addiu        s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
+    xori         s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
+    srl          t0, a0, 1         // t0 = output_width / 2 (number of full column pairs)
+    sll          t4, a2, 2         // t4 = byte offset of row pointer [in_row_group_ctr]
+    lwx          s5, t4(t1)        // s5 = inptr0
+    lwx          s6, t4(t2)        // s6 = inptr1
+    lwx          s7, t4(t3)        // s7 = inptr2
+    lw           t7, 0(a3)         // t7 = outptr
+    blez         t0, 2f
+     addu        t9, s6, t0        // t9 = end address
+1:
+    lbu          t2, 0(s6)         // t2 = cb
+    lbu          t0, 0(s7)         // t0 = cr
+    lbu          t1, 0(s5)         // t1 = y
+    addiu        t2, t2, -128      // t2 = cb - 128
+    addiu        t0, t0, -128      // t0 = cr - 128
+    mult         $ac1, s4, t2
+    madd         $ac1, s3, t0
+    sll          t0, t0, 15
+    sll          t2, t2, 15
+    mulq_rs.w    t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
+    extr_r.w     t5, $ac1, 16
+    mulq_rs.w    t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
+    addiu        s7, s7, 1
+    addiu        s6, s6, 1
+    addu         t2, t1, t0        // t2 = y + cred
+    addu         t3, t1, t5        // t3 = y + cgreen
+    addu         t4, t1, t6        // t4 = y + cblue
+    addu         t2, t8, t2
+    addu         t3, t8, t3
+    addu         t4, t8, t4
+    lbu          t1, 1(s5)         // t1 = second y of the pair
+    lbu          v0, 0(t2)
+    lbu          v1, 0(t3)
+    lbu          ra, 0(t4)
+    addu         t2, t1, t0
+    addu         t3, t1, t5
+    addu         t4, t1, t6
+    addu         t2, t8, t2
+    addu         t3, t8, t3
+    addu         t4, t8, t4
+    lbu          t2, 0(t2)
+    lbu          t3, 0(t3)
+    lbu          t4, 0(t4)
+
+    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+    bne          t9, s6, 1b
+     addiu       s5, s5, 2
+2:
+    andi         t0, a0, 1         // odd output_width leaves one more column to convert
+    beqz         t0, 4f
+     nop
+3:
+    lbu          t2, 0(s6)
+    lbu          t0, 0(s7)
+    lbu          t1, 0(s5)
+    addiu        t2, t2, -128      //(cb - 128)
+    addiu        t0, t0, -128      //(cr - 128)
+    mul          t3, s4, t2        // green term built with mul/addu/sra here instead of mult/madd/extr_r.w
+    mul          t4, s3, t0
+    sll          t0, t0, 15
+    sll          t2, t2, 15
+    mulq_rs.w    t0, s1, t0       // (C1*cr + ONE_HALF)>> SCALEBITS
+    mulq_rs.w    t6, s2, t2       // (C2*cb + ONE_HALF)>> SCALEBITS
+    addu         t3, t3, s0
+    addu         t3, t4, t3
+    sra          t5, t3, 16       // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
+    addu         t2, t1, t0       // y + cred
+    addu         t3, t1, t5       // y + cgreen
+    addu         t4, t1, t6       // y + cblue
+    addu         t2, t8, t2
+    addu         t3, t8, t3
+    addu         t4, t8, t4
+    lbu          t2, 0(t2)
+    lbu          t3, 0(t3)
+    lbu          t4, 0(t4)
+
+    STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j            ra
+     nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6 /* A1/A2 are only stored when pix == 8 */
+GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6 /* A1/A2 are only stored when pix == 8 */
+GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_mips_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
+/* Each input row yields two output rows, each twice as wide.  Inputs:
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - downsampled_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    li             s4, 0           // s4 = byte offset into the output row pointer array
+    lw             s2, 0(a3)       // s2 = *output_data_ptr
+0:
+    li             t9, 2           // t9 = output rows still to produce for this input row
+    lw             s1, -4(a2)      // s1 = inptr1 (row pointer just before the current one)
+
+1:
+    lw             s0, 0(a2)       // s0 = inptr0
+    lwx            s3, s4(s2)      // s3 = output row pointer at offset s4
+    addiu          s5, a1, -2      // s5 = downsampled_width - 2
+    srl            t4, s5, 1
+    sll            t4, t4, 1       // t4 = s5 rounded down to an even count
+    lbu            t0, 0(s0)
+    lbu            t1, 1(s0)
+    lbu            t2, 0(s1)
+    lbu            t3, 1(s1)
+    addiu          s0, 2
+    addiu          s1, 2
+    addu           t8, s0, t4      // t8 = end address
+    andi           s5, s5, 1       // s5 = residual
+    sll            t4, t0, 1
+    sll            t6, t1, 1
+    addu           t0, t0, t4      // t0 = (*inptr0++) * 3
+    addu           t1, t1, t6      // t1 = (*inptr0++) * 3
+    addu           t7, t0, t2      // t7 = thiscolsum
+    addu           t6, t1, t3      // t6 = nextcolsum
+    sll            t0, t7, 2       // t0 = thiscolsum * 4
+    subu           t1, t0, t7      // t1 = thiscolsum * 3
+    shra_r.w       t0, t0, 4
+    addiu          t1, 7
+    addu           t1, t1, t6
+    srl            t1, t1, 4
+    sb             t0, 0(s3)
+    sb             t1, 1(s3)
+    beq            t8, s0, 22f     // skip the two-column loop when t4 == 0 (downsampled_width - 2 < 2)
+     addiu          s3, 2
+2:
+    lh             t0, 0(s0)       // t0 = A3|A2
+    lh             t2, 0(s1)       // t2 = B3|B2
+    addiu          s0, 2
+    addiu          s1, 2
+    preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
+    preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
+    shll.ph        t1, t0, 1
+    sll            t3, t6, 1
+    addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
+    addu           t3, t3, t6      // t3 = this * 3
+    addu.ph        t0, t0, t2      // t0 = next2|next1
+    addu           t1, t3, t7
+    andi           t7, t0, 0xFFFF  // t7 = next1
+    sll            t2, t7, 1
+    addu           t2, t7, t2      // t2 = next1*3
+    addu           t4, t2, t6
+    srl            t6, t0, 16      // t6 = next2
+    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
+    addu           t0, t3, t7
+    addiu          t0, 7
+    srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
+    shra_r.w       t4, t4, 4       // t4 = (next1*3 + this + 8) >> 4
+    addu           t2, t2, t6
+    addiu          t2, 7
+    srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    sb             t4, 2(s3)
+    sb             t2, 3(s3)
+    bne            t8, s0, 2b
+     addiu         s3, 4
+22:
+    beqz           s5, 4f
+     addu          t8, s0, s5
+3:
+    lbu            t0, 0(s0)
+    lbu            t2, 0(s1)
+    addiu          s0, 1
+    addiu          s1, 1
+    sll            t3, t6, 1
+    sll            t1, t0, 1
+    addu           t1, t0, t1      // t1 = inptr0 * 3
+    addu           t3, t3, t6      // t3 = thiscolsum * 3
+    addu           t5, t1, t2      // t5 = column sum for the newly loaded column
+    addu           t1, t3, t7
+    shra_r.w       t1, t1, 4
+    addu           t0, t3, t5
+    addiu          t0, 7
+    srl            t0, t0, 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    addiu          s3, 2
+    move           t7, t6
+    bne            t8, s0, 3b
+     move          t6, t5
+4:
+    sll            t0, t6, 2       // t0 = thiscolsum * 4
+    subu           t1, t0, t6      // t1 = thiscolsum * 3
+    addu           t1, t1, t7
+    addiu          s4, 4           // next output row pointer
+    shra_r.w       t1, t1, 4
+    addiu          t0, 7
+    srl            t0, t0, 4
+    sb             t1, 0(s3)
+    sb             t0, 1(s3)
+    addiu          t9, -1
+    addiu          s3, 2
+    bnez           t9, 1b
+     lw            s1, 4(a2)       // second pass pairs inptr0 with the row pointer just after it
+    srl            t0, s4, 2       // t0 = output rows written so far
+    subu           t0, a0, t0
+    bgtz           t0, 0b          // repeat until max_v_samp_factor rows are written
+     addiu         a2, 4
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j ra
+     nop
+END(jsimd_h2v2_fancy_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
+/* Doubles each row horizontally; every output sample blends 3/4 of one input with 1/4 of a neighbour.  Inputs:
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - downsampled_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    .set at
+
+    beqz           a0, 3f
+     sll           t0, a0, 2
+    lw             s1, 0(a3)
+    li             s3, 0x10001 // s3 = 1 in each halfword, added as a bias by addu.ph below
+    addu           s0, s1, t0  // s0 = end of the output row pointer array
+0:
+    addiu          t8, a1, -2  // t8 = downsampled_width - 2
+    srl            t9, t8, 2   // t9 = passes of the four-column loop
+    lw             t7, 0(a2)
+    lw             s2, 0(s1)
+    lbu            t0, 0(t7)
+    lbu            t1, 1(t7)   // t1 = inptr[1]
+    sll            t2, t0, 1
+    addu           t2, t2, t0  // t2 = invalue*3
+    addu           t2, t2, t1
+    shra_r.w       t2, t2, 2
+    sb             t0, 0(s2)   // first output sample is the first input sample unchanged
+    sb             t2, 1(s2)
+    beqz           t9, 11f
+     addiu         s2, 2
+1:
+    ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
+    ulw            t1, 1(t7)
+    ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
+    preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
+    preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
+    preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
+    preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
+    preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
+    shll.ph        t5, t4, 1
+    shll.ph        t6, t1, 1
+    addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
+    addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
+    addu.ph        t4, t3, s3
+    addu.ph        t0, t0, s3
+    addu.ph        t4, t4, t5
+    addu.ph        t0, t0, t6
+    shrl.ph        t4, t4, 2   // t4 = |0|P3|0|P2|
+    shrl.ph        t0, t0, 2   // t0 = |0|P1|0|P0|
+    addu.ph        t2, t2, t5
+    addu.ph        t3, t3, t6
+    shra_r.ph      t2, t2, 2   // t2 = |0|P5|0|P4|
+    shra_r.ph      t3, t3, 2   // t3 = |0|P3|0|P2|
+    shll.ph        t2, t2, 8
+    shll.ph        t3, t3, 8
+    or             t2, t4, t2
+    or             t3, t3, t0
+    addiu          t9, -1
+    usw            t3, 0(s2)
+    usw            t2, 4(s2)
+    addiu          s2, 8
+    bgtz           t9, 1b
+     addiu         t7, 4
+11:
+    andi           t8, 3       // t8 = columns left over after the four-column loop
+    beqz           t8, 22f
+     addiu         t7, 1
+
+2:
+    lbu            t0, 0(t7)
+    addiu          t7, 1
+    sll            t1, t0, 1
+    addu           t2, t0, t1  // t2 = invalue*3
+    lbu            t3, -2(t7)  // left neighbour (t7 was already advanced)
+    lbu            t4, 0(t7)   // right neighbour
+    addiu          t3, 1
+    addiu          t4, 2
+    addu           t3, t3, t2
+    addu           t4, t4, t2
+    srl            t3, 2
+    srl            t4, 2
+    sb             t3, 0(s2)
+    sb             t4, 1(s2)
+    addiu          t8, -1
+    bgtz           t8, 2b
+     addiu         s2, 2
+
+22:
+    lbu            t0, 0(t7)
+    lbu            t2, -1(t7)
+    sll            t1, t0, 1
+    addu           t1, t1, t0 // t1 = invalue * 3
+    addu           t1, t1, t2
+    addiu          t1, 1
+    srl            t1, t1, 2
+    sb             t1, 0(s2)
+    sb             t0, 1(s2)  // last output sample is the last input sample unchanged
+    addiu          s1, 4
+    bne            s1, s0, 0b
+     addiu         a2, 4
+3:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j              ra
+     nop
+END(jsimd_h2v1_fancy_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
+/* Averages each horizontal pair of input samples into one output sample.  Inputs:
+ * a0     - cinfo->image_width
+ * a1     - cinfo->max_v_samp_factor
+ * a2     - compptr->v_samp_factor
+ * a3     - compptr->width_in_blocks
+ * 16(sp) - input_data
+ * 20(sp) - output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+    beqz        a2, 7f
+     lw         s1, 44(sp)  // s1 = output_data
+    lw          s0, 40(sp)  // s0 = input_data
+    srl         s2, a0, 2
+    andi        t9, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3   // t0 = width_in_blocks*DCT
+    srl         t7, t0, 1
+    subu        s2, t7, s2  // s2 = halfword stores issued by loop 4 below
+0:
+    andi        t6, a0, 1   // t6 = temp_index
+    addiu       t6, -1      // t6 = 0 for odd image_width, -1 for even
+    lw          t4, 0(s1)   // t4 = outptr
+    lw          t5, 0(s0)   // t5 = inptr0
+    li          s3, 0       // s3 = bias
+    srl         t7, a0, 1   // t7 = image_width1
+    srl         s4, t7, 2   // s4 = passes of the four-output loop
+    andi        t8, t7, 3   // t8 = outputs left for the one-at-a-time loop
+1:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 2(t5)
+    ulhu        t2, 4(t5)
+    ulhu        t3, 6(t5)
+    raddu.w.qb  t0, t0
+    raddu.w.qb  t1, t1
+    raddu.w.qb  t2, t2
+    raddu.w.qb  t3, t3
+    shra.ph     t0, t0, 1   // truncating and rounding halves alternate: bias pattern 0, 1, 0, 1
+    shra_r.ph   t1, t1, 1
+    shra.ph     t2, t2, 1
+    shra_r.ph   t3, t3, 1
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t3, 3(t4)
+    addiu       s4, -1
+    addiu       t4, 4
+    bgtz        s4, 1b
+     addiu      t5, 8
+    beqz        t8, 3f
+     addu       s4, t4, t8  // s4 = end address for loop 2
+2:
+    ulhu        t0, 0(t5)
+    raddu.w.qb  t0, t0
+    addqh.w     t0, t0, s3
+    xori        s3, s3, 1   // toggle bias between 0 and 1
+    sb          t0, 0(t4)
+    addiu       t4, 1
+    bne         t4, s4, 2b
+     addiu      t5, 2
+3:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    addqh.w     t2, t1, s3  // t2 = pixval1
+    xori        s3, s3, 1
+    addqh.w     t3, t1, s3  // t3 = pixval2
+    blez        s2, 5f
+     append     t3, t2,  8
+    addu        t5, t4, s2  // t5 = loop_end2 (not read by loop 4, which counts s2 down)
+4:
+    ush         t3, 0(t4)
+    addiu       s2, -1      // NOTE(review): s2 is computed once before label 0 and consumed here - confirm behaviour for rows after the first
+    bgtz        s2, 4b
+     addiu      t4,  2
+5:
+    beqz        t9, 6f
+     nop
+    sb          t2, 0(t4)
+6:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 4
+7:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+    j           ra
+    nop
+END(jsimd_h2v1_downsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
+
+/* Averages each 2x2 group of input samples (two rows) into one output sample.  Inputs:
+ * a0     - cinfo->image_width
+ * a1     - cinfo->max_v_samp_factor
+ * a2     - compptr->v_samp_factor
+ * a3     - compptr->width_in_blocks
+ * 16(sp) - input_data
+ * 20(sp) - output_data
+ */
+    .set at
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    beqz         a2, 8f
+     lw          s1, 52(sp)      // s1 = output_data
+    lw           s0, 48(sp)      // s0 = input_data
+
+    andi         t6, a0, 1       // t6 = temp_index
+    addiu        t6, -1          // t6 = 0 for odd image_width, -1 for even
+    srl          t7, a0, 1       // t7 = image_width1
+    srl          s4, t7, 2       // s4 = passes of the four-output loop
+    andi         t8, t7, 3       // t8 = outputs left for the one-at-a-time loop
+    andi         t9, a0, 2
+    srl          s2, a0, 2
+    srl          t7, t9, 1
+    addu         s2, t7, s2
+    sll          t0, a3, 3       // t0 = width_in_blocks*DCT
+    srl          t7, t0, 1
+    subu         s2, t7, s2      // s2 = halfword stores issued by loop 5 below
+0:
+    lw           t4, 0(s1)       // t4 = outptr
+    lw           t5, 0(s0)       // t5 = inptr0
+    lw           s7, 4(s0)       // s7 = inptr1
+    li           s6, 1           // s6 = bias
+2:
+    ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
+    ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
+    ulw          t2, 4(t5)
+    ulw          t3, 4(s7)
+    precrq.ph.w  t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
+    ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
+    raddu.w.qb   t1, t7
+    raddu.w.qb   t0, t0
+    shra_r.w     t1, t1, 2       // rounding shift adds 2 before >> 2
+    addiu        t0, 1           // bias 1 before >> 2
+    srl          t0, 2
+    precrq.ph.w  t7, t2, t3
+    ins          t2, t3, 16, 16
+    raddu.w.qb   t7, t7
+    raddu.w.qb   t2, t2
+    shra_r.w     t7, t7, 2
+    addiu        t2, 1
+    srl          t2, 2
+    sb           t0, 0(t4)
+    sb           t1, 1(t4)
+    sb           t2, 2(t4)
+    sb           t7, 3(t4)
+    addiu        t4, 4
+    addiu        t5, 8
+    addiu        s4, s4, -1      // NOTE(review): s4, t8 and s2 are set before label 0 and modified inside the row loop - confirm behaviour for rows after the first
+    bgtz         s4, 2b
+     addiu       s7, 8
+    beqz         t8, 4f
+     addu        t8, t4, t8      // t8 = end address for loop 3
+3:
+    ulhu         t0, 0(t5)
+    ulhu         t1, 0(s7)
+    ins          t0, t1, 16, 16
+    raddu.w.qb   t0, t0
+    addu         t0, t0, s6
+    srl          t0, 2
+    xori         s6, s6, 3       // toggle bias between 1 and 2
+    sb           t0, 0(t4)
+    addiu        t5, 2
+    addiu        t4, 1
+    bne          t8, t4, 3b
+     addiu       s7, 2
+4:
+    lbux         t1, t6(t5)
+    sll          t1, 1
+    lbux         t0, t6(s7)
+    sll          t0, 1
+    addu         t1, t1, t0
+    addu         t3, t1, s6
+    srl          t0, t3, 2       // t0 = pixval1
+    xori         s6, s6, 3
+    addu         t2, t1, s6
+    srl          t1, t2, 2       // t1 = pixval2
+    blez         s2, 6f
+     append      t1, t0, 8
+5:
+    ush          t1, 0(t4)
+    addiu        s2, -1
+    bgtz         s2, 5b
+     addiu       t4, 2
+6:
+    beqz         t9, 7f
+     nop
+    sb           t0, 0(t4)
+7:
+    addiu        s1, 4
+    addiu        a2, -1
+    bnez         a2, 0b
+     addiu       s0, 8
+8:
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j            ra
+     nop
+END(jsimd_h2v2_downsample_mips_dspr2)
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
+/* 2x2 downsample where each output mixes its own 2x2 group (weight t6) with surrounding samples (weight t7).  Inputs:
+ * a0     - input_data
+ * a1     - output_data
+ * a2     - compptr->v_samp_factor
+ * a3     - cinfo->max_v_samp_factor
+ * 16(sp) - cinfo->smoothing_factor
+ * 20(sp) - compptr->width_in_blocks
+ * 24(sp) - cinfo->image_width
+ */
+
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s7, 52(sp)      // compptr->width_in_blocks
+    lw          s0, 56(sp)      // cinfo->image_width
+    lw          s6, 48(sp)      // cinfo->smoothing_factor
+    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
+    sll         v0, s7, 1
+    subu        v0, v0, s0      // v0 = output_cols * 2 - image_width (columns to pad)
+    blez        v0, 2f
+    move        v1, zero        // v1 = row counter for the padding loop
+    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
+0:
+    addiu       t1, a0, -4      // start one row pointer before input_data[0]
+    sll         t2, v1, 2
+    lwx         t1, t2(t1)
+    move        t3, v0
+    addu        t1, t1, s0      // t1 = first column past image_width
+    lbu         t2, -1(t1)      // t2 = last real sample of the row
+1:
+    addiu       t3, t3, -1
+    sb          t2, 0(t1)       // replicate it into the padding columns
+    bgtz        t3, 1b
+    addiu       t1, t1, 1
+    addiu       v1, v1, 1
+    bne         v1, t0, 0b
+    nop
+2:
+    li          v0, 80
+    mul         v0, s6, v0
+    li          v1, 16384
+    move        t4, zero        // t4 = output row index
+    move        t5, zero        // t5 = input row index
+    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
+    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+    sll         v0, t4, 2
+    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
+    sll         v1, t5, 2
+    addiu       t9, v1, 4
+    addiu       s0, v1, -4
+    addiu       s1, v1, 8
+    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
+    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
+    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
+    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0          // t2 = sum of the 2x2 group itself
+    raddu.w.qb  s3, t0          // s3 = sum of the two samples above and two below
+    lbu         v0, 0(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, 0(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1,t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, 0(s0)
+    lbu         t0, 0(s1)
+    sll         s3, s3, 1       // edge neighbours are counted twice
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3      // corner neighbours are counted once
+    madd        $ac1,s3, t7
+    extr_r.w    v0, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    addiu       s1, s1, 2
+    sb          v0, -1(t8)
+    addiu       s4, s7, -2
+    and         s4, s4, 3       // s4 = (output_cols - 2) & 3, handled one column at a time
+    addu        s5, s4, t8      //end address
+4:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    sb          t2, -1(t8)
+    bne         s5, t8, 4b
+    addiu       s1, s1, 2
+    addiu       s5, s7, -2
+    subu        s5, s5, s4
+    addu        s5, s5, t8      //end address
+5:
+    lh          v0, 0(s2)       // unrolled: four output columns per pass
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    lh          v1, 2(t9)
+    addu        t0, t0, v0
+    lh          v0, 2(s2)
+    addu        s3, t0, s3
+    lh          t0, 2(s0)
+    lh          t1, 2(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 4(s2)
+    lbu         t0, 1(t9)
+    lbu         t1, 4(t9)
+    sb          t2, 0(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 1(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 4(s0)
+    addu        t0, t0, v0
+    lbu         v0, 1(s0)
+    addu        s3, t0, s3
+    lbu         t0, 1(s1)
+    lbu         t3, 4(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 4(t9)
+    addu        t0, t0, v0
+    lh          v0, 4(s2)
+    addu        s3, t0, s3
+    lh          t0, 4(s0)
+    lh          t1, 4(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 6(s2)
+    lbu         t0, 3(t9)
+    lbu         t1, 6(t9)
+    sb          t2, 1(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 3(s2)
+    addu        t0, t0,t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 6(s0)
+    addu        t0, t0, v0
+    lbu         v0, 3(s0)
+    addu        s3, t0, s3
+    lbu         t0, 3(s1)
+    lbu         t3, 6(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 6(t9)
+    addu        t0, t0, v0
+    lh          v0, 6(s2)
+    addu        s3, t0, s3
+    lh          t0, 6(s0)
+    lh          t1, 6(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t3, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 8(s2)
+    lbu         t0, 5(t9)
+    lbu         t1, 8(t9)
+    sb          t3, 2(t8)
+    raddu.w.qb  t2, v0
+    lbu         v0, 5(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t2, t6
+    addu        v0, v0, v1
+    lbu         t2, 8(s0)
+    addu        t0, t0, v0
+    lbu         v0, 5(s0)
+    addu        s3, t0, s3
+    lbu         t0, 5(s1)
+    lbu         t3, 8(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    addiu       t8, t8, 4
+    addu        t0, t0, v0
+    addiu       s2, s2, 8
+    addu        s3, t0, s3
+    addiu       t9, t9, 8
+    madd        $ac1, s3, t7
+    extr_r.w    t1, $ac1, 16
+    addiu       s0, s0, 8
+    addiu       s1, s1, 8
+    bne         s5, t8, 5b
+    sb          t1, -1(t8)      // fourth column of the pass (t8 was already advanced by 4)
+/* Special case for last column */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 1(s2)       // offset 1 instead of 2: the last column stands in for the missing right neighbour
+    lbu         t0, -1(t9)
+    lbu         t1, 1(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 1(s0)
+    addu        t0, t0, v0
+    lbu         t3, 1(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t0, $ac1, 16
+    addiu       t5, t5, 2       // NOTE(review): t5 is also incremented by 2 after the bne below - confirm the intended input row step per output row
+    sb          t0, 0(t8)
+    addiu       t4, t4, 1
+    bne         t4, a2, 3b
+    addiu       t5, t5, 2
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_smooth_downsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
+/*
+ * Upsampling by integer factors.  For each input row, every input byte is
+ * stored h_expand times into an output row (labels 1-3); when
+ * v_expand > 1, a row-to-row copy loop (labels 5-8) then duplicates
+ * output rows.
+ *
+ * a0     - upsample->h_expand[compptr->component_index]
+ * a1     - upsample->v_expand[compptr->component_index]
+ * a2     - input_data
+ * a3     - output_data_ptr
+ * 16(sp) - cinfo->output_width
+ * 20(sp) - cinfo->max_v_samp_factor
+ *
+ * The two stack arguments are read at 32(sp)/36(sp) below because they are
+ * loaded after SAVE_REGS_ON_STACK 16 (presumably that macro lowers sp by
+ * 16; it is defined outside this section -- confirm).
+ *
+ * NOTE(review): t6 (inrow) and s3 (outrow) are added directly to the bases
+ * of the row-pointer arrays, yet they advance by 1 and by v_expand rather
+ * than by the 4-byte pointer stride used elsewhere in this routine
+ * (0(s0)/4(s0)), and s0 itself is advanced inside loop 5.  Confirm that this
+ * row indexing is what is intended.
+ */
+    // Let the assembler use $at; presumably needed for the macro
+    // instructions below (bgt, subu with an immediate operand).
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    lw      s0, 0(a3)    // s0 = output_data
+    lw      s1, 32(sp)   // s1 = cinfo->output_width
+    lw      s2, 36(sp)   // s2 = cinfo->max_v_samp_factor
+    li      t6, 0        // t6 = inrow
+    beqz    s2, 10f      // nothing to do when max_v_samp_factor == 0
+     li     s3, 0        // s3 = outrow
+0:
+    // Start of one input row: fetch the input and output row pointers.
+    addu    t0, a2, t6
+    addu    t7, s0, s3
+    lw      t3, 0(t0)    // t3 = inptr
+    lw      t8, 0(t7)    // t8 = outptr
+    beqz    s1, 4f       // skip horizontal expansion when output_width == 0
+     addu   t5, t8, s1   // t5 = outend
+1:
+    lb      t2, 0(t3)    // t2 = invalue = *inptr++
+    addiu   t3, 1
+    beqz    a0, 3f       // h_expand == 0: store nothing for this sample
+     move   t0, a0       // t0 = h_expand
+2:
+    // Store invalue h_expand times; the pointer bump sits in the delay slot.
+    sb      t2, 0(t8)
+    addiu   t0, -1
+    bgtz    t0, 2b
+     addiu  t8, 1
+3:
+    bgt     t5, t8, 1b   // continue while outptr < outend
+     nop
+4:
+    addiu   t9, a1, -1   // t9 = v_expand - 1
+    blez    t9, 9f       // no row duplication needed when v_expand <= 1
+     nop
+5:
+    // Copy output_width bytes from the row at 0(s0) to the row at 4(s0).
+    lw      t3, 0(s0)
+    lw      t4, 4(s0)
+    subu    t0, s1, 0xF
+    blez    t0, 7f       // width <= 15: byte-by-byte copy only
+     addu   t5, t3, s1   // t5 = end address
+    andi    t7, s1, 0xF  // t7 = residual
+    subu    t8, t5, t7   // t8 = end of the part copied 16 bytes at a time
+6:
+    // 16 bytes per iteration using unaligned word loads/stores.
+    ulw     t0, 0(t3)
+    ulw     t1, 4(t3)
+    ulw     t2, 8(t3)
+    usw     t0, 0(t4)
+    ulw     t0, 12(t3)
+    usw     t1, 4(t4)
+    usw     t2, 8(t4)
+    usw     t0, 12(t4)
+    addiu   t3, 16
+    bne     t3, t8, 6b
+     addiu  t4, 16
+    beqz    t7, 8f       // no residual bytes left to copy
+     nop
+7:
+    // Byte copy of the remaining bytes, until the source reaches t5.
+    lbu     t0, 0(t3)
+    sb      t0, 0(t4)
+    addiu   t3, 1
+    bne     t3, t5, 7b
+     addiu  t4, 1
+8:
+    addiu   t9, -1
+    bgtz    t9, 5b       // repeat v_expand - 1 times in total
+     addiu  s0, 8        // delay slot: advance s0 by two row pointers
+9:
+    addu    s3, s3, a1   // outrow += v_expand
+    bne     s3, s2, 0b   // loop until outrow == max_v_samp_factor
+     addiu  t6, 1        // delay slot: inrow++
+10:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j       ra
+     nop
+END(jsimd_int_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
+/*
+ * 2:1 horizontal upsampling: every input byte is written twice to the
+ * output row.  One output row is produced per entry of the output_data
+ * pointer array, for max_v_samp_factor rows.
+ *
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - cinfo->output_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ *
+ * Each row is handled in two parts: a main loop that turns 8 input bytes
+ * into 16 output bytes, and a residual loop for the last
+ * (output_width & 0xf) output bytes.
+ */
+    lw      t7, 0(a3)       // t7 = output_data
+    andi    t8, a1, 0xf     // t8 = residual
+    sll     t0, a0, 2       // t0 = max_v_samp_factor * 4 (bytes of row pointers)
+    blez    a0, 4f          // no rows to process
+     addu   t9, t7, t0      // t9 = output_data end address
+0:
+    lw      t5, 0(t7)       // t5 = outptr
+    lw      t6, 0(a2)       // t6 = inptr
+    addu    t3, t5, a1      // t3 = outptr + output_width (end address)
+    subu    t3, t8          // t3 = end address - residual
+    beq     t5, t3, 2f      // fewer than 16 output bytes: residual loop only
+     move   t4, t8          // t4 = residual output bytes still to write
+1:
+    // Expand 8 input bytes into 16 output bytes by doubling each byte.
+    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
+    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
+    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
+    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
+    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
+    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
+    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
+    usw     t0, 0(t5)
+    usw     t1, 4(t5)
+    srl     t0, t2, 16      // t0 = |X|X|P7|P6|
+    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
+    ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
+    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
+    ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
+    usw     t2, 8(t5)
+    usw     t0, 12(t5)
+    addiu   t5, 16
+    bne     t5, t3, 1b
+     addiu  t6, 8           // delay slot: inptr += 8
+    beqz    t8, 3f          // no residual: row finished
+     move   t4, t8
+2:
+    // Residual loop: one input byte -> two output bytes per iteration.
+    lbu     t1, 0(t6)
+    sb      t1, 0(t5)
+    sb      t1, 1(t5)
+    addiu   t4, -2
+    addiu   t6, 1
+    bgtz    t4, 2b
+     addiu  t5, 2
+3:
+    addiu   t7, 4           // next output row pointer
+    bne     t9, t7, 0b
+     addiu  a2, 4           // delay slot: next input row pointer
+4:
+    j       ra
+     nop
+END(jsimd_h2v1_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
+/*
+ * 2:1 horizontal and 2:1 vertical upsampling.  For each input row, the
+ * first output row of a pair is built by writing every input byte twice
+ * (labels 1-2); that output row is then copied into the second output row
+ * of the pair (labels 3-5).
+ *
+ * a0     - cinfo->max_v_samp_factor
+ * a1     - cinfo->output_width
+ * a2     - input_data
+ * a3     - output_data_ptr
+ *
+ * a0 is used as the remaining-row counter and is decremented by 2 per
+ * input row; t7 walks the output row-pointer array two entries at a time.
+ */
+    lw      t7, 0(a3)       // t7 = output_data
+    blez    a0, 7f          // no rows to process
+     andi   t9, a1, 0xf     // t9 = residual
+0:
+    lw      t6, 0(a2)       // t6 = inptr
+    lw      t5, 0(t7)       // t5 = outptr
+    addu    t8, t5, a1      // t8 = outptr end address
+    subu    t8, t9          // t8 = end address - residual
+    beq     t5, t8, 2f      // fewer than 16 output bytes: residual loop only
+     move   t4, t9          // t4 = residual output bytes still to write
+1:
+    // Expand 8 input bytes into 16 output bytes by doubling each byte.
+    ulw     t0, 0(t6)
+    srl     t1, t0, 16
+    ins     t0, t0, 16, 16
+    ins     t0, t0, 8, 16
+    ins     t1, t1, 16, 16
+    ins     t1, t1, 8, 16
+    ulw     t2, 4(t6)
+    usw     t0, 0(t5)
+    usw     t1, 4(t5)
+    srl     t3, t2, 16
+    ins     t2, t2, 16, 16
+    ins     t2, t2, 8, 16
+    ins     t3, t3, 16, 16
+    ins     t3, t3, 8, 16
+    usw     t2, 8(t5)
+    usw     t3, 12(t5)
+    addiu   t5, 16
+    bne     t5, t8, 1b
+     addiu  t6, 8           // delay slot: inptr += 8
+    beqz    t9, 3f          // no residual: go duplicate the row
+     move   t4, t9
+2:
+    // Residual loop: one input byte -> two output bytes per iteration.
+    lbu     t0, 0(t6)
+    sb      t0, 0(t5)
+    sb      t0, 1(t5)
+    addiu   t4, -2
+    addiu   t6, 1
+    bgtz    t4, 2b
+     addiu  t5, 2
+3:
+    // Vertical step: copy the row just written into the next output row.
+    lw      t6, 0(t7)       // t6 = outptr[0]
+    lw      t5, 4(t7)       // t5 = outptr[1]
+    addu    t4, t6, a1      // t4 = new end address
+    beq     a1, t9, 5f      // width < 16: byte copy only
+     subu   t8, t4, t9      // delay slot: t8 = end of the 16-byte-copy part
+4:
+    // 16 bytes per iteration using unaligned word loads/stores.
+    ulw     t0, 0(t6)
+    ulw     t1, 4(t6)
+    ulw     t2, 8(t6)
+    usw     t0, 0(t5)
+    ulw     t0, 12(t6)
+    usw     t1, 4(t5)
+    usw     t2, 8(t5)
+    usw     t0, 12(t5)
+    addiu   t6, 16
+    bne     t6, t8, 4b
+     addiu  t5, 16
+    beqz    t9, 6f          // no residual bytes left to copy
+     nop
+5:
+    // Byte copy of the remaining bytes, until the source reaches t4.
+    lbu     t0, 0(t6)
+    sb      t0, 0(t5)
+    addiu   t6, 1
+    bne     t6, t4, 5b
+     addiu  t5, 1
+6:
+    addiu   t7, 8           // advance past the two output rows just written
+    addiu   a0, -2          // two output rows done
+    bgtz    a0, 0b
+     addiu  a2, 4           // delay slot: next input row pointer
+7:
+    j       ra
+     nop
+END(jsimd_h2v2_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
+/*
+ * Inverse DCT of one 8x8 coefficient block, done in two passes through a
+ * 256-byte (8x8 32-bit words) workspace allocated on the stack.
+ *
+ * a0     - coef_block
+ * a1     - compptr->dcttable
+ * a2     - output
+ * a3     - range_limit
+ *
+ * Pass 1 (labels 1-3) walks the 8 columns of coef_block, dequantizes with
+ * the table at a1 and writes one workspace column per iteration.
+ * Pass 2 (labels 4-6) walks the 8 workspace rows, converts each result to
+ * a byte through the table at a3 (index masked with 0x3ff) and stores 8
+ * bytes to the row pointer read from a2; a2 advances by one pointer
+ * (4 bytes) per row.
+ *
+ * Both passes take a shortcut when all inputs other than element 0 of the
+ * column/row are zero.
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu     sp, sp, -256     // reserve the 8x8 word workspace
+    move      v0, sp           // v0 = workspace pointer (wsptr)
+    addiu     v1, zero, 8      // v1 = DCTSIZE = 8
+1:
+    // Pass 1, one column: OR together rows 1..7 to detect an all-zero
+    // AC column.
+    lh        s4, 32(a0)       // s4 = inptr[16]
+    lh        s5, 64(a0)       // s5 = inptr[32]
+    lh        s6, 96(a0)       // s6 = inptr[48]
+    lh        t1, 112(a0)      // t1 = inptr[56]
+    lh        t7, 16(a0)       // t7 = inptr[8]
+    lh        t5, 80(a0)       // t5 = inptr[40]
+    lh        t3, 48(a0)       // t3 = inptr[24]
+    or        s4, s4, t1
+    or        s4, s4, t3
+    or        s4, s4, t5
+    or        s4, s4, t7
+    or        s4, s4, s5
+    or        s4, s4, s6
+    bnez      s4, 2f
+     addiu    v1, v1, -1       // delay slot: column counter--, on both paths
+    // Shortcut: only row 0 is nonzero; replicate the scaled DC term down
+    // the whole workspace column.
+    lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
+    lh        s6, 0(a0)        // inptr[DCTSIZE*0]
+    mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
+    sll       s5, s5, 2        // scale up by 2 bits (presumably PASS1_BITS)
+    sw        s5, 0(v0)
+    sw        s5, 32(v0)
+    sw        s5, 64(v0)
+    sw        s5, 96(v0)
+    sw        s5, 128(v0)
+    sw        s5, 160(v0)
+    sw        s5, 192(v0)
+    b         3f
+     sw       s5, 224(v0)
+2:
+    // Full column computation.  Odd part first (rows 7, 3, 5, 1).
+    lh        t0, 112(a1)
+    lh        t2, 48(a1)
+    lh        t4, 80(a1)
+    lh        t6, 16(a1)
+    mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
+    mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
+    mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
+    mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
+    lh        t4, 32(a1)
+    lh        t5, 32(a0)
+    lh        t6, 96(a1)
+    lh        t7, 96(a0)
+    addu      s0, t0, t1       // z3 = tmp0 + tmp2
+    addu      s1, t1, t2       // z2 = tmp1 + tmp2
+    addu      s2, t2, t3       // z4 = tmp1 + tmp3
+    addu      s3, s0, s2       // z3 + z4
+    addiu     t9, zero, 9633   // FIX_1_175875602
+    mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
+    addu      t8, t0, t3       // z1 = tmp0 + tmp3
+    addiu     t9, zero, 2446   // FIX_0_298631336
+    mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
+    addiu     t9, zero, 16819  // FIX_2_053119869
+    mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
+    addiu     t9, zero, 25172  // FIX_3_072711026
+    mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
+    addiu     t9, zero, 12299  // FIX_1_501321110
+    mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
+    addiu     t9, zero, 16069  // FIX_1_961570560
+    mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
+    addiu     t9, zero, 3196   // FIX_0_390180644
+    mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
+    addiu     t9, zero, 7373   // FIX_0_899976223
+    mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
+    addiu     t9, zero, 20995  // FIX_2_562915447
+    mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
+    // The products above use positive constants, so the negated terms are
+    // applied with subu below.
+    subu      s0, s3, s0       // z3 += z5
+    addu      t0, t0, s0       // tmp0 += z3
+    addu      t1, t1, s0       // tmp2 += z3
+    subu      s2, s3, s2       // z4 += z5
+    addu      t2, t2, s2       // tmp1 += z4
+    addu      t3, t3, s2       // tmp3 += z4
+    subu      t0, t0, t8       // tmp0 += z1
+    subu      t1, t1, s1       // tmp2 += z2
+    subu      t2, t2, s1       // tmp1 += z2
+    subu      t3, t3, t8       // tmp3 += z1
+    // Even part (rows 2, 6, 0, 4).
+    mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
+    addiu     t9, zero, 6270   // FIX_0_765366865
+    mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
+    lh        t4, 0(a1)
+    lh        t5, 0(a0)
+    lh        t6, 64(a1)
+    lh        t7, 64(a0)
+    mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
+    mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
+    mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
+    addiu     t9, zero, 4433   // FIX_0_541196100
+    addu      s3, s0, s1       // z2 + z3
+    mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
+    addiu     t9, zero, 15137  // FIX_1_847759065
+    mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
+    addu      t4, t5, t6
+    subu      t5, t5, t6
+    sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
+    sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
+    addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
+    subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
+    addu      s0, t4, t7
+    subu      s1, t4, t7
+    addu      s2, t5, t6
+    subu      s3, t5, t6
+    // Butterfly: combine the even part (s0..s3) with the odd part (t0..t3).
+    addu      t4, s0, t3
+    subu      s0, s0, t3
+    addu      t3, s2, t1
+    subu      s2, s2, t1
+    addu      t1, s3, t2
+    subu      s3, s3, t2
+    addu      t2, s1, t0
+    subu      s1, s1, t0
+    // Rounding right shift by 11 (presumably CONST_BITS - PASS1_BITS).
+    shra_r.w  t4, t4, 11
+    shra_r.w  t3, t3, 11
+    shra_r.w  t1, t1, 11
+    shra_r.w  t2, t2, 11
+    shra_r.w  s1, s1, 11
+    shra_r.w  s3, s3, 11
+    shra_r.w  s2, s2, 11
+    shra_r.w  s0, s0, 11
+    // Store the column: workspace rows are 32 bytes apart.
+    sw        t4, 0(v0)
+    sw        t3, 32(v0)
+    sw        t1, 64(v0)
+    sw        t2, 96(v0)
+    sw        s1, 128(v0)
+    sw        s3, 160(v0)
+    sw        s2, 192(v0)
+    sw        s0, 224(v0)
+3:
+    addiu     a1, a1, 2        // next quantization table column
+    addiu     a0, a0, 2        // next coefficient column
+    bgtz      v1, 1b
+     addiu    v0, v0, 4        // delay slot: next workspace column
+    // Pass 2 setup: rewind to the start of the workspace.
+    move      v0, sp
+    addiu     v1, zero, 8
+4:
+    lw        t0, 8(v0)        // z2 = (JLONG) wsptr[2]
+    lw        t1, 24(v0)       // z3 = (JLONG) wsptr[6]
+    lw        t2, 0(v0)        // (JLONG) wsptr[0]
+    lw        t3, 16(v0)       // (JLONG) wsptr[4]
+    lw        s4, 4(v0)        // (JLONG) wsptr[1]
+    lw        s5, 12(v0)       // (JLONG) wsptr[3]
+    lw        s6, 20(v0)       // (JLONG) wsptr[5]
+    lw        s7, 28(v0)       // (JLONG) wsptr[7]
+    // OR together wsptr[1..7] to detect a row with only wsptr[0] nonzero.
+    or        s4, s4, t0
+    or        s4, s4, t1
+    or        s4, s4, t3
+    or        s4, s4, s7
+    or        s4, s4, s5
+    or        s4, s4, s6
+    bnez      s4, 5f
+     addiu    v1, v1, -1       // delay slot: row counter--, on both paths
+    // Shortcut: one table lookup, replicated into all 8 output bytes.
+    shra_r.w  s5, t2, 5        // rounding right shift of wsptr[0] by 5
+    andi      s5, s5, 0x3ff    // mask to a 10-bit table index
+    lbux      s5, s5(a3)       // byte lookup in range_limit
+    lw        s1, 0(a2)        // s1 = output row pointer
+    replv.qb  s5, s5           // replicate the byte into all four lanes
+    usw       s5, 0(s1)
+    usw       s5, 4(s1)
+    b         6f
+     nop
+5:
+    // Full row computation.  Even part.
+    addu      t4, t0, t1       // z2 + z3
+    addiu     t8, zero, 4433   // FIX_0_541196100
+    mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
+    addiu     t8, zero, 15137  // FIX_1_847759065
+    mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
+    addiu     t8, zero, 6270   // FIX_0_765366865
+    mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
+    addu      t4, t2, t3       // (JLONG) wsptr[0] + (JLONG) wsptr[4]
+    subu      t2, t2, t3       // (JLONG) wsptr[0] - (JLONG) wsptr[4]
+    sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
+    sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
+    subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
+    subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
+    addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
+    addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
+    subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
+    addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
+    // Odd part.
+    lw        t4, 28(v0)       // tmp0 = (JLONG) wsptr[7]
+    lw        t6, 12(v0)       // tmp2 = (JLONG) wsptr[3]
+    lw        t5, 20(v0)       // tmp1 = (JLONG) wsptr[5]
+    lw        t7, 4(v0)        // tmp3 = (JLONG) wsptr[1]
+    addu      s0, t4, t6       // z3 = tmp0 + tmp2
+    addiu     t8, zero, 9633   // FIX_1_175875602
+    addu      s1, t5, t7       // z4 = tmp1 + tmp3
+    addu      s2, s0, s1       // z3 + z4
+    mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
+    addu      s3, t4, t7       // z1 = tmp0 + tmp3
+    addu      t9, t5, t6       // z2 = tmp1 + tmp2
+    addiu     t8, zero, 16069  // FIX_1_961570560
+    mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
+    addiu     t8, zero, 3196   // FIX_0_390180644
+    mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
+    addiu     t8, zero, 2446   // FIX_0_298631336
+    mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
+    addiu     t8, zero, 7373   // FIX_0_899976223
+    mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
+    addiu     t8, zero, 16819  // FIX_2_053119869
+    mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
+    addiu     t8, zero, 20995  // FIX_2_562915447
+    mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
+    addiu     t8, zero, 25172  // FIX_3_072711026
+    mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
+    addiu     t8, zero, 12299  // FIX_1_501321110
+    mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
+    subu      s0, s2, s0       // z3 += z5
+    subu      s1, s2, s1       // z4 += z5
+    addu      t4, t4, s0
+    subu      t4, t4, s3       // tmp0
+    addu      t5, t5, s1
+    subu      t5, t5, t9       // tmp1
+    addu      t6, t6, s0
+    subu      t6, t6, t9       // tmp2
+    addu      t7, t7, s1
+    subu      t7, t7, s3       // tmp3
+    // Butterfly: combine the even part (t0..t3) with the odd part (t4..t7).
+    addu      s0, t0, t7
+    subu      t0, t0, t7
+    addu      t7, t2, t6
+    subu      t2, t2, t6
+    addu      t6, t3, t5
+    subu      t3, t3, t5
+    addu      t5, t1, t4
+    subu      t1, t1, t4
+    // Rounding right shift by 18 (presumably CONST_BITS + PASS1_BITS + 3).
+    shra_r.w  s0, s0, 18
+    shra_r.w  t7, t7, 18
+    shra_r.w  t6, t6, 18
+    shra_r.w  t5, t5, 18
+    shra_r.w  t1, t1, 18
+    shra_r.w  t3, t3, 18
+    shra_r.w  t2, t2, 18
+    shra_r.w  t0, t0, 18
+    // Mask each result to a 10-bit table index.
+    andi      s0, s0, 0x3ff
+    andi      t7, t7, 0x3ff
+    andi      t6, t6, 0x3ff
+    andi      t5, t5, 0x3ff
+    andi      t1, t1, 0x3ff
+    andi      t3, t3, 0x3ff
+    andi      t2, t2, 0x3ff
+    andi      t0, t0, 0x3ff
+    lw        s1, 0(a2)        // s1 = output row pointer
+    // Byte lookups in range_limit.
+    lbux      s0, s0(a3)
+    lbux      t7, t7(a3)
+    lbux      t6, t6(a3)
+    lbux      t5, t5(a3)
+    lbux      t1, t1(a3)
+    lbux      t3, t3(a3)
+    lbux      t2, t2(a3)
+    lbux      t0, t0(a3)
+    // Store the 8 output bytes of this row.
+    sb        s0, 0(s1)
+    sb        t7, 1(s1)
+    sb        t6, 2(s1)
+    sb        t5, 3(s1)
+    sb        t1, 4(s1)
+    sb        t3, 5(s1)
+    sb        t2, 6(s1)
+    sb        t0, 7(s1)
+6:
+    addiu     v0, v0, 32       // next workspace row
+    bgtz      v1, 4b
+     addiu    a2, a2, 4        // delay slot: next output row pointer
+    addiu     sp, sp, 256      // release the workspace
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j         ra
+     nop
+
+END(jsimd_idct_islow_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
+/*
+ * Column pass of the fast integer inverse DCT.  Two adjacent columns are
+ * processed per iteration: every lw fetches a pair of 16-bit values and the
+ * arithmetic uses paired-halfword (.ph) DSP instructions.  a0, a1 and a2
+ * advance by 4 bytes per iteration and the loop ends when a0 reaches its
+ * initial value + 16, i.e. after four iterations.
+ *
+ * a0     - inptr
+ * a1     - quantptr
+ * a2     - wsptr
+ * a3     - mips_idct_ifast_coefs
+ *
+ * The constant table is read through AT at byte offsets 0, 4, 8 and 12.
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu          t9, a0, 16            // end address
+    or             AT, a3, zero          // AT = a3 (constant table pointer)
+
+0:
+    // Dequantize row 0 of both columns, interleaved with the loads of rows
+    // 1..7 that feed the all-zero test below.
+    lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
+    lw             t0, 0(a0)             // inptr[DCTSIZE*0]
+    lw             t1, 16(a0)            // inptr[DCTSIZE*1]
+    muleq_s.w.phl  v0, t0, s0            // tmp0 ...
+    lw             t2, 32(a0)            // inptr[DCTSIZE*2]
+    lw             t3, 48(a0)            // inptr[DCTSIZE*3]
+    lw             t4, 64(a0)            // inptr[DCTSIZE*4]
+    lw             t5, 80(a0)            // inptr[DCTSIZE*5]
+    muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
+    lw             t6, 96(a0)            // inptr[DCTSIZE*6]
+    lw             t7, 112(a0)           // inptr[DCTSIZE*7]
+    or             s4, t1, t2
+    or             s5, t3, t4
+    bnez           s4, 1f
+     ins           t0, v0, 16, 16        // ... tmp0
+    bnez           s5, 1f
+     or            s6, t5, t6
+    or             s6, s6, t7
+    bnez           s6, 1f
+     sw            t0, 0(a2)             // wsptr[DCTSIZE*0]
+    // Rows 1..7 are zero in both columns: replicate tmp0 to every row.
+    // (The row-0 store above sits in a delay slot, so it also executes when
+    // the branch to 1: is taken; that path stores 0(a2) again later.)
+    sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
+    sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
+    sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
+    sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
+    sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
+    sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
+    sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
+    addiu          a0, a0, 4
+    b              2f
+     addiu         a1, a1, 4
+
+1:
+    // Full computation.  Each dequantized pair is formed from a phl/phr
+    // multiply pair whose results are packed into one word with ins.
+    lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
+    lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
+    muleq_s.w.phl  v0, t2, s1            // tmp1 ...
+    muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
+    lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
+    lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
+    lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
+    muleq_s.w.phl  v1, t4, s2            // tmp2 ...
+    muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
+    lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
+    lw             t8, 4(AT)             // FIX(1.414213562)
+    ins            t2, v0, 16, 16        // ... tmp1
+    muleq_s.w.phl  v0, t6, s3            // tmp3 ...
+    muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
+    ins            t4, v1, 16, 16        // ... tmp2
+    addq.ph        s4, t0, t4            // tmp10
+    subq.ph        s5, t0, t4            // tmp11
+    ins            t6, v0, 16, 16        // ... tmp3
+    subq.ph        s6, t2, t6            // tmp12 ...
+    addq.ph        s7, t2, t6            // tmp13
+    mulq_s.ph      s6, s6, t8            // ... tmp12 ...
+    addq.ph        t0, s4, s7            // tmp0
+    subq.ph        t6, s4, s7            // tmp3
+    muleq_s.w.phl  v0, t1, s0            // tmp4 ...
+    muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
+    shll_s.ph      s6, s6, 1             // x2
+    lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
+    subq.ph        s6, s6, s7            // ... tmp12
+    muleq_s.w.phl  v1, t7, s3            // tmp7 ...
+    muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
+    ins            t1, v0, 16, 16        // ... tmp4
+    addq.ph        t2, s5, s6            // tmp1
+    subq.ph        t4, s5, s6            // tmp2
+    muleq_s.w.phl  v0, t5, s2            // tmp6 ...
+    muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
+    ins            t7, v1, 16, 16        // ... tmp7
+    addq.ph        s5, t1, t7            // z11
+    subq.ph        s6, t1, t7            // z12
+    muleq_s.w.phl  v1, t3, s1            // tmp5 ...
+    muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
+    ins            t5, v0, 16, 16        // ... tmp6
+    ins            t3, v1, 16, 16        // ... tmp5
+    addq.ph        s7, t5, t3            // z13
+    subq.ph        v0, t5, t3            // z10
+    addq.ph        t7, s5, s7            // tmp7
+    subq.ph        s5, s5, s7            // tmp11 ...
+    addq.ph        v1, v0, s6            // z5 ...
+    mulq_s.ph      s5, s5, t8            // ... tmp11
+    lw             t8, 8(AT)             // FIX(1.847759065)
+    lw             s4, 0(AT)             // FIX(1.082392200)
+    addq.ph        s0, t0, t7            // tmp0 + tmp7
+    subq.ph        s1, t0, t7            // tmp0 - tmp7
+    mulq_s.ph      v1, v1, t8            // ... z5
+    shll_s.ph      s5, s5, 1             // x2
+    lw             t8, 12(AT)            // FIX(-2.613125930)
+    sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
+    shll_s.ph      v0, v0, 1             // x4
+    mulq_s.ph      v0, v0, t8            // tmp12 ...
+    mulq_s.ph      s4, s6, s4            // tmp10 ...
+    shll_s.ph      v1, v1, 1             // x2
+    addiu          a0, a0, 4             // next pair of input columns
+    addiu          a1, a1, 4             // next pair of quantization columns
+    sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
+    shll_s.ph      s6, v0, 1             // x4
+    shll_s.ph      s4, s4, 1             // x2
+    addq.ph        s6, s6, v1            // ... tmp12
+    subq.ph        t5, s6, t7            // tmp6
+    subq.ph        s4, s4, v1            // ... tmp10
+    subq.ph        t3, s5, t5            // tmp5
+    addq.ph        s2, t2, t5            // tmp1 + tmp6
+    addq.ph        t1, s4, t3            // tmp4
+    subq.ph        s3, t2, t5            // tmp1 - tmp6
+    sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
+    sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
+    addq.ph        v0, t4, t3            // tmp2 + tmp5
+    subq.ph        v1, t4, t3            // tmp2 - tmp5
+    sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
+    sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
+    addq.ph        v0, t6, t1            // tmp3 + tmp4
+    subq.ph        v1, t6, t1            // tmp3 - tmp4
+    sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
+    sw             v1, 48(a2)            // wsptr[DCTSIZE*3]
+
+2:
+    bne            a0, t9, 0b
+     addiu         a2, a2, 4             // delay slot: next pair of wsptr columns
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j              ra
+     nop
+
+END(jsimd_idct_ifast_cols_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
+/*
+ * Row pass of the fast integer inverse DCT.  Two workspace rows are
+ * processed per iteration (the second row is read 16 bytes after the
+ * first): a0 advances by 32 bytes and a1 by two row pointers (8 bytes), and
+ * the loop ends when a0 reaches its initial value + 128.  Each iteration
+ * stores 8 bytes to each of two output rows at (row pointer + output_col).
+ *
+ * a0     - wsptr
+ * a1     - output_buf
+ * a2     - output_col
+ * a3     - mips_idct_ifast_coefs
+ *
+ * a3 and AT are both reused as scratch output pointers inside the loop, so
+ * the constant table pointer is reloaded into AT from the stack at the top
+ * of every iteration.
+ */
+
+    // a3 is included in the save list so it can be reloaded from 36(sp)
+    // below (presumably the slot the macro assigns to it -- it is defined
+    // outside this section).
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    addiu         t9, a0, 128        // end address
+    // s8 = 0x80808080: added to every output byte below (presumably the
+    // +128 sample level shift).
+    lui           s8, 0x8080
+    ori           s8, s8, 0x8080
+
+0:
+    lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
+    lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
+    lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
+    lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
+    lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
+    lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
+    lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
+    lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
+    lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
+    // Regroup so each register holds the same element index of both rows,
+    // then test whether everything except element 0 is zero.
+    precrq.ph.w   t1, s0, t0         // B b
+    ins           t0, s0, 16, 16     // A a
+    bnez          t1, 1f
+     or           s0, t2, s2
+    bnez          s0, 1f
+     or           s0, t4, s4
+    bnez          s0, 1f
+     or           s0, t6, s6
+    bnez          s0, 1f
+     shll_s.ph    s0, t0, 2          // A a
+    // Shortcut: only element 0 of each row is nonzero, so each output row
+    // is 8 copies of one byte.  (The shll_s.ph above is in a delay slot and
+    // also runs when the branch is taken; the 1: path recomputes s0 before
+    // using it.)
+    lw            a3, 0(a1)          // a3 = first output row pointer
+    lw            AT, 4(a1)          // AT = second output row pointer
+    precrq.ph.w   t0, s0, s0         // A A
+    ins           s0, s0, 16, 16     // a a
+    addu          a3, a3, a2         // + output_col
+    addu          AT, AT, a2         // + output_col
+    precrq.qb.ph  t0, t0, t0         // A A A A
+    precrq.qb.ph  s0, s0, s0         // a a a a
+    addu.qb       s0, s0, s8
+    addu.qb       t0, t0, s8
+    sw            s0, 0(a3)
+    sw            s0, 4(a3)
+    sw            t0, 0(AT)
+    sw            t0, 4(AT)
+    addiu         a0, a0, 32
+    bne           a0, t9, 0b
+     addiu        a1, a1, 8
+    b             2f
+     nop
+
+1:
+    // Full computation: finish regrouping the remaining element pairs.
+    precrq.ph.w   t3, s2, t2
+    ins           t2, s2, 16, 16
+    precrq.ph.w   t5, s4, t4
+    ins           t4, s4, 16, 16
+    precrq.ph.w   t7, s6, t6
+    ins           t6, s6, 16, 16
+    lw            t8, 4(AT)          // FIX(1.414213562)
+    addq.ph       s4, t0, t4         // tmp10
+    subq.ph       s5, t0, t4         // tmp11
+    subq.ph       s6, t2, t6         // tmp12 ...
+    addq.ph       s7, t2, t6         // tmp13
+    mulq_s.ph     s6, s6, t8         // ... tmp12 ...
+    addq.ph       t0, s4, s7         // tmp0
+    subq.ph       t6, s4, s7         // tmp3
+    shll_s.ph     s6, s6, 1          // x2
+    subq.ph       s6, s6, s7         // ... tmp12
+    addq.ph       t2, s5, s6         // tmp1
+    subq.ph       t4, s5, s6         // tmp2
+    addq.ph       s5, t1, t7         // z11
+    subq.ph       s6, t1, t7         // z12
+    addq.ph       s7, t5, t3         // z13
+    subq.ph       v0, t5, t3         // z10
+    addq.ph       t7, s5, s7         // tmp7
+    subq.ph       s5, s5, s7         // tmp11 ...
+    addq.ph       v1, v0, s6         // z5 ...
+    mulq_s.ph     s5, s5, t8         // ... tmp11
+    lw            t8, 8(AT)          // FIX(1.847759065)
+    lw            s4, 0(AT)          // FIX(1.082392200)
+    addq.ph       s0, t0, t7         // tmp0 + tmp7
+    subq.ph       s7, t0, t7         // tmp0 - tmp7
+    mulq_s.ph     v1, v1, t8         // ... z5
+    lw            a3, 0(a1)          // a3 = first output row pointer
+    lw            t8, 12(AT)         // FIX(-2.613125930)
+    shll_s.ph     s5, s5, 1          // x2
+    addu          a3, a3, a2         // + output_col
+    shll_s.ph     v0, v0, 1          // x4
+    mulq_s.ph     v0, v0, t8         // tmp12 ...
+    mulq_s.ph     s4, s6, s4         // tmp10 ...
+    shll_s.ph     v1, v1, 1          // x2
+    addiu         a0, a0, 32         // next pair of workspace rows
+    addiu         a1, a1, 8          // next pair of output row pointers
+    shll_s.ph     s6, v0, 1          // x4
+    shll_s.ph     s4, s4, 1          // x2
+    addq.ph       s6, s6, v1         // ... tmp12
+    shll_s.ph     s0, s0, 2
+    subq.ph       t5, s6, t7         // tmp6
+    subq.ph       s4, s4, v1         // ... tmp10
+    subq.ph       t3, s5, t5         // tmp5
+    shll_s.ph     s7, s7, 2
+    addq.ph       t1, s4, t3         // tmp4
+    addq.ph       s1, t2, t5         // tmp1 + tmp6
+    subq.ph       s6, t2, t5         // tmp1 - tmp6
+    addq.ph       s2, t4, t3         // tmp2 + tmp5
+    subq.ph       s5, t4, t3         // tmp2 - tmp5
+    addq.ph       s4, t6, t1         // tmp3 + tmp4
+    subq.ph       s3, t6, t1         // tmp3 - tmp4
+    shll_s.ph     s1, s1, 2
+    shll_s.ph     s2, s2, 2
+    shll_s.ph     s3, s3, 2
+    shll_s.ph     s4, s4, 2
+    shll_s.ph     s5, s5, 2
+    shll_s.ph     s6, s6, 2
+    // Regroup back to per-row order and narrow each halfword to one byte.
+    precrq.ph.w   t0, s1, s0         // B A
+    ins           s0, s1, 16, 16     // b a
+    precrq.ph.w   t2, s3, s2         // D C
+    ins           s2, s3, 16, 16     // d c
+    precrq.ph.w   t4, s5, s4         // F E
+    ins           s4, s5, 16, 16     // f e
+    precrq.ph.w   t6, s7, s6         // H G
+    ins           s6, s7, 16, 16     // h g
+    precrq.qb.ph  t0, t2, t0         // D C B A
+    precrq.qb.ph  s0, s2, s0         // d c b a
+    precrq.qb.ph  t4, t6, t4         // H G F E
+    precrq.qb.ph  s4, s6, s4         // h g f e
+    addu.qb       s0, s0, s8
+    addu.qb       s4, s4, s8
+    sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
+    sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
+    lw            a3, -4(a1)         // second output row pointer (a1 already += 8)
+    addu.qb       t0, t0, s8
+    addu          a3, a3, a2         // + output_col
+    addu.qb       t4, t4, s8
+    sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
+    bne           a0, t9, 0b
+     sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E
+
+2:
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_rows_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
+/*
+ * a0     - data: 64 halfwords, rewritten in place (8 row steps, then 8 column steps)
+ */
+
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lui       t0, 6437      // t0 =   6437 |   2260  (hi|lo multiplier pair for dpa.w.ph)
+    ori       t0, 2260
+    lui       t1, 9633      // t1 =   9633 |  11363
+    ori       t1, 11363
+    lui       t2, 0xd39e    // t2 = -11362 |  -6436
+    ori       t2, 0xe6dc
+    lui       t3, 0xf72d    // t3 =  -2259 |   9633
+    ori       t3, 9633
+    lui       t4, 2261      // t4 =   2261 |   9633
+    ori       t4, 9633
+    lui       t5, 0xd39e    // t5 = -11362 |   6437
+    ori       t5, 6437
+    lui       t6, 9633      // t6 =   9633 | -11363
+    ori       t6, 0xd39d
+    lui       t7, 0xe6dc    // t7 =  -6436 |   2260
+    ori       t7, 2260
+    lui       t8, 4433      // t8 =   4433 |  10703
+    ori       t8, 10703
+    lui       t9, 0xd630    // t9 = -10704 |   4433
+    ori       t9, 4433
+    li        s8, 8         // s8 = row-pass iteration counter
+    move      a1, a0        // a1 = pointer to the current row (8 halfwords)
+1:
+    lw        s0, 0(a1)     // tmp0 = 1|0
+    lw        s1, 4(a1)     // tmp1 = 3|2
+    lw        s2, 8(a1)     // tmp2 = 5|4
+    lw        s3, 12(a1)    // tmp3 = 7|6
+    packrl.ph s1, s1, s1    // tmp1 = 2|3
+    packrl.ph s3, s3, s3    // tmp3 = 6|7
+    subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
+    subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
+    mult      $0, $0        // ac0  = 0
+    dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
+    dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
+    mult      $ac1, $0, $0  // ac1  = 0
+    dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
+    dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
+    mult      $ac2, $0, $0  // ac2  = 0
+    dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
+    dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
+    mult      $ac3, $0, $0  // ac3  = 0
+    dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
+    dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
+    addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
+    addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
+    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
+    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
+    extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
+    extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
+    addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
+    subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
+    sh        s0, 2(a1)     // row[1]
+    sh        s1, 6(a1)     // row[3]
+    sh        s2, 10(a1)    // row[5]
+    sh        s3, 14(a1)    // row[7]
+    mult      $0, $0        // ac0  = 0
+    dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
+    mult      $ac1, $0, $0  // ac1  = 0
+    dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
+    sra       s4, s5, 16    // tmp4 = t11
+    addiu     a1, a1, 16    // step to the next row; stores below use negative offsets
+    addiu     s8, s8, -1
+    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
+    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
+    addu      s2, s5, s4    // tmp2 = t10 + t11 (low halfword; only that half is stored)
+    subu      s3, s5, s4    // tmp3 = t10 - t11 (low halfword; only that half is stored)
+    sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
+    sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
+    sh        s2, -16(a1)   // row[0]
+    sh        s3, -8(a1)    // row[4]
+    sh        s0, -12(a1)   // row[2]
+    bgtz      s8, 1b
+     sh       s1, -4(a1)    // row[6]; indented as the branch's delay slot (assumes noreorder - TODO confirm)
+    li        t0, 2260      // c0
+    li        t1, 11363     // c1
+    li        t2, 9633      // c2
+    li        t3, 6436      // c3
+    li        t4, 6437      // c4
+    li        t5, 2261      // c5
+    li        t6, 11362     // c6
+    li        t7, 2259      // c7
+    li        t8, 4433      // c8
+    li        t9, 10703     // c9
+    li        a1, 10704     // c10
+    li        s8, 8         // s8 = column-pass iteration counter
+
+2:
+    lh        a2, 0(a0)     // 0
+    lh        a3, 16(a0)    // 8
+    lh        v0, 32(a0)    // 16
+    lh        v1, 48(a0)    // 24
+    lh        s4, 64(a0)    // 32
+    lh        s5, 80(a0)    // 40
+    lh        s6, 96(a0)    // 48
+    lh        s7, 112(a0)   // 56
+    addu      s2, v0, s5    // tmp2 = 16 + 40
+    subu      s5, v0, s5    // tmp5 = 16 - 40
+    addu      s3, v1, s4    // tmp3 = 24 + 32
+    subu      s4, v1, s4    // tmp4 = 24 - 32
+    addu      s0, a2, s7    // tmp0 =  0 + 56
+    subu      s7, a2, s7    // tmp7 =  0 - 56
+    addu      s1, a3, s6    // tmp1 =  8 + 48
+    subu      s6, a3, s6    // tmp6 =  8 - 48
+    addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
+    subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
+    addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
+    subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
+    mult      s7, t1        // ac0  = tmp7 * c1
+    madd      s4, t0        // ac0 += tmp4 * c0
+    madd      s5, t4        // ac0 += tmp5 * c4
+    madd      s6, t2        // ac0 += tmp6 * c2
+    mult      $ac1, s7, t2  // ac1  = tmp7 * c2
+    msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
+    msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
+    msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
+    mult      $ac2, s7, t4  // ac2  = tmp7 * c4
+    madd      $ac2, s4, t2  // ac2 += tmp4 * c2
+    madd      $ac2, s5, t5  // ac2 += tmp5 * c5
+    msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
+    mult      $ac3, s7, t0  // ac3  = tmp7 * c0
+    msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
+    madd      $ac3, s5, t2  // ac3 += tmp5 * c2
+    msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
+    extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
+    extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
+    extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
+    extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
+    addiu     s8, s8, -1
+    addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
+    subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
+    sh        s0, 16(a0)    // slot 8
+    sh        s1, 48(a0)    // slot 24
+    sh        s2, 80(a0)    // slot 40
+    sh        s3, 112(a0)   // slot 56
+    mult      v0, t8        // ac0  = tmp12 * c8
+    madd      v1, t9        // ac0 += tmp13 * c9
+    mult      $ac1, v1, t8  // ac1  = tmp13 * c8
+    msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
+    addiu     a0, a0, 2     // step to the next column; stores below are offset by -2
+    extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
+    extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
+    shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
+    shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
+    sh        s4, -2(a0)    // slot 0
+    sh        s5, 62(a0)    // slot 32
+    sh        s6, 30(a0)    // slot 16
+    bgtz      s8, 2b
+     sh       s7, 94(a0)    // slot 48; indented as the branch's delay slot (assumes noreorder - TODO confirm)
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    jr       ra             // return to caller
+     nop
+
+END(jsimd_fdct_islow_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
+/*
+ * a0     - data: 64 halfwords, rewritten in place (8 row steps, then 8 column steps)
+ */
+    .set at
+    SAVE_REGS_ON_STACK 8, s0, s1
+    li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
+    li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
+    li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
+    li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
+
+    move         v0, a0          // v0 = pointer to the current row
+    addiu        v1, v0, 128     // end address
+
+0:
+    lw           t0, 0(v0)       // t0 = 1|0
+    lw           t1, 4(v0)       // t1 = 3|2
+    lw           t2, 8(v0)       // t2 = 5|4
+    lw           t3, 12(v0)      // t3 = 7|6
+    packrl.ph    t1, t1, t1      // t1 = 2|3
+    packrl.ph    t3, t3, t3      // t3 = 6|7
+    subq.ph      t7, t1, t2      // t7 = 2-5|3-4 = tmp5|tmp4
+    subq.ph      t5, t0, t3      // t5 = 1-6|0-7 = tmp6|tmp7
+    addq.ph      t6, t1, t2      // t6 = 2+5|3+4 = tmp2|tmp3
+    addq.ph      t4, t0, t3      // t4 = 1+6|0+7 = tmp1|tmp0
+    addq.ph      t8, t4, t6      // t8 = tmp1+tmp2|tmp0+tmp3 = tmp11|tmp10
+    subq.ph      t9, t4, t6      // t9 = tmp1-tmp2|tmp0-tmp3 = tmp12|tmp13
+    sra          t4, t8, 16      // t4 = tmp11
+    mult         $0, $0          // ac0  = 0
+    dpa.w.ph     $ac0, t9, s1    // ac0 += tmp12*181 + tmp13*181
+    mult         $ac1, $0, $0    // ac1  = 0
+    dpa.w.ph     $ac1, t7, a3    // ac1 += tmp5*98 + tmp4*98
+    dpsx.w.ph    $ac1, t5, a3    // ac1 -= tmp6*98 + tmp7*98
+    mult         $ac2, $0, $0    // ac2  = 0
+    dpa.w.ph     $ac2, t7, a2    // ac2 += tmp5*139 + tmp4*139
+    mult         $ac3, $0, $0    // ac3  = 0
+    dpa.w.ph     $ac3, t5, a1    // ac3 += tmp6*334 + tmp7*334
+    precrq.ph.w  t0, t5, t7      // t0 = tmp6|tmp5 (upper halves of t5 and t7)
+    addq.ph      t2, t8, t4      // t2 (low half) = tmp10 + tmp11
+    subq.ph      t3, t8, t4      // t3 (low half) = tmp10 - tmp11
+    extr.w       t4, $ac0, 8     // t4 = z1 = ((tmp12 + tmp13) * 181) >> 8
+    mult         $0, $0          // ac0  = 0
+    dpa.w.ph     $ac0, t0, s1    // ac0 += tmp6*181 + tmp5*181
+    extr.w       t0, $ac1, 8     // t0 = z5
+    extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp4 + tmp5, 139)
+    extr.w       t7, $ac3, 8     // t7 = MULTIPLY(tmp6 + tmp7, 334)
+    extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp5 + tmp6, 181)
+    add          t6, t1, t0      // t6 = z2
+    add          t7, t7, t0      // t7 = z4
+    subq.ph      t0, t5, t8      // t0 (low half) = z13 = tmp7 - z3
+    addq.ph      t8, t5, t8      // t8 (low half) = z11 = tmp7 + z3
+    addq.ph      t1, t0, t6      // t1 = z13 + z2
+    subq.ph      t6, t0, t6      // t6 = z13 - z2
+    addq.ph      t0, t8, t7      // t0 = z11 + z4
+    subq.ph      t7, t8, t7      // t7 = z11 - z4
+    addq.ph      t5, t4, t9      // t5 (low half) = tmp13 + z1
+    subq.ph      t4, t9, t4      // t4 (low half) = tmp13 - z1
+    sh           t2, 0(v0)       // row[0]
+    sh           t5, 4(v0)       // row[2]
+    sh           t3, 8(v0)       // row[4]
+    sh           t4, 12(v0)      // row[6]
+    sh           t1, 10(v0)      // row[5]
+    sh           t6, 6(v0)       // row[3]
+    sh           t0, 2(v0)       // row[1]
+    sh           t7, 14(v0)      // row[7]
+    addiu        v0, 16          // step to the next row
+    bne          v1, v0, 0b
+     nop
+    move         v0, a0          // v0 = pointer to the current column
+    addiu        v1, v0, 16      // end address for the column pass (8 columns * 2 bytes)
+
+1:
+    lh           t0, 0(v0)       // 0
+    lh           t1, 16(v0)      // 8
+    lh           t2, 32(v0)      // 16
+    lh           t3, 48(v0)      // 24
+    lh           t4, 64(v0)      // 32
+    lh           t5, 80(v0)      // 40
+    lh           t6, 96(v0)      // 48
+    lh           t7, 112(v0)     // 56
+    add          t8, t0, t7      // t8 = tmp0
+    sub          t7, t0, t7      // t7 = tmp7
+    add          t0, t1, t6      // t0 = tmp1
+    sub          t1, t1, t6      // t1 = tmp6
+    add          t6, t2, t5      // t6 = tmp2
+    sub          t5, t2, t5      // t5 = tmp5
+    add          t2, t3, t4      // t2 = tmp3
+    sub          t3, t3, t4      // t3 = tmp4
+    add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
+    sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
+    sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
+    ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
+    add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
+    mult         $0, $0          // ac0  = 0
+    dpa.w.ph     $ac0, t8, s1    // ac0 += tmp12*181 + tmp13*181
+    add          s0, t4, t2      // s0 = tmp10+tmp11
+    sub          t4, t4, t2      // t4 = tmp10-tmp11
+    sh           s0, 0(v0)       // slot 0
+    sh           t4, 64(v0)      // slot 32
+    extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
+    addq.ph      t4, t8, t2      // t4 (low half) = tmp13 + z1
+    subq.ph      t8, t8, t2      // t8 (low half) = tmp13 - z1
+    sh           t4, 32(v0)      // slot 16
+    sh           t8, 96(v0)      // slot 48
+    add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
+    add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
+    add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
+    andi         t4, a1, 0xffff  // t4 = 334 (low half of the packed constant)
+    mul          s0, t1, t4      // s0 = tmp12 * 334
+    sra          s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
+    ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
+    mult         $0, $0          // ac0  = 0
+    mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
+    extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
+    add          t2, t7, t8      // t2 = tmp7 + z5
+    sub          t7, t7, t8      // t7 = tmp7 - z5
+    andi         t4, a2, 0xffff  // t4 = 139 (low half of the packed constant)
+    mul          t8, t3, t4      // t8 = tmp10 * 139
+    sra          t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
+    andi         t4, s1, 0xffff  // t4 = 181 (low half of the packed constant)
+    mul          t6, t0, t4      // t6 = tmp11 * 181
+    sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
+    add          t0, t6, t8      // t0 = z3 + z2
+    sub          t1, t6, t8      // t1 = z3 - z2
+    add          t3, t6, s0      // t3 = z3 + z4
+    sub          t4, t6, s0      // t4 = z3 - z4
+    sub          t5, t2, t1      // t5 = dataptr[5]
+    sub          t6, t7, t0      // t6 = dataptr[3]
+    add          t3, t2, t3      // t3 = dataptr[1]
+    add          t4, t7, t4      // t4 = dataptr[7]
+    sh           t5, 80(v0)      // slot 40
+    sh           t6, 48(v0)      // slot 24
+    sh           t3, 16(v0)      // slot 8
+    sh           t4, 112(v0)     // slot 56
+    addiu        v0, 2           // step to the next column
+    bne          v0, v1, 1b
+     nop
+
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j            ra              // return to caller
+     nop
+END(jsimd_fdct_ifast_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
+/*
+ * a0     - coef_block (output halfwords, two written per loop step)
+ * a1     - divisors (per element: multiplier at +0, addend at +128, shift count at +384 bytes)
+ * a2     - workspace (input halfwords; 128 bytes are consumed)
+ */
+
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+    addiu   v0, a2, 124  // v0 = workspace_end
+    lh      t0, 0(a2)    // t0 = first input of the pair
+    lh      t1, 0(a1)    // t1 = its multiplier
+    lh      t2, 128(a1)  // t2 = its addend
+    sra     t3, t0, 15   // t3 = 0 or -1 depending on the sign of t0
+    sll     t3, t3, 1
+    addiu   t3, t3, 1    // t3 = +1 or -1
+    mul     t0, t0, t3   // t0 = absolute value of the first input
+    lh      t4, 384(a1)  // t4 = shift count for the first input (16 is added before use)
+    lh      t5, 130(a1)  // t5 = addend for the second input
+    lh      t6, 2(a2)    // t6 = second input of the pair
+    lh      t7, 2(a1)    // t7 = multiplier for the second input
+    lh      t8, 386(a1)  // t8 = shift count for the second input
+
+1:
+    andi    t1, 0xffff   // use the multiplier as an unsigned 16-bit value
+    add     t9, t0, t2   // |first| + addend
+    andi    t9, 0xffff
+    mul     v1, t9, t1   // v1 = (|first| + addend) * multiplier
+    sra     s0, t6, 15   // s0 = +1 or -1: sign of the second input (next two lines)
+    sll     s0, s0, 1
+    addiu   s0, s0, 1
+    addiu   t9, t4, 16   // total shift = shift count + 16
+    srav    v1, v1, t9
+    mul     v1, v1, t3   // re-apply the sign of the first input
+    mul     t6, t6, s0   // t6 = absolute value of the second input
+    andi    t7, 0xffff
+    addiu   a2, a2, 4    // advance workspace by two halfwords
+    addiu   a1, a1, 4    // advance divisors by two halfwords
+    add     s1, t6, t5   // |second| + addend
+    andi    s1, 0xffff
+    sh      v1, 0(a0)    // store first result of the pair
+
+    mul     s2, s1, t7   // s2 = (|second| + addend) * multiplier
+    addiu   s1, t8, 16   // total shift = shift count + 16
+    srav    s2, s2, s1
+    mul     s2,s2, s0    // re-apply the sign of the second input
+    lh      t0, 0(a2)    // NOTE(review): t0/t3 from here to the mul are recomputed after the sh below and look redundant - confirm
+    lh      t1, 0(a1)    // t1 = multiplier for the next pair's first input
+    sra     t3, t0, 15
+    sll     t3, t3, 1
+    addiu   t3, t3, 1
+    mul     t0, t0, t3
+    lh      t2, 128(a1)  // preload addend / shift / second-input operands for the next pair
+    lh      t4, 384(a1)
+    lh      t5, 130(a1)
+    lh      t8, 386(a1)
+    lh      t6, 2(a2)
+    lh      t7, 2(a1)
+    sh      s2, 2(a0)    // store second result of the pair
+    lh      t0, 0(a2)    // t0 = next pair's first input (these are the values the next step uses)
+    sra     t3, t0, 15
+    sll     t3, t3, 1
+    addiu   t3, t3, 1    // t3 = +1 or -1
+    mul     t0, t0,t3    // t0 = absolute value
+    bne     a2, v0, 1b   // loop until a2 reaches workspace + 124
+     addiu  a0, a0, 4    // advance coef_block by two halfwords (indented as the delay slot)
+
+    andi    t1, 0xffff   // final pair: same arithmetic as the loop body, without preloading
+    add     t9, t0, t2
+    andi    t9, 0xffff
+    mul     v1, t9, t1
+    sra     s0, t6, 15
+    sll     s0, s0, 1
+    addiu   s0, s0, 1
+    addiu   t9, t4, 16
+    srav    v1, v1, t9
+    mul     v1, v1, t3
+    mul     t6, t6, s0
+    andi    t7, 0xffff
+    sh      v1, 0(a0)    // store first result of the final pair
+    add     s1, t6, t5
+    andi    s1, 0xffff
+    mul     s2, s1, t7
+    addiu   s1, t8, 16
+    addiu   a2, a2, 4    // NOTE(review): a2/a1 are not read again before return - these two look dead
+    addiu   a1, a1, 4
+    srav    s2, s2, s1
+    mul     s2, s2, s0
+    sh      s2, 2(a0)    // store second result of the final pair
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+    j       ra           // return to caller
+     nop
+
+END(jsimd_quantize_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
+/*
+ * a0     - coef_block (output: 64 halfwords, 8 per loop step)
+ * a1     - divisors (64 single-precision floats; each input is multiplied by its entry)
+ * a2     - workspace (input: 64 single-precision floats)
+ */
+
+    .set at
+
+    li         t1, 0x46800100     //integer representation 16384.5
+    mtc1       t1, f0             // f0 = 16384.5f, added to every product before truncation
+    li         t0, 63             // counter: decremented by 8 per step, loop ends when negative
+0:
+    lwc1       f2, 0(a2)          // inputs 0..3 in f2/f4/f6/f8, their divisors in f10/f12/f14/f16
+    lwc1       f10, 0(a1)
+    lwc1       f4, 4(a2)
+    lwc1       f12, 4(a1)
+    lwc1       f6, 8(a2)
+    lwc1       f14, 8(a1)
+    lwc1       f8, 12(a2)
+    lwc1       f16, 12(a1)
+    madd.s     f2, f0, f2, f10    // f2 = f2 * f10 + 16384.5
+    madd.s     f4, f0, f4, f12
+    madd.s     f6, f0, f6, f14
+    madd.s     f8, f0, f8, f16
+    lwc1       f10, 16(a1)        // start loading divisors 4..7 while 0..3 are converted
+    lwc1       f12, 20(a1)
+    trunc.w.s  f2, f2             // convert to integer, rounding toward zero
+    trunc.w.s  f4, f4
+    trunc.w.s  f6, f6
+    trunc.w.s  f8, f8
+    lwc1       f14, 24(a1)
+    lwc1       f16, 28(a1)
+    mfc1       t1, f2             // move the four integers to t1..t4
+    mfc1       t2, f4
+    mfc1       t3, f6
+    mfc1       t4, f8
+    lwc1       f2, 16(a2)         // inputs 4..7
+    lwc1       f4, 20(a2)
+    lwc1       f6, 24(a2)
+    lwc1       f8, 28(a2)
+    madd.s     f2, f0, f2, f10    // f2 = f2 * f10 + 16384.5
+    madd.s     f4, f0, f4, f12
+    madd.s     f6, f0, f6, f14
+    madd.s     f8, f0, f8, f16
+    addiu      t1, t1, -16384     // remove the 16384 bias; the remaining 0.5 made the truncation round
+    addiu      t2, t2, -16384
+    addiu      t3, t3, -16384
+    addiu      t4, t4, -16384
+    trunc.w.s  f2, f2
+    trunc.w.s  f4, f4
+    trunc.w.s  f6, f6
+    trunc.w.s  f8, f8
+    sh         t1, 0(a0)          // store results 0..3
+    sh         t2, 2(a0)
+    sh         t3, 4(a0)
+    sh         t4, 6(a0)
+    mfc1       t1, f2
+    mfc1       t2, f4
+    mfc1       t3, f6
+    mfc1       t4, f8
+    addiu      t0, t0, -8
+    addiu      a2, a2, 32         // advance workspace by 8 floats
+    addiu      a1, a1, 32         // advance divisors by 8 floats
+    addiu      t1, t1, -16384     // remove the 16384 bias
+    addiu      t2, t2, -16384
+    addiu      t3, t3, -16384
+    addiu      t4, t4, -16384
+    sh         t1, 8(a0)          // store results 4..7
+    sh         t2, 10(a0)
+    sh         t3, 12(a0)
+    sh         t4, 14(a0)
+    bgez       t0, 0b
+     addiu     a0, a0, 16         // advance coef_block by 8 halfwords (indented as the delay slot)
+
+    j          ra                 // return to caller
+     nop
+
+END(jsimd_quantize_float_mips_dspr2)
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
+/*
+ * a0     - compptr->dct_table (halfword multipliers, read at the same offsets as coef_block)
+ * a1     - coef_block (input halfwords; columns at byte offsets 0, 2, 6, 10, 14 are read)
+ * a2     - output_buf (two row pointers are read, at 0(a2) and 4(a2))
+ * a3     - output_col (byte offset added to each row pointer; 2 bytes are written per row)
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    addiu     sp, sp, -40       // reserve a 40-byte scratch area (10 words)
+    move      v0, sp            // v0 = scratch base: words 0..4 and 5..9 hold the two pass-1 results per column
+    addiu     s2, zero, 29692   // scalar copies of the multipliers packed into s0/s1 below
+    addiu     s3, zero, -10426
+    addiu     s4, zero, 6967
+    addiu     s5, zero, -5906
+    lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
+    lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
+    lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
+    lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
+    mul       t4, t5, t0        // t4 = dequantized [DCTSIZE*0]
+    lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
+    lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
+    mul       t6, t6, t1        // t6 = dequantized [DCTSIZE*3]
+    mul       t5, t5, t0        // t5 = dequantized [DCTSIZE*1]
+    lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
+    lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
+    lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
+    lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
+    mul       t7, t7, t2        // t7 = dequantized [DCTSIZE*5]
+    mult      zero, zero        // ac0 = 0
+    mul       t8, t8, t3        // t8 = dequantized [DCTSIZE*7]; NOTE(review): confirm mul leaves HI/LO (ac0) intact on target cores
+    li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
+    li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
+    ins       t6, t5, 16, 16    // t6 = t5|t6
+    sll       t4, t4, 15        // t4 = [DCTSIZE*0] term << 15
+    dpa.w.ph  $ac0, t6, s0      // ac0 += [1]*29692 + [3]*-10426
+    lh        t1, 2(a1)         // begin column at byte offset 2: input and multiplier for row 0
+    lh        t6, 2(a0)
+    ins       t8, t7, 16, 16    // t8 = t7|t8
+    dpa.w.ph  $ac0, t8, s1      // ac0 += [5]*6967 + [7]*-5906
+    mflo      t0, $ac0          // t0 = odd-row sum for column 0
+    mul       t5, t6, t1        // t5 = dequantized row-0 term of the next column
+    lh        t1, 18(a1)
+    lh        t6, 18(a0)
+    lh        t2, 50(a1)
+    lh        t7, 50(a0)
+    mul       t6, t6, t1        // t6 = dequantized [DCTSIZE*1]
+    subu      t8, t4, t0        // column 0: row-0 term - odd sum
+    mul       t7, t7, t2        // t7 = dequantized [DCTSIZE*3]
+    addu      t0, t4, t0        // column 0: row-0 term + odd sum
+    shra_r.w  t0, t0, 13        // rounded >> 13
+    lh        t1, 82(a1)
+    lh        t2, 82(a0)
+    lh        t3, 114(a1)
+    lh        t4, 114(a0)
+    shra_r.w  t8, t8, 13        // rounded >> 13
+    mul       t1, t1, t2        // t1 = dequantized [DCTSIZE*5]
+    mul       t3, t3, t4        // t3 = dequantized [DCTSIZE*7]
+    sw        t0, 0(v0)         // scratch word 0
+    sw        t8, 20(v0)        // scratch word 5
+    sll       t4, t5, 15        // t4 = row-0 term << 15
+    ins       t7, t6, 16, 16    // t7 = [1]|[3]
+    mult      zero, zero        // ac0 = 0
+    dpa.w.ph  $ac0, t7, s0      // ac0 += [1]*29692 + [3]*-10426
+    ins       t3, t1, 16, 16    // t3 = [5]|[7]
+    lh        t1, 6(a1)         // begin column at byte offset 6
+    lh        t6, 6(a0)
+    dpa.w.ph  $ac0, t3, s1      // ac0 += [5]*6967 + [7]*-5906
+    mflo      t0, $ac0          // t0 = odd-row sum for the column at offset 2
+    mul       t5, t6, t1        // t5 = dequantized row-0 term of the next column
+    lh        t1, 22(a1)
+    lh        t6, 22(a0)
+    lh        t2, 54(a1)
+    lh        t7, 54(a0)
+    mul       t6, t6, t1        // t6 = dequantized [DCTSIZE*1]
+    subu      t8, t4, t0
+    mul       t7, t7, t2        // t7 = dequantized [DCTSIZE*3]
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13        // rounded >> 13
+    lh        t1, 86(a1)
+    lh        t2, 86(a0)
+    lh        t3, 118(a1)
+    lh        t4, 118(a0)
+    shra_r.w  t8, t8, 13        // rounded >> 13
+    mul       t1, t1, t2        // t1 = dequantized [DCTSIZE*5]
+    mul       t3, t3, t4        // t3 = dequantized [DCTSIZE*7]
+    sw        t0, 4(v0)         // scratch word 1
+    sw        t8, 24(v0)        // scratch word 6
+    sll       t4, t5, 15        // t4 = row-0 term << 15
+    ins       t7, t6, 16, 16    // t7 = [1]|[3]
+    mult      zero, zero        // ac0 = 0
+    dpa.w.ph  $ac0, t7, s0
+    ins       t3, t1, 16, 16    // t3 = [5]|[7]
+    lh        t1, 10(a1)        // begin column at byte offset 10
+    lh        t6, 10(a0)
+    dpa.w.ph  $ac0, t3, s1
+    mflo      t0, $ac0          // t0 = odd-row sum for the column at offset 6
+    mul       t5, t6, t1        // t5 = dequantized row-0 term of the next column
+    lh        t1, 26(a1)
+    lh        t6, 26(a0)
+    lh        t2, 58(a1)
+    lh        t7, 58(a0)
+    mul       t6, t6, t1        // t6 = dequantized [DCTSIZE*1]
+    subu      t8, t4, t0
+    mul       t7, t7, t2        // t7 = dequantized [DCTSIZE*3]
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13        // rounded >> 13
+    lh        t1, 90(a1)
+    lh        t2, 90(a0)
+    lh        t3, 122(a1)
+    lh        t4, 122(a0)
+    shra_r.w  t8, t8, 13        // rounded >> 13
+    mul       t1, t1, t2        // t1 = dequantized [DCTSIZE*5]
+    mul       t3, t3, t4        // t3 = dequantized [DCTSIZE*7]
+    sw        t0, 8(v0)         // scratch word 2
+    sw        t8, 28(v0)        // scratch word 7
+    sll       t4, t5, 15        // t4 = row-0 term << 15
+    ins       t7, t6, 16, 16    // t7 = [1]|[3]
+    mult      zero, zero        // ac0 = 0
+    dpa.w.ph  $ac0, t7, s0
+    ins       t3, t1, 16, 16    // t3 = [5]|[7]
+    lh        t1, 14(a1)        // begin column at byte offset 14
+    lh        t6, 14(a0)
+    dpa.w.ph  $ac0, t3, s1
+    mflo      t0, $ac0          // t0 = odd-row sum for the column at offset 10
+    mul       t5, t6, t1        // t5 = dequantized row-0 term of the last column
+    lh        t1, 30(a1)
+    lh        t6, 30(a0)
+    lh        t2, 62(a1)
+    lh        t7, 62(a0)
+    mul       t6, t6, t1        // t6 = dequantized [DCTSIZE*1]
+    subu      t8, t4, t0
+    mul       t7, t7, t2        // t7 = dequantized [DCTSIZE*3]
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13        // rounded >> 13
+    lh        t1, 94(a1)
+    lh        t2, 94(a0)
+    lh        t3, 126(a1)
+    lh        t4, 126(a0)
+    shra_r.w  t8, t8, 13        // rounded >> 13
+    mul       t1, t1, t2        // t1 = dequantized [DCTSIZE*5]
+    mul       t3, t3, t4        // t3 = dequantized [DCTSIZE*7]
+    sw        t0, 12(v0)        // scratch word 3
+    sw        t8, 32(v0)        // scratch word 8
+    sll       t4, t5, 15        // t4 = row-0 term << 15
+    ins       t7, t6, 16, 16    // t7 = [1]|[3]
+    mult      zero, zero        // ac0 = 0
+    dpa.w.ph  $ac0, t7, s0
+    ins       t3, t1, 16, 16    // t3 = [5]|[7]
+    dpa.w.ph  $ac0, t3, s1
+    mflo      t0, $ac0          // t0 = odd-row sum for the column at offset 14
+    lw        t9, 0(a2)         // t9 = output_buf[0]
+    lw        t3, 0(v0)         // second pass, first output row: scratch words 0..2
+    lw        t7, 4(v0)
+    lw        t1, 8(v0)
+    addu      t9, t9, a3        // t9 = output_buf[0] + output_col
+    sll       t3, t3, 15        // t3 = scratch word 0 << 15
+    subu      t8, t4, t0        // finish the column at offset 14
+    addu      t0, t4, t0
+    shra_r.w  t0, t0, 13        // rounded >> 13
+    shra_r.w  t8, t8, 13        // rounded >> 13
+    sw        t0, 16(v0)        // scratch word 4
+    sw        t8, 36(v0)        // scratch word 9
+    lw        t5, 12(v0)        // scratch words 3 and 4
+    lw        t6, 16(v0)
+    mult      t7, s2            // ac0  = word1 * 29692
+    madd      t1, s3            // ac0 += word2 * -10426
+    madd      t5, s4            // ac0 += word3 * 6967
+    madd      t6, s5            // ac0 += word4 * -5906
+    lw        t5, 24(v0)        // second output row: scratch words 6..9
+    lw        t7, 28(v0)
+    mflo      t0, $ac0          // t0 = weighted sum for the first output row
+    lw        t8, 32(v0)
+    lw        t2, 36(v0)
+    mult      $ac1, t5, s2      // ac1  = word6 * 29692
+    madd      $ac1, t7, s3      // ac1 += word7 * -10426
+    madd      $ac1, t8, s4      // ac1 += word8 * 6967
+    madd      $ac1, t2, s5      // ac1 += word9 * -5906
+    addu      t1, t3, t0        // first row, sample 0 = (word0 << 15) + sum
+    subu      t6, t3, t0        // first row, sample 1 = (word0 << 15) - sum
+    shra_r.w  t1, t1, 20        // rounded >> 20
+    shra_r.w  t6, t6, 20
+    mflo      t4, $ac1          // t4 = weighted sum for the second output row
+    shll_s.w  t1, t1, 24        // saturating << 24 then >> 24 below: clamp to signed 8-bit
+    shll_s.w  t6, t6, 24
+    sra       t1, t1, 24
+    sra       t6, t6, 24
+    addiu     t1, t1, 128       // bias into the 0..255 range
+    addiu     t6, t6, 128
+    lw        t0, 20(v0)        // scratch word 5
+    sb        t1, 0(t9)         // first row, byte 0
+    sb        t6, 1(t9)         // first row, byte 1
+    sll       t0, t0, 15        // t0 = scratch word 5 << 15
+    lw        t9, 4(a2)         // t9 = output_buf[1]
+    addu      t1, t0, t4        // second row, sample 0
+    subu      t6, t0, t4        // second row, sample 1
+    addu      t9, t9, a3        // t9 = output_buf[1] + output_col
+    shra_r.w  t1, t1, 20        // rounded >> 20
+    shra_r.w  t6, t6, 20
+    shll_s.w  t1, t1, 24        // clamp to signed 8-bit (with the sra below)
+    shll_s.w  t6, t6, 24
+    sra       t1, t1, 24
+    sra       t6, t6, 24
+    addiu     t1, t1, 128       // bias into the 0..255 range
+    addiu     t6, t6, 128
+    sb        t1, 0(t9)         // second row, byte 0
+    sb        t6, 1(t9)         // second row, byte 1
+    addiu     sp, sp, 40        // release the scratch area
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j         ra                // return to caller
+     nop
+
+END(jsimd_idct_2x2_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
+/*
+ * a0     - compptr->dct_table
+ * a1     - coef_block
+ * a2     - output_buf
+ * a3     - output_col
+ * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
+ */
+
+    .set at
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw        v1, 48(sp)
+    move      t0, a1
+    move      t1, v1
+    li        t9, 4
+    li        s0, 0x2e75f93e
+    li        s1, 0x21f9ba79
+    li        s2, 0xecc2efb0
+    li        s3, 0x52031ccd
+
+0:
+    lh        s6, 32(t0)        // inptr[DCTSIZE*2]
+    lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
+    lh        s7, 96(t0)        // inptr[DCTSIZE*6]
+    lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
+    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        s4, 0(t0)         // inptr[DCTSIZE*0]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s5, 0(a0)         // quantptr[0]
+    li        s6, 15137
+    li        s7, 6270
+    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
+    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        t5, 112(t0)       // inptr[DCTSIZE*7]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
+    lh        v0, 80(t0)        // inptr[DCTSIZE*5]
+    lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
+    lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
+    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
+    lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
+    lh        t8, 16(t0)        // inptr[DCTSIZE*1]
+    subu      t6, t6, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
+    lh        t7, 48(t0)        // inptr[DCTSIZE*3]
+    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+    mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+    addu      t3, t2, t6        // tmp10 = tmp0 + z2
+    subu      t4, t2, t6        // tmp10 = tmp0 - z2
+    mult      $ac0, zero, zero
+    mult      $ac1, zero, zero
+    ins       t5, v0, 16, 16
+    ins       t7, t8, 16, 16
+    addiu     t9, t9, -1
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    mflo      s4, $ac0
+    mflo      s5, $ac1
+    addiu     a0, a0, 2
+    addiu     t1, t1, 4
+    addiu     t0, t0, 2
+    addu      t6, t4, s4
+    subu      t5, t4, s4
+    addu      s6, t3, s5
+    subu      s7, t3, s5
+    shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
+    shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
+    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
+    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
+    sw        t6, 28(t1)
+    sw        t5, 60(t1)
+    sw        s6, -4(t1)
+    bgtz      t9, 0b
+     sw       s7, 92(t1)
+    // second loop three pass
+    li        t9, 3
+1:
+    lh        s6, 34(t0)        // inptr[DCTSIZE*2]
+    lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
+    lh        s7, 98(t0)        // inptr[DCTSIZE*6]
+    lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
+    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        s4, 2(t0)         // inptr[DCTSIZE*0]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
+    li        s6, 15137
+    li        s7, 6270
+    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
+    mul       v0, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh        t5, 114(t0)       // inptr[DCTSIZE*7]
+    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
+    lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
+    lh        t6, 82(t0)        // inptr[DCTSIZE*5]
+    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
+    lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
+    lh        t8, 18(t0)        // inptr[DCTSIZE*1]
+    subu      v0, v0, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
+    lh        t7, 50(t0)        // inptr[DCTSIZE*3]
+    lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
+    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+    mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+    addu      t3, t2, v0        // tmp10 = tmp0 + z2
+    subu      t4, t2, v0        // tmp10 = tmp0 - z2
+    mult      $ac0, zero, zero
+    mult      $ac1, zero, zero
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    mflo      t5, $ac0
+    mflo      t6, $ac1
+    addiu     t9, t9, -1
+    addiu     t0, t0, 2
+    addiu     a0, a0, 2
+    addiu     t1, t1, 4
+    addu      s5, t4, t5
+    subu      s4, t4, t5
+    addu      s6, t3, t6
+    subu      s7, t3, t6
+    shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
+    shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
+    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
+    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
+    sw        s5, 32(t1)
+    sw        s4, 64(t1)
+    sw        s6, 0(t1)
+    bgtz      t9, 1b
+     sw       s7, 96(t1)
+    move      t1, v1
+    li        s4, 15137
+    lw        s6, 8(t1)         // wsptr[2]
+    li        s5, 6270
+    lw        s7, 24(t1)        // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
+    lw        t2, 0(t1)         // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
+    lh        t5, 28(t1)        // wsptr[7]
+    lh        t6, 20(t1)        // wsptr[5]
+    lh        t7, 12(t1)        // wsptr[3]
+    lh        t8, 4(t1)         // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
+    sll       s4, t9, 2
+    lw        v0, 0(a2)         // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+    // 2
+    li        s4, 15137
+    lw        s6, 40(t1)        // wsptr[2]
+    li        s5, 6270
+    lw        s7, 56(t1)        // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
+    lw        t2, 32(t1)        // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
+    lh        t5, 60(t1)        // wsptr[7]
+    lh        t6, 52(t1)        // wsptr[5]
+    lh        t7, 44(t1)        // wsptr[3]
+    lh        t8, 36(t1)        // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
+    sll       s4, t9, 2
+    lw        v0, 4(a2)         // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+    // 3
+    li        s4, 15137
+    lw        s6, 72(t1)        // wsptr[2]
+    li        s5, 6270
+    lw        s7, 88(t1)        // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
+    lw        t2, 64(t1)        // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
+    lh        t5, 92(t1)        // wsptr[7]
+    lh        t6, 84(t1)        // wsptr[5]
+    lh        t7, 76(t1)        // wsptr[3]
+    lh        t8, 68(t1)        // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
+    sll       s4, t9, 2
+    lw        v0, 8(a2)         // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+    li        s4, 15137
+    lw        s6, 104(t1)       // wsptr[2]
+    li        s5, 6270
+    lw        s7, 120(t1)       // wsptr[6]
+    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
+    lw        t2, 96(t1)        // wsptr[0]
+    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
+    lh        t5, 124(t1)       // wsptr[7]
+    lh        t6, 116(t1)       // wsptr[5]
+    lh        t7, 108(t1)       // wsptr[3]
+    lh        t8, 100(t1)       // wsptr[1]
+    ins       t5, t6, 16, 16
+    ins       t7, t8, 16, 16
+    mult      $ac0, zero, zero
+    dpa.w.ph  $ac0, t5, s0
+    dpa.w.ph  $ac0, t7, s1
+    mult      $ac1, zero, zero
+    dpa.w.ph  $ac1, t5, s2
+    dpa.w.ph  $ac1, t7, s3
+    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
+    mflo      s6, $ac0
+    // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu      s4, s4, s5
+    addu      t3, t2, s4        // tmp10 = tmp0 + z2;
+    mflo      s7, $ac1
+    subu      t4, t2, s4        // tmp10 = tmp0 - z2;
+    addu      t7, t4, s6
+    subu      t8, t4, s6
+    addu      t5, t3, s7
+    subu      t6, t3, s7
+    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
+    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
+    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
+    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
+    sll       s4, t9, 2
+    lw        v0, 12(a2)        // output_buf[ctr]
+    shll_s.w  t5, t5, 24
+    shll_s.w  t6, t6, 24
+    shll_s.w  t7, t7, 24
+    shll_s.w  t8, t8, 24
+    sra       t5, t5, 24
+    sra       t6, t6, 24
+    sra       t7, t7, 24
+    sra       t8, t8, 24
+    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
+    addiu     t5, t5, 128
+    addiu     t6, t6, 128
+    addiu     t7, t7, 128
+    addiu     t8, t8, 128
+    sb        t5, 0(v0)
+    sb        t7, 1(v0)
+    sb        t8, 2(v0)
+    sb        t6, 3(v0)
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j         ra
+     nop
+END(jsimd_idct_4x4_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
+/* Inverse DCT with 6x6 output; dequantizes while loading.  Arguments:
+ * a0     - compptr->dct_table
+ * a1     - coef_block
+ * a2     - output_buf
+ * a3     - output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu     sp, sp, -144      // 144-byte work array on the stack (6 * 6 words)
+    move      v0, sp            // v0 = work array write pointer
+    addiu     v1, v0, 24        // v1 = pass 1 end marker: 6 columns * 4 bytes
+    addiu     t9, zero, 5793    // ~0.707106781 * 2^13
+    addiu     s0, zero, 10033   // ~1.224744871 * 2^13
+    addiu     s1, zero, 2998    // ~0.366025404 * 2^13
+
+1:  // Pass 1: process 6 columns from input, store into work array.
+    lh        s2, 0(a0)   // q0 = quantptr[ 0]
+    lh        s3, 32(a0)  // q1 = quantptr[16]
+    lh        s4, 64(a0)  // q2 = quantptr[32]
+    lh        t2, 64(a1)  // tmp2 = inptr[32]
+    lh        t1, 32(a1)  // tmp1 = inptr[16]
+    lh        t0, 0(a1)   // tmp0 = inptr[ 0]
+    mul       t2, t2, s4  // tmp2 = tmp2 * q2
+    mul       t1, t1, s3  // tmp1 = tmp1 * q1
+    mul       t0, t0, s2  // tmp0 = tmp0 * q0
+    lh        t6, 16(a1)  // z1 = inptr[ 8]
+    lh        t8, 80(a1)  // z3 = inptr[40]
+    lh        t7, 48(a1)  // z2 = inptr[24]
+    lh        s2, 16(a0)  // q0 = quantptr[ 8]
+    lh        s4, 80(a0)  // q2 = quantptr[40]
+    lh        s3, 48(a0)  // q1 = quantptr[24]
+    mul       t2, t2, t9  // tmp2 = tmp2 * 5793
+    mul       t1, t1, s0  // tmp1 = tmp1 * 10033
+    sll       t0, t0, 13  // tmp0 = tmp0 << 13
+    mul       t6, t6, s2  // z1 = z1 * q0
+    mul       t8, t8, s4  // z3 = z3 * q2
+    mul       t7, t7, s3  // z2 = z2 * q1
+    addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
+    sll       t2, t2, 1   // tmp2 = tmp2 << 1
+    subu      t4, t0, t2  // tmp11 = tmp0 - tmp2;
+    subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
+    addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
+    addu      t1, t6, t8  // tmp1 = z1 + z3
+    mul       t1, t1, s1  // tmp1 = tmp1 * 2998
+    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
+    subu      t2, t6, t8  // tmp2 = z1 - z3
+    subu      t2, t2, t7  // tmp2 = tmp2 - z2
+    sll       t2, t2, 2   // tmp2 = tmp2 << 2
+    addu      t0, t6, t7  // tmp0 = z1 + z2
+    sll       t0, t0, 13  // tmp0 = tmp0 << 13
+    subu      s2, t8, t7  // q0 = z3 - z2
+    sll       s2, s2, 13  // q0 = q0 << 13
+    addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
+    addu      t1, s2, t1  // tmp1 = q0 + tmp1
+    addu      s2, t4, t2  // q0 = tmp11 + tmp2
+    subu      s3, t4, t2  // q1 = tmp11 - tmp2
+    addu      t6, t3, t0  // z1 = tmp10 + tmp0
+    subu      t7, t3, t0  // z2 = tmp10 - tmp0
+    addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
+    subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
+    shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
+    shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
+    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
+    shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
+    sw        s2, 24(v0)  // work array row 1 (rows are 24 bytes apart)
+    sw        s3, 96(v0)  // work array row 4
+    sw        t6, 0(v0)   // work array row 0
+    sw        t7, 120(v0) // work array row 5
+    sw        t4, 48(v0)  // work array row 2
+    sw        t5, 72(v0)  // work array row 3
+    addiu     v0, v0, 4   // next work array column
+    addiu     a1, a1, 2   // next coefficient column
+    bne       v0, v1, 1b
+     addiu    a0, a0, 2   // (delay slot) next quantization table column
+
+    /* Pass 2: process 6 rows from work array, store into output array. */
+    move      v0, sp            // v0 = first work array row
+    addiu     v1, v0, 144       // v1 = end of work array
+
+2:
+    lw        t0, 0(v0)         // row[0]
+    lw        t2, 16(v0)        // row[4]
+    lw        s5, 0(a2)         // s5 = current output row pointer
+    addiu     t0, t0, 16        // rounding bias; equals 1 << 17 after the shift below
+    sll       t0, t0, 13        // tmp0 = (row[0] + 16) << 13
+    mul       t3, t2, t9        // t3 = row[4] * 5793
+    lw        t6, 4(v0)         // z1 = row[1]
+    lw        t8, 20(v0)        // z3 = row[5]
+    lw        t7, 12(v0)        // z2 = row[3]
+    addu      s5, s5, a3        // outptr = output row pointer + output_col
+    addu      s6, t6, t8        // z1 + z3
+    mul       s6, s6, s1        // s6 = (z1 + z3) * 2998
+    addu      t1, t0, t3        // t1 = tmp0 + t3
+    subu      t4, t0, t3
+    subu      t4, t4, t3        // t4 = tmp0 - 2 * t3
+    lw        t3, 8(v0)         // row[2]
+    mul       t0, t3, s0        // t0 = row[2] * 10033
+    addu      s7, t6, t7
+    sll       s7, s7, 13
+    addu      s7, s6, s7        // s7 = ((z1 + z2) << 13) + s6
+    subu      t2, t8, t7
+    sll       t2, t2, 13
+    addu      t2, s6, t2        // t2 = ((z3 - z2) << 13) + s6
+    subu      s6, t6, t7
+    subu      s6, s6, t8
+    sll       s6, s6, 13        // s6 = (z1 - z2 - z3) << 13
+    addu      t3, t1, t0        // even term for outputs 0 and 5
+    subu      t5, t1, t0        // even term for outputs 2 and 3
+    addu      t6, t3, s7        // output 0
+    subu      t3, t3, s7        // output 5
+    addu      t7, t4, s6        // output 1
+    subu      t4, t4, s6        // output 4
+    addu      t8, t5, t2        // output 2
+    subu      t5, t5, t2        // output 3
+    shll_s.w  t6, t6, 6         // saturating << 6; with the sra 24 below this is
+    shll_s.w  t3, t3, 6         // a net >> 18 clamped to [-128, 127]
+    shll_s.w  t7, t7, 6
+    shll_s.w  t4, t4, 6
+    shll_s.w  t8, t8, 6
+    shll_s.w  t5, t5, 6
+    sra       t6, t6, 24
+    addiu     t6, t6, 128       // re-center to [0, 255]
+    sra       t3, t3, 24
+    addiu     t3, t3, 128
+    sb        t6, 0(s5)
+    sra       t7, t7, 24
+    addiu     t7, t7, 128
+    sb        t3, 5(s5)
+    sra       t4, t4, 24
+    addiu     t4, t4, 128
+    sb        t7, 1(s5)
+    sra       t8, t8, 24
+    addiu     t8, t8, 128
+    sb        t4, 4(s5)
+    addiu     v0, v0, 24        // next work array row
+    sra       t5, t5, 24
+    addiu     t5, t5, 128
+    sb        t8, 2(s5)
+    addiu     a2, a2,  4        // next output row pointer
+    bne       v0, v1, 2b
+     sb       t5, 3(s5)         // (delay slot)
+
+    addiu     sp, sp, 144       // release the work array
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j         ra
+     nop
+
+END(jsimd_idct_6x6_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
+/* Pass 1 of the 12x12 inverse DCT: 8 input columns -> 12 workspace rows.
+ * a0     - compptr->dct_table
+ * a1     - coef_block
+ * a2     - workspace
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li         a3, 8         // column counter
+
+1:
+    // odd part
+    lh         t0, 48(a1)    // inptr[DCTSIZE*3]
+    lh         t1, 48(a0)    // quantptr[DCTSIZE*3]
+    lh         t2, 16(a1)    // inptr[DCTSIZE*1]
+    lh         t3, 16(a0)    // quantptr[DCTSIZE*1]
+    lh         t4, 80(a1)    // inptr[DCTSIZE*5]
+    lh         t5, 80(a0)    // quantptr[DCTSIZE*5]
+    lh         t6, 112(a1)   // inptr[DCTSIZE*7]
+    lh         t7, 112(a0)   // quantptr[DCTSIZE*7]
+    mul        t0, t0, t1    // z2
+    mul        t1, t2, t3    // z1
+    mul        t2, t4, t5    // z3
+    mul        t3, t6, t7    // z4
+    li         t4, 10703     // FIX(1.306562965)
+    li         t5, 4433      // FIX_0_541196100
+    li         t6, 7053      // FIX(0.860918669)
+    mul        t4, t0,t4     // tmp11
+    mul        t5, t0,t5     // -tmp14
+    addu       t7, t1,t2     // tmp10
+    addu       t8, t7,t3     // tmp10 + z4
+    mul        t6, t6, t8    // tmp15
+    li         t8, 2139      // FIX(0.261052384)
+    mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
+    li         t7, 2295      // FIX(0.280143716)
+    mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
+    addu       t9, t2, t3    // z3 + z4
+    li         s0, 8565      // FIX(1.045510580)
+    mul        t9, t9, s0    // -tmp13
+    li         s0, 12112     // FIX(1.478575242)
+    mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
+    li         s1, 12998     // FIX(1.586706681)
+    mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
+    li         s2, 5540      // FIX(0.676326758)
+    mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
+    li         s3, 16244     // FIX(1.982889723)
+    mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
+    subu       t1, t1, t3    // z1-=z4
+    subu       t0, t0, t2    // z2-=z3
+    addu       t2, t0, t1    // z1+z2
+    li         t3, 4433      // FIX_0_541196100
+    mul        t2, t2, t3    // z3
+    li         t3, 6270      // FIX_0_765366865
+    mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
+    li         t3, 15137     // FIX_1_847759065
+    mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
+    addu       t8, t6, t8    // tmp12
+    addu       t3, t8, t4    // tmp12 + tmp11
+    addu       t3, t3, t7    // tmp10
+    subu       t8, t8, t9    // tmp12 + tmp13
+    addu       s0, t5, s0    // -tmp14 + MULTIPLY(z3, FIX(1.478575242))
+    subu       t8, t8, s0    // tmp12
+    subu       t9, t6, t9    // tmp15 + tmp13
+    subu       s1, s1, t4    // MULTIPLY(z4, FIX(1.586706681)) - tmp11
+    addu       t9, t9, s1    // tmp13
+    subu       t6, t6, t5    // tmp15 + tmp14
+    subu       t6, t6, s2    // - MULTIPLY(z1, FIX(0.676326758))
+    subu       t6, t6, s3    // tmp15
+    // even part start
+    lh         t4, 64(a1)    // inptr[DCTSIZE*4]
+    lh         t5, 64(a0)    // quantptr[DCTSIZE*4]
+    lh         t7, 32(a1)    // inptr[DCTSIZE*2]
+    lh         s0, 32(a0)    // quantptr[DCTSIZE*2]
+    lh         s1, 0(a1)     // inptr[DCTSIZE*0]
+    lh         s2, 0(a0)     // quantptr[DCTSIZE*0]
+    lh         s3, 96(a1)    // inptr[DCTSIZE*6]
+    lh         v0, 96(a0)    // quantptr[DCTSIZE*6]
+    mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
+    mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
+    mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
+    mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
+    // odd part end
+    addu       t1, t2, t1    // tmp11
+    subu       t0, t2, t0    // tmp14
+    // update counter and pointers
+    addiu      a3, a3, -1
+    addiu      a0, a0, 2
+    addiu      a1, a1, 2
+    // even part rest
+    li         s1, 10033     // FIX(1.224744871)
+    li         s2, 11190     // FIX(1.366025404)
+    mul        t4, t4, s1    // z4
+    mul        s1, t5, s2    // z4
+    sll        t5, t5, 13    // z1
+    sll        t7, t7, 13
+    addiu      t7, t7, 1024  // z3
+    sll        s0, s0, 13    // z2
+    addu       s2, t7, t4    // tmp10
+    subu       t4, t7, t4    // tmp11
+    subu       s3, t5, s0    // tmp12
+    addu       t2, t7, s3    // tmp21
+    subu       s3, t7, s3    // tmp24
+    addu       t7, s1, s0    // tmp12
+    addu       v0, s2, t7    // tmp20
+    subu       s2, s2, t7    // tmp25
+    subu       s1, s1, t5    // z4 - z1
+    subu       s1, s1, s0    // tmp12
+    addu       s0, t4, s1    // tmp22
+    subu       t4, t4, s1    // tmp23
+    // final output stage
+    addu       t5, v0, t3    // tmp20 + tmp10
+    subu       v0, v0, t3    // tmp20 - tmp10
+    addu       t3, t2, t1    // tmp21 + tmp11
+    subu       t2, t2, t1    // tmp21 - tmp11
+    addu       t1, s0, t8    // tmp22 + tmp12
+    subu       s0, s0, t8    // tmp22 - tmp12
+    addu       t8, t4, t9    // tmp23 + tmp13
+    subu       t4, t4, t9    // tmp23 - tmp13
+    addu       t9, s3, t0    // tmp24 + tmp14
+    subu       s3, s3, t0    // tmp24 - tmp14
+    addu       t0, s2, t6    // tmp25 + tmp15
+    subu       s2, s2, t6    // tmp25 - tmp15
+    sra        t5, t5, 11    // descale by 11 bits; the 1024 rounding term was
+    sra        t3, t3, 11    // already added to z3 in the even part
+    sra        t1, t1, 11
+    sra        t8, t8, 11
+    sra        t9, t9, 11
+    sra        t0, t0, 11
+    sra        s2, s2, 11
+    sra        s3, s3, 11
+    sra        t4, t4, 11
+    sra        s0, s0, 11
+    sra        t2, t2, 11
+    sra        v0, v0, 11
+    sw         t5, 0(a2)     // workspace row 0 (rows are 32 bytes apart)
+    sw         t3, 32(a2)    // workspace row 1
+    sw         t1, 64(a2)    // workspace row 2
+    sw         t8, 96(a2)    // workspace row 3
+    sw         t9, 128(a2)   // workspace row 4
+    sw         t0, 160(a2)   // workspace row 5
+    sw         s2, 192(a2)   // workspace row 6
+    sw         s3, 224(a2)   // workspace row 7
+    sw         t4, 256(a2)   // workspace row 8
+    sw         s0, 288(a2)   // workspace row 9
+    sw         t2, 320(a2)   // workspace row 10
+    sw         v0, 352(a2)   // workspace row 11
+    bgtz       a3, 1b
+     addiu     a2, a2, 4     // (delay slot) next workspace column
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j          ra
+     nop
+
+END(jsimd_idct_12x12_pass1_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
+/* Pass 2 of the 12x12 inverse DCT: 12 workspace rows -> 12 output rows.
+ * a0     - workspace
+ * a1     - output
+ */
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li        a3, 12        // row counter
+
+1:
+    // Odd part
+    lw        t0, 12(a0)    // z2 = wsptr[3]
+    lw        t1, 4(a0)     // z1 = wsptr[1]
+    lw        t2, 20(a0)    // z3 = wsptr[5]
+    lw        t3, 28(a0)    // z4 = wsptr[7]
+    li        t4, 10703     // FIX(1.306562965)
+    li        t5, 4433      // FIX_0_541196100
+    mul       t4, t0, t4    // tmp11
+    mul       t5, t0, t5    // -tmp14
+    addu      t6, t1, t2    // tmp10
+    li        t7, 2139      // FIX(0.261052384)
+    mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
+    addu      t6, t6, t3    // tmp10 + z4
+    li        t8, 7053      // FIX(0.860918669)
+    mul       t6, t6, t8    // tmp15
+    li        t8, 2295      // FIX(0.280143716)
+    mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
+    addu      t9, t2, t3    // z3 + z4
+    li        s0, 8565      // FIX(1.045510580)
+    mul       t9, t9, s0    // -tmp13
+    li        s0, 12112     // FIX(1.478575242)
+    mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
+    li        s1, 12998     // FIX(1.586706681)
+    mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
+    li        s2, 5540      // FIX(0.676326758)
+    mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
+    li        s3, 16244     // FIX(1.982889723)
+    mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
+    subu      t1, t1, t3    // z1 -= z4
+    subu      t0, t0, t2    // z2 -= z3
+    addu      t2, t1, t0    // z1 + z2
+    li        t3, 4433      // FIX_0_541196100
+    mul       t2, t2, t3    // z3
+    li        t3, 6270      // FIX_0_765366865
+    mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
+    li        t3, 15137     // FIX_1_847759065
+    mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
+    addu      t3, t6, t7    // tmp12
+    addu      t7, t3, t4    // tmp12 + tmp11
+    addu      t7, t7, t8    // tmp10
+    subu      t3, t3, t9    // tmp12 + tmp13 (t9 holds -tmp13)
+    subu      t3, t3, t5    // + tmp14 (t5 holds -tmp14)
+    subu      t3, t3, s0    // tmp12
+    subu      t9, t6, t9    // tmp15 + tmp13
+    subu      t9, t9, t4    // - tmp11
+    addu      t9, t9, s1    // tmp13
+    subu      t6, t6, t5    // tmp15 + tmp14
+    subu      t6, t6, s2    // - MULTIPLY(z1, FIX(0.676326758))
+    subu      t6, t6, s3    // tmp15
+    addu      t1, t2, t1    // tmp11
+    subu      t0, t2, t0    // tmp14
+    // even part
+    lw        t2, 16(a0)    // z4
+    lw        t4, 8(a0)     // z1
+    lw        t5, 0(a0)     // z3
+    lw        t8, 24(a0)    // z2
+    li        s0, 10033     // FIX(1.224744871)
+    li        s1, 11190     // FIX(1.366025404)
+    mul       t2, t2, s0    // z4
+    mul       s0, t4, s1    // z4
+    addiu     t5, t5, 0x10  // rounding bias; equals 1 << 17 after the shift below
+    sll       t5, t5, 13    // z3
+    sll       t4, t4, 13    // z1
+    sll       t8, t8, 13    // z2
+    subu      s1, t4, t8    // tmp12
+    addu      s2, t5, t2    // tmp10
+    subu      t2, t5, t2    // tmp11
+    addu      s3, t5, s1    // tmp21
+    subu      s1, t5, s1    // tmp24
+    addu      t5, s0, t8    // tmp12
+    addu      v0, s2, t5    // tmp20
+    subu      t5, s2, t5    // tmp25
+    subu      t4, s0, t4    // z4 - z1
+    subu      t4, t4, t8    // tmp12
+    addu      t8, t2, t4    // tmp22
+    subu      t2, t2, t4    // tmp23
+    // increment counter and pointers
+    addiu     a3, a3, -1
+    addiu     a0, a0, 32
+    // Final stage
+    addu      t4, v0, t7    // tmp20 + tmp10
+    subu      v0, v0, t7    // tmp20 - tmp10
+    addu      t7, s3, t1    // tmp21 + tmp11
+    subu      s3, s3, t1    // tmp21 - tmp11
+    addu      t1, t8, t3    // tmp22 + tmp12
+    subu      t8, t8, t3    // tmp22 - tmp12
+    addu      t3, t2, t9    // tmp23 + tmp13
+    subu      t2, t2, t9    // tmp23 - tmp13
+    addu      t9, s1, t0    // tmp24 + tmp14
+    subu      s1, s1, t0    // tmp24 - tmp14
+    addu      t0, t5, t6    // tmp25 + tmp15
+    subu      t5, t5, t6    // tmp25 - tmp15
+    sll       t4, t4, 4     // plain (non-saturating) << 4 ...
+    sll       t7, t7, 4
+    sll       t1, t1, 4
+    sll       t3, t3, 4
+    sll       t9, t9, 4
+    sll       t0, t0, 4
+    sll       t5, t5, 4
+    sll       s1, s1, 4
+    sll       t2, t2, 4
+    sll       t8, t8, 4
+    sll       s3, s3, 4
+    sll       v0, v0, 4
+    shll_s.w  t4, t4, 2     // ... then saturating << 2 ...
+    shll_s.w  t7, t7, 2
+    shll_s.w  t1, t1, 2
+    shll_s.w  t3, t3, 2
+    shll_s.w  t9, t9, 2
+    shll_s.w  t0, t0, 2
+    shll_s.w  t5, t5, 2
+    shll_s.w  s1, s1, 2
+    shll_s.w  t2, t2, 2
+    shll_s.w  t8, t8, 2
+    shll_s.w  s3, s3, 2
+    shll_s.w  v0, v0, 2
+    srl       t4, t4, 24    // ... then keep the top byte (net shift right by 18)
+    srl       t7, t7, 24
+    srl       t1, t1, 24
+    srl       t3, t3, 24
+    srl       t9, t9, 24
+    srl       t0, t0, 24
+    srl       t5, t5, 24
+    srl       s1, s1, 24
+    srl       t2, t2, 24
+    srl       t8, t8, 24
+    srl       s3, s3, 24
+    srl       v0, v0, 24
+    lw        t6, 0(a1)     // t6 = current output row pointer
+    addiu     t4, t4, 0x80  // +128; only the low byte is stored by sb below
+    addiu     t7, t7, 0x80
+    addiu     t1, t1, 0x80
+    addiu     t3, t3, 0x80
+    addiu     t9, t9, 0x80
+    addiu     t0, t0, 0x80
+    addiu     t5, t5, 0x80
+    addiu     s1, s1, 0x80
+    addiu     t2, t2, 0x80
+    addiu     t8, t8, 0x80
+    addiu     s3, s3, 0x80
+    addiu     v0, v0, 0x80
+    sb        t4, 0(t6)     // tmp20 + tmp10
+    sb        t7, 1(t6)     // tmp21 + tmp11
+    sb        t1, 2(t6)     // tmp22 + tmp12
+    sb        t3, 3(t6)     // tmp23 + tmp13
+    sb        t9, 4(t6)     // tmp24 + tmp14
+    sb        t0, 5(t6)     // tmp25 + tmp15
+    sb        t5, 6(t6)     // tmp25 - tmp15
+    sb        s1, 7(t6)     // tmp24 - tmp14
+    sb        t2, 8(t6)     // tmp23 - tmp13
+    sb        t8, 9(t6)     // tmp22 - tmp12
+    sb        s3, 10(t6)    // tmp21 - tmp11
+    sb        v0, 11(t6)    // tmp20 - tmp10
+    bgtz      a3, 1b
+     addiu    a1, a1, 4     // (delay slot) next output row pointer
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    jr        ra
+     nop
+
+END(jsimd_idct_12x12_pass2_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
+/* Load an 8x8 block of samples, subtract 128, store as 16-bit values.
+ * a0     - sample_data
+ * a1     - start_col
+ * a2     - workspace
+ */
+
+    lw             t0, 0(a0)        // t0 = sample_data[0]
+    li             t7, 0xff80ff80   // -128 in each 16-bit half
+    addu           t0, t0, a1       // t0 = sample_data[0] + start_col
+    ulw            t1, 0(t0)        // row bytes 0..3 (unaligned load)
+    ulw            t2, 4(t0)        // row bytes 4..7
+    preceu.ph.qbr  t3, t1           // zero-extend the two low-order bytes to halfwords
+    preceu.ph.qbl  t4, t1           // zero-extend the two high-order bytes to halfwords
+    lw             t0, 4(a0)        // t0 = sample_data[1] (next row, fetched early)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7       // subtract 128 from both halfwords
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 0(a2)        // workspace row 0 (16 bytes per row)
+    usw            t4, 4(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 8(a2)
+    usw            t6, 12(a2)
+
+    lw             t0, 8(a0)        // t0 = sample_data[2]
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 16(a2)       // workspace row 1
+    usw            t4, 20(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 24(a2)
+    usw            t6, 28(a2)
+
+    lw             t0, 12(a0)       // t0 = sample_data[3]
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 32(a2)       // workspace row 2
+    usw            t4, 36(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 40(a2)
+    usw            t6, 44(a2)
+
+    lw             t0, 16(a0)       // t0 = sample_data[4]
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 48(a2)       // workspace row 3
+    usw            t4, 52(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 56(a2)
+    usw            t6, 60(a2)
+
+    lw             t0, 20(a0)       // t0 = sample_data[5]
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 64(a2)       // workspace row 4
+    usw            t4, 68(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 72(a2)
+    usw            t6, 76(a2)
+
+    lw             t0, 24(a0)       // t0 = sample_data[6]
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 80(a2)       // workspace row 5
+    usw            t4, 84(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 88(a2)
+    usw            t6, 92(a2)
+
+    lw             t0, 28(a0)       // t0 = sample_data[7]
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu           t0, t0, a1
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    ulw            t1, 0(t0)
+    ulw            t2, 4(t0)
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 96(a2)       // workspace row 6
+    usw            t4, 100(a2)
+    preceu.ph.qbr  t3, t1
+    preceu.ph.qbl  t4, t1
+    usw            t5, 104(a2)
+    usw            t6, 108(a2)
+    preceu.ph.qbr  t5, t2
+    preceu.ph.qbl  t6, t2
+    addu.ph        t3, t3, t7
+    addu.ph        t4, t4, t7
+    addu.ph        t5, t5, t7
+    addu.ph        t6, t6, t7
+    usw            t3, 112(a2)      // workspace row 7
+    usw            t4, 116(a2)
+    usw            t5, 120(a2)
+    usw            t6, 124(a2)
+
+    j              ra
+     nop
+
+END(jsimd_convsamp_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
+/* Load an 8x8 block of samples, subtract 128, store as single floats.
+ * a0     - sample_data
+ * a1     - start_col
+ * a2     - workspace
+ */
+
+    .set at
+
+    lw       t0, 0(a0)          // t0 = sample_data[0]
+    addu     t0, t0, a1         // t0 = sample_data[0] + start_col
+    lbu      t1, 0(t0)          // load the 8 unsigned samples of row 0
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128       // sample - 128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2             // move the integers into FPU registers
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2             // integer word -> single-precision float
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    lw       t0, 4(a0)          // t0 = sample_data[1] (next row, fetched early)
+    swc1     f2, 0(a2)          // workspace row 0 (32 bytes per row)
+    swc1     f4, 4(a2)
+    swc1     f6, 8(a2)
+    addu     t0, t0, a1         // + start_col
+    swc1     f8, 12(a2)
+    swc1     f10, 16(a2)
+    swc1     f12, 20(a2)
+    swc1     f14, 24(a2)
+    swc1     f16, 28(a2)
+    // row 1: same sequence, samples from sample_data[1] + start_col
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    lw       t0, 8(a0)          // t0 = sample_data[2]
+    swc1     f2, 32(a2)         // workspace row 1
+    swc1     f4, 36(a2)
+    swc1     f6, 40(a2)
+    addu     t0, t0, a1
+    swc1     f8, 44(a2)
+    swc1     f10, 48(a2)
+    swc1     f12, 52(a2)
+    swc1     f14, 56(a2)
+    swc1     f16, 60(a2)
+    // row 2: samples from sample_data[2] + start_col
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    lw       t0, 12(a0)         // t0 = sample_data[3]
+    swc1     f2, 64(a2)         // workspace row 2
+    swc1     f4, 68(a2)
+    swc1     f6, 72(a2)
+    addu     t0, t0, a1
+    swc1     f8, 76(a2)
+    swc1     f10, 80(a2)
+    swc1     f12, 84(a2)
+    swc1     f14, 88(a2)
+    swc1     f16, 92(a2)
+    // row 3: samples from sample_data[3] + start_col
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    lw       t0, 16(a0)         // t0 = sample_data[4]
+    swc1     f2, 96(a2)         // workspace row 3
+    swc1     f4, 100(a2)
+    swc1     f6, 104(a2)
+    addu     t0, t0, a1
+    swc1     f8, 108(a2)
+    swc1     f10, 112(a2)
+    swc1     f12, 116(a2)
+    swc1     f14, 120(a2)
+    swc1     f16, 124(a2)
+    // row 4: samples from sample_data[4] + start_col
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    lw       t0, 20(a0)         // t0 = sample_data[5]
+    swc1     f2, 128(a2)        // workspace row 4
+    swc1     f4, 132(a2)
+    swc1     f6, 136(a2)
+    addu     t0, t0, a1
+    swc1     f8, 140(a2)
+    swc1     f10, 144(a2)
+    swc1     f12, 148(a2)
+    swc1     f14, 152(a2)
+    swc1     f16, 156(a2)
+    // row 5: samples from sample_data[5] + start_col
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    lw       t0, 24(a0)         // t0 = sample_data[6]
+    swc1     f2, 160(a2)        // workspace row 5
+    swc1     f4, 164(a2)
+    swc1     f6, 168(a2)
+    addu     t0, t0, a1
+    swc1     f8, 172(a2)
+    swc1     f10, 176(a2)
+    swc1     f12, 180(a2)
+    swc1     f14, 184(a2)
+    swc1     f16, 188(a2)
+    // row 6: samples from sample_data[6] + start_col
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    lw       t0, 28(a0)         // t0 = sample_data[7]
+    swc1     f2, 192(a2)        // workspace row 6
+    swc1     f4, 196(a2)
+    swc1     f6, 200(a2)
+    addu     t0, t0, a1
+    swc1     f8, 204(a2)
+    swc1     f10, 208(a2)
+    swc1     f12, 212(a2)
+    swc1     f14, 216(a2)
+    swc1     f16, 220(a2)
+    // row 7: samples from sample_data[7] + start_col
+    lbu      t1, 0(t0)
+    lbu      t2, 1(t0)
+    lbu      t3, 2(t0)
+    lbu      t4, 3(t0)
+    lbu      t5, 4(t0)
+    lbu      t6, 5(t0)
+    lbu      t7, 6(t0)
+    lbu      t8, 7(t0)
+    addiu    t1, t1, -128
+    addiu    t2, t2, -128
+    addiu    t3, t3, -128
+    addiu    t4, t4, -128
+    addiu    t5, t5, -128
+    addiu    t6, t6, -128
+    addiu    t7, t7, -128
+    addiu    t8, t8, -128
+    mtc1     t1, f2
+    mtc1     t2, f4
+    mtc1     t3, f6
+    mtc1     t4, f8
+    mtc1     t5, f10
+    mtc1     t6, f12
+    mtc1     t7, f14
+    mtc1     t8, f16
+    cvt.s.w  f2, f2
+    cvt.s.w  f4, f4
+    cvt.s.w  f6, f6
+    cvt.s.w  f8, f8
+    cvt.s.w  f10, f10
+    cvt.s.w  f12, f12
+    cvt.s.w  f14, f14
+    cvt.s.w  f16, f16
+    swc1     f2, 224(a2)        // workspace row 7
+    swc1     f4, 228(a2)
+    swc1     f6, 232(a2)
+    swc1     f8, 236(a2)
+    swc1     f10, 240(a2)
+    swc1     f12, 244(a2)
+    swc1     f14, 248(a2)
+    swc1     f16, 252(a2)
+
+    j        ra
+     nop
+
+END(jsimd_convsamp_float_mips_dspr2)
+
+/*****************************************************************************/
+
diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h
new file mode 100644
index 0000000..50ec31b
--- /dev/null
+++ b/simd/jsimd_mips_dspr2_asm.h
@@ -0,0 +1,285 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * All rights reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0
+#define AT   $1
+#define v0   $2
+#define v1   $3
+#define a0   $4
+#define a1   $5
+#define a2   $6
+#define a3   $7
+#define t0   $8
+#define t1   $9
+#define t2   $10
+#define t3   $11
+#define t4   $12
+#define t5   $13
+#define t6   $14
+#define t7   $15
+#define s0   $16
+#define s1   $17
+#define s2   $18
+#define s3   $19
+#define s4   $20
+#define s5   $21
+#define s6   $22
+#define s7   $23
+#define t8   $24
+#define t9   $25
+#define k0   $26
+#define k1   $27
+#define gp   $28
+#define sp   $29
+#define fp   $30
+#define s8   $30
+#define ra   $31
+
+#define f0   $f0
+#define f1   $f1
+#define f2   $f2
+#define f3   $f3
+#define f4   $f4
+#define f5   $f5
+#define f6   $f6
+#define f7   $f7
+#define f8   $f8
+#define f9   $f9
+#define f10  $f10
+#define f11  $f11
+#define f12  $f12
+#define f13  $f13
+#define f14  $f14
+#define f15  $f15
+#define f16  $f16
+#define f17  $f17
+#define f18  $f18
+#define f19  $f19
+#define f20  $f20
+#define f21  $f21
+#define f22  $f22
+#define f23  $f23
+#define f24  $f24
+#define f25  $f25
+#define f26  $f26
+#define f27  $f27
+#define f28  $f28
+#define f29  $f29
+#define f30  $f30
+#define f31  $f31
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol)                           \
+                .globl  symbol;                         \
+                .align  2;                              \
+                .type   symbol, @function;              \
+                .ent    symbol, 0;                      \
+symbol:         .frame  sp, 0, ra;                      \
+                .set    push;                           \
+                .set    arch=mips32r2;                  \
+                .set    noreorder;                      \
+                .set    noat;
+
+/*
+ * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_MIPS_DSPR2(symbol)                         \
+LEAF_MIPS32R2(symbol)                                   \
+                .set    dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function)                                   \
+                .set    pop;                            \
+                .end    function;                       \
+                .size   function,.-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of registers to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * reserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+                          r2  = 0, r3  = 0, r4  = 0, \
+                          r5  = 0, r6  = 0, r7  = 0, \
+                          r8  = 0, r9  = 0, r10 = 0, \
+                          r11 = 0, r12 = 0, r13 = 0, \
+                          r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, -\stack_offset
+    .endif
+    sw              \r1, 0(sp)
+    .if \r2 != 0
+    sw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    sw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    sw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw              \r14, 52(sp)
+    .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with SAVE_REGS_ON_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+                               r2  = 0, r3  = 0, r4  = 0, \
+                               r5  = 0, r6  = 0, r7  = 0, \
+                               r8  = 0, r9  = 0, r10 = 0, \
+                               r11 = 0, r12 = 0, r13 = 0, \
+                               r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    lw              \r1, 0(sp)
+    .if \r2 != 0
+    lw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    lw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    lw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw              \r14, 52(sp)
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, \stack_offset
+    .endif
+.endm
+
+
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
new file mode 100644
index 0000000..afbaa82
--- /dev/null
+++ b/simd/jsimd_powerpc.c
@@ -0,0 +1,741 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009-2011, 2014-2015 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+static unsigned int simd_support = ~0;
+
+LOCAL(void)
+init_simd (void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = JSIMD_ALTIVEC;
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_extrgb_ycc_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      altivecfct=jsimd_extrgbx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_extbgr_ycc_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      altivecfct=jsimd_extbgrx_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      altivecfct=jsimd_extxbgr_ycc_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      altivecfct=jsimd_extxrgb_ycc_convert_altivec;
+      break;
+    default:
+      altivecfct=jsimd_rgb_ycc_convert_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                        JDIMENSION output_row, int num_rows)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_extrgb_gray_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      altivecfct=jsimd_extrgbx_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_extbgr_gray_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      altivecfct=jsimd_extbgrx_gray_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      altivecfct=jsimd_extxbgr_gray_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      altivecfct=jsimd_extxrgb_gray_convert_altivec;
+      break;
+    default:
+      altivecfct=jsimd_rgb_gray_convert_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_ycc_extrgb_convert_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      altivecfct=jsimd_ycc_extrgbx_convert_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_ycc_extbgr_convert_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      altivecfct=jsimd_ycc_extbgrx_convert_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      altivecfct=jsimd_ycc_extxbgr_convert_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      altivecfct=jsimd_ycc_extxrgb_convert_altivec;
+      break;
+    default:
+      altivecfct=jsimd_ycc_rgb_convert_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
+                          JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks,
+                                input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks,
+                                input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info *compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info *compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch(cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec;
+      break;
+    default:
+      altivecfct=jsimd_h2v2_merged_upsample_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch(cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGR:
+      altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec;
+      break;
+    default:
+      altivecfct=jsimd_h2v1_merged_upsample_altivec;
+      break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM *workspace)
+{
+  jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM *data)
+{
+  jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM *data)
+{
+  jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
+{
+  jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
+{
+  return NULL;
+}
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index d600861..fa33bea 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -2,15 +2,16 @@
  * jsimd_x86_64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2014 D. R. Commander
- * 
+ * Copyright 2009-2011, 2014, 2016 D. R. Commander
+ * Copyright 2015 Matthieu Darbois
+ *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * For conditions of distribution and use, see copyright notice in jsimdext.inc
  *
  * This file contains the interface between the "normal" portions
  * of the library and the SIMD implementations when running on a
- * x86_64 architecture.
+ * 64-bit x86 architecture.
  */
 
 #define JPEG_INTERNALS
@@ -29,10 +30,38 @@
 
 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 
-#ifndef JPEG_DECODE_ONLY
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = JSIMD_SSE2 | JSIMD_SSE;
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
+}
+
 GLOBAL(int)
 jsimd_can_rgb_ycc (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
@@ -41,16 +70,18 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
-#endif
 
 GLOBAL(int)
 jsimd_can_rgb_gray (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
@@ -59,15 +90,18 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_ycc_rgb (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
@@ -76,10 +110,11 @@
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
@@ -88,7 +123,6 @@
   return 0;
 }
 
-#ifndef JPEG_DECODE_ONLY
 GLOBAL(void)
 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -96,8 +130,7 @@
 {
   void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
-  switch(cinfo->in_color_space)
-  {
+  switch(cinfo->in_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_extrgb_ycc_convert_sse2;
       break;
@@ -127,7 +160,6 @@
 
   sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
-#endif
 
 GLOBAL(void)
 jsimd_rgb_gray_convert (j_compress_ptr cinfo,
@@ -136,8 +168,7 @@
 {
   void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
-  switch(cinfo->in_color_space)
-  {
+  switch(cinfo->in_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_extrgb_gray_convert_sse2;
       break;
@@ -175,8 +206,7 @@
 {
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
-  switch(cinfo->out_color_space)
-  {
+  switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_ycc_extrgb_convert_sse2;
       break;
@@ -214,180 +244,204 @@
 {
 }
 
-#ifndef JPEG_DECODE_ONLY
 GLOBAL(int)
 jsimd_can_h2v2_downsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_downsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  jsimd_h2v2_downsample_sse2(cinfo->image_width,
-                             cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor,
-                             compptr->width_in_blocks,
+  jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
                              input_data, output_data);
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  jsimd_h2v1_downsample_sse2(cinfo->image_width,
-                             cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor,
-                             compptr->width_in_blocks,
+  jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
                              input_data, output_data);
 }
-#endif
 
 GLOBAL(int)
 jsimd_can_h2v2_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
 jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
-  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
-                           cinfo->output_width,
+  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
                            input_data, output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info * compptr, 
+                     jpeg_component_info *compptr,
                      JSAMPARRAY input_data,
-                     JSAMPARRAY * output_data_ptr)
+                     JSAMPARRAY *output_data_ptr)
 {
-  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
-                           cinfo->output_width,
+  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
                            input_data, output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_fancy_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width,
-                                 input_data, output_data_ptr);
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info * compptr, 
+                           jpeg_component_info *compptr,
                            JSAMPARRAY input_data,
-                           JSAMPARRAY * output_data_ptr)
+                           JSAMPARRAY *output_data_ptr)
 {
   jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width,
-                                 input_data, output_data_ptr);
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (BITS_IN_JSAMPLE != 8)
     return 0;
   if (sizeof(JDIMENSION) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
@@ -398,8 +452,7 @@
 {
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
-  switch(cinfo->out_color_space)
-  {
+  switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
       break;
@@ -438,8 +491,7 @@
 {
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
-  switch(cinfo->out_color_space)
-  {
+  switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
       sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
       break;
@@ -470,10 +522,11 @@
   sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
-#ifndef JPEG_DECODE_ONLY
 GLOBAL(int)
 jsimd_can_convsamp (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -484,12 +537,17 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_convsamp_float (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -500,19 +558,22 @@
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM * workspace)
+                DCTELEM *workspace)
 {
   jsimd_convsamp_sse2(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
 jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT * workspace)
+                      FAST_FLOAT *workspace)
 {
   jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
 }
@@ -520,62 +581,68 @@
 GLOBAL(int)
 jsimd_can_fdct_islow (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_fdct_ifast (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_fdct_float (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    return 0;
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM * data)
+jsimd_fdct_islow (DCTELEM *data)
 {
   jsimd_fdct_islow_sse2(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM * data)
+jsimd_fdct_ifast (DCTELEM *data)
 {
   jsimd_fdct_ifast_sse2(data);
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT * data)
+jsimd_fdct_float (FAST_FLOAT *data)
 {
   jsimd_fdct_float_sse(data);
 }
@@ -583,6 +650,8 @@
 GLOBAL(int)
 jsimd_can_quantize (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -591,12 +660,17 @@
   if (sizeof(DCTELEM) != 2)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_quantize_float (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -605,27 +679,31 @@
   if (sizeof(FAST_FLOAT) != 4)
     return 0;
 
-  return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
-                DCTELEM * workspace)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
+                DCTELEM *workspace)
 {
   jsimd_quantize_sse2(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
-                      FAST_FLOAT * workspace)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                      FAST_FLOAT *workspace)
 {
   jsimd_quantize_float_sse2(coef_block, divisors, workspace);
 }
-#endif
 
 GLOBAL(int)
 jsimd_can_idct_2x2 (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -638,15 +716,17 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_idct_4x4 (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -659,14 +739,14 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -674,7 +754,7 @@
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
@@ -684,6 +764,8 @@
 GLOBAL(int)
 jsimd_can_idct_islow (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -696,15 +778,17 @@
   if (sizeof(ISLOW_MULT_TYPE) != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_idct_ifast (void)
 {
+  init_simd();
+
   /* The code is optimised for these values only */
   if (DCTSIZE != 8)
     return 0;
@@ -719,15 +803,17 @@
   if (IFAST_SCALE_BITS != 2)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    return 0;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
 
-  return 1;
+  return 0;
 }
 
 GLOBAL(int)
 jsimd_can_idct_float (void)
 {
+  init_simd();
+
   if (DCTSIZE != 8)
     return 0;
   if (sizeof(JCOEF) != 2)
@@ -741,33 +827,61 @@
   if (sizeof(FLOAT_MULT_TYPE) != 4)
     return 0;
 
-  if (!IS_ALIGNED_SSE(jconst_idct_float_sse2))
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block (void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
     return 0;
 
-  return 1;
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
 }
 
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+GLOBAL(JOCTET*)
+jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
+                             int last_dc_val, c_derived_tbl *dctbl,
+                             c_derived_tbl *actbl)
 {
-  jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_float_sse2(compptr->dct_table, coef_block,
-                        output_buf, output_col);
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
 }
diff --git a/linux/jsimdcfg.inc b/simd/jsimdcfg.inc
similarity index 97%
rename from linux/jsimdcfg.inc
rename to simd/jsimdcfg.inc
index 9d4aede..8e71b5d 100644
--- a/linux/jsimdcfg.inc
+++ b/simd/jsimdcfg.inc
@@ -90,5 +90,3 @@
 %define JSIMD_3DNOW 0x02
 %define JSIMD_SSE 0x04
 %define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
index 583b7e3..d2b499f 100644
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h
@@ -84,10 +84,10 @@
 %define JDIMENSION              dword         ; unsigned int
 %define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
 
-%define JSAMPROW                POINTER       ; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPROW                POINTER       ; JSAMPLE *     (jpeglib.h)
 %define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
 %define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
-%define JCOEFPTR                POINTER       ; JCOEF FAR *   (jpeglib.h)
+%define JCOEFPTR                POINTER       ; JCOEF *       (jpeglib.h)
 %define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
 %define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
 %define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
@@ -128,69 +128,3 @@
 %define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
 %define _cpp_protection_JSIMD_SSE JSIMD_SSE
 %define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
-
-; Short forms of external names for systems with brain-damaged linkers.
-;
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support
-%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx
-%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx
-%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2
-%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2
-%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2
-%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2
-%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx
-%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx
-%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2
-%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2
-%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx
-%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx
-%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx
-%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx
-%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx
-%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx
-%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2
-%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2
-%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2
-%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2
-%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2
-%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2
-%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2
-%define _cpp_protection_jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_merged_upsample_sse2
-%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx
-%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2
-%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow
-%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse
-%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2
-%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx
-%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx
-%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2
-%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2
-%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2
-%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2
-%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow
-%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse
-%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse
-%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx
-%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2
-%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow
-%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse
-%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2
-%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx
-%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx
-%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2
-%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2
-%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2
-%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx
-%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx
-%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2
-%define _cpp_protection_jsimd_idct_islow_sse2 jsimd_idct_islow_sse2
-%define _cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2
-%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2
-%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow
-%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse
-%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse
-%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2
-%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm
index a886904..c42c4ad 100644
--- a/simd/jsimdcpu.asm
+++ b/simd/jsimdcpu.asm
@@ -19,8 +19,8 @@
 %include "jsimdext.inc"
 
 ; --------------------------------------------------------------------------
-	SECTION	SEG_TEXT
-	BITS	32
+        SECTION SEG_TEXT
+        BITS    32
 ;
 ; Check if the CPU supports SIMD instructions
 ;
@@ -28,78 +28,78 @@
 ; jpeg_simd_cpu_support (void)
 ;
 
-	align	16
-	global	EXTN(jpeg_simd_cpu_support) PRIVATE
+        align   16
+        global  EXTN(jpeg_simd_cpu_support)
 
 EXTN(jpeg_simd_cpu_support):
-	push	ebx
-;	push	ecx		; need not be preserved
-;	push	edx		; need not be preserved
-;	push	esi		; unused
-	push	edi
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+;       push    esi             ; unused
+        push    edi
 
-	xor	edi,edi			; simd support flag
+        xor     edi,edi                 ; simd support flag
 
-	pushfd
-	pop	eax
-	mov	edx,eax
-	xor	eax, 1<<21		; flip ID bit in EFLAGS
-	push	eax
-	popfd
-	pushfd
-	pop	eax
-	xor	eax,edx
-	jz	short .return		; CPUID is not supported
+        pushfd
+        pop     eax
+        mov     edx,eax
+        xor     eax, 1<<21              ; flip ID bit in EFLAGS
+        push    eax
+        popfd
+        pushfd
+        pop     eax
+        xor     eax,edx
+        jz      short .return           ; CPUID is not supported
 
-	; Check for MMX instruction support
-	xor	eax,eax
-	cpuid
-	test	eax,eax
-	jz	short .return
+        ; Check for MMX instruction support
+        xor     eax,eax
+        cpuid
+        test    eax,eax
+        jz      short .return
 
-	xor	eax,eax
-	inc	eax
-	cpuid
-	mov	eax,edx			; eax = Standard feature flags
+        xor     eax,eax
+        inc     eax
+        cpuid
+        mov     eax,edx                 ; eax = Standard feature flags
 
-	test	eax, 1<<23		; bit23:MMX
-	jz	short .no_mmx
-	or	edi, byte JSIMD_MMX
+        test    eax, 1<<23              ; bit23:MMX
+        jz      short .no_mmx
+        or      edi, byte JSIMD_MMX
 .no_mmx:
-	test	eax, 1<<25		; bit25:SSE
-	jz	short .no_sse
-	or	edi, byte JSIMD_SSE
+        test    eax, 1<<25              ; bit25:SSE
+        jz      short .no_sse
+        or      edi, byte JSIMD_SSE
 .no_sse:
-	test	eax, 1<<26		; bit26:SSE2
-	jz	short .no_sse2
-	or	edi, byte JSIMD_SSE2
+        test    eax, 1<<26              ; bit26:SSE2
+        jz      short .no_sse2
+        or      edi, byte JSIMD_SSE2
 .no_sse2:
 
-	; Check for 3DNow! instruction support
-	mov	eax, 0x80000000
-	cpuid
-	cmp	eax, 0x80000000
-	jbe	short .return
+        ; Check for 3DNow! instruction support
+        mov     eax, 0x80000000
+        cpuid
+        cmp     eax, 0x80000000
+        jbe     short .return
 
-	mov	eax, 0x80000001
-	cpuid
-	mov	eax,edx			; eax = Extended feature flags
+        mov     eax, 0x80000001
+        cpuid
+        mov     eax,edx                 ; eax = Extended feature flags
 
-	test	eax, 1<<31		; bit31:3DNow!(vendor independent)
-	jz	short .no_3dnow
-	or	edi, byte JSIMD_3DNOW
+        test    eax, 1<<31              ; bit31:3DNow!(vendor independent)
+        jz      short .no_3dnow
+        or      edi, byte JSIMD_3DNOW
 .no_3dnow:
 
 .return:
-	mov	eax,edi
+        mov     eax,edi
 
-	pop	edi
-;	pop	esi		; unused
-;	pop	edx		; need not be preserved
-;	pop	ecx		; need not be preserved
-	pop	ebx
-	ret
+        pop     edi
+;       pop     esi             ; unused
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        ret
 
 ; For some reason, the OS X linker does not honor the request to align the
 ; segment unless we do this.
-	align	16
+        align   16
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
index abb6863..e1442de 100644
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc
@@ -30,7 +30,7 @@
 ; ==========================================================================
 ;  System-dependent configurations
 
-%ifdef WIN32	; ----(nasm -fwin32 -DWIN32 ...)--------
+%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
 ; * Microsoft Visual C++
 ; * MinGW (Minimalist GNU for Windows)
 ; * CygWin
@@ -46,7 +46,7 @@
 %define SEG_CONST   .rdata align=16 public use32 class=CONST
 %endif
 
-%elifdef WIN64	; ----(nasm -fwin64 -DWIN64 ...)--------
+%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
 ; * Microsoft Visual C++
 
 ; -- segment definition --
@@ -58,24 +58,21 @@
 %define SEG_TEXT    .text  align=16 public use64 class=CODE
 %define SEG_CONST   .rdata align=16 public use64 class=CONST
 %endif
-%define EXTN(name)  name			; foo() -> foo
+%define EXTN(name)  name                        ; foo() -> foo
 
-%elifdef OBJ32	; ----(nasm -fobj -DOBJ32 ...)----------
+%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
 ; * Borland C++ (Win32)
 
 ; -- segment definition --
 ;
-%define SEG_TEXT    .text  align=16 public use32 class=CODE
-%define SEG_CONST   .data  align=16 public use32 class=DATA
+%define SEG_TEXT    _text  align=16 public use32 class=CODE
+%define SEG_CONST   _data  align=16 public use32 class=DATA
 
-%elifdef ELF	; ----(nasm -felf[64] -DELF ...)------------
+%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
 ; * Linux
 ; * *BSD family Unix using elf format
 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
 
-; PIC is the default on Linux
-%define PIC
-
 ; mark stack as non-executable
 section .note.GNU-stack noalloc noexec nowrite progbits
 
@@ -91,10 +88,10 @@
 
 ; To make the code position-independent, append -DPIC to the commandline
 ;
-%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC
-%define EXTN(name)  name			; foo() -> foo
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
+%define EXTN(name)  name                        ; foo() -> foo
 
-%elifdef AOUT	; ----(nasm -faoutb/aout -DAOUT ...)----
+%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
 ; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
 ; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
 
@@ -105,29 +102,29 @@
 
 ; To make the code position-independent, append -DPIC to the commandline
 ;
-%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_	; BSD-style a.out supports PIC
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC
 
-%elifdef MACHO	; ----(nasm -fmacho -DMACHO ...)--------
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
 ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
 
 ; -- segment definition --
 ;
-%define SEG_TEXT    .text  ;align=16	; nasm doesn't accept align=16. why?
+%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
 %define SEG_CONST   .rodata align=16
 
 ; The generation of position-independent code (PIC) is the default on Darwin.
 ;
 %define PIC
-%define GOT_SYMBOL  _MACHO_PIC_		; Mach-O style code-relative addressing
+%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
 
-%else		; ----(Other case)----------------------
+%else           ; ----(Other case)----------------------
 
 ; -- segment definition --
 ;
 %define SEG_TEXT    .text
 %define SEG_CONST   .data
 
-%endif	; ----------------------------------------------
+%endif  ; ----------------------------------------------
 
 ; ==========================================================================
 
@@ -182,7 +179,7 @@
 ;  External Symbol Name
 ;
 %ifndef EXTN
-%define EXTN(name)   _ %+ name		; foo() -> _foo
+%define EXTN(name)   _ %+ name          ; foo() -> _foo
 %endif
 
 ; --------------------------------------------------------------------------
@@ -199,79 +196,79 @@
 ; At present, nasm doesn't seem to support PIC generation for Mach-O.
 ; The PIC support code below is a little tricky.
 
-	SECTION	SEG_CONST
+        SECTION SEG_CONST
 const_base:
 
 %define GOTOFF(got,sym) (got) + (sym) - const_base
 
-%imacro get_GOT	1
-	; NOTE: this macro destroys ecx resister.
-	call	%%geteip
-	add	ecx, byte (%%ref - $)
-	jmp	short %%adjust
+%imacro get_GOT 1
+        ; NOTE: this macro destroys the ecx register.
+        call    %%geteip
+        add     ecx, byte (%%ref - $)
+        jmp     short %%adjust
 %%geteip:
-	mov	ecx, POINTER [esp]
-	ret
+        mov     ecx, POINTER [esp]
+        ret
 %%adjust:
-	push	ebp
-	xor	ebp,ebp		; ebp = 0
-%ifidni %1,ebx	; (%1 == ebx)
-	; db 0x8D,0x9C + jmp near const_base =
-	;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
-	db	0x8D,0x9C		; 8D,9C
-	jmp	near const_base		; E9,(const_base-%%ref)
+        push    ebp
+        xor     ebp,ebp         ; ebp = 0
+%ifidni %1,ebx  ; (%1 == ebx)
+        ; db 0x8D,0x9C + jmp near const_base =
+        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+        db      0x8D,0x9C               ; 8D,9C
+        jmp     near const_base         ; E9,(const_base-%%ref)
 %%ref:
 %else  ; (%1 != ebx)
-	; db 0x8D,0x8C + jmp near const_base =
-	;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
-	db	0x8D,0x8C		; 8D,8C
-	jmp	near const_base		; E9,(const_base-%%ref)
-%%ref:	mov	%1, ecx
+        ; db 0x8D,0x8C + jmp near const_base =
+        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+        db      0x8D,0x8C               ; 8D,8C
+        jmp     near const_base         ; E9,(const_base-%%ref)
+%%ref:  mov     %1, ecx
 %endif ; (%1 == ebx)
-	pop	ebp
+        pop     ebp
 %endmacro
 
-%else	; GOT_SYMBOL != _MACHO_PIC_ ----------------
+%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------
 
 %define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
 
-%imacro get_GOT	1
-	extern	GOT_SYMBOL
-	call	%%geteip
-	add	%1, GOT_SYMBOL + $$ - $ wrt ..gotpc
-	jmp	short %%done
+%imacro get_GOT 1
+        extern  GOT_SYMBOL
+        call    %%geteip
+        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+        jmp     short %%done
 %%geteip:
-	mov	%1, POINTER [esp]
-	ret
+        mov     %1, POINTER [esp]
+        ret
 %%done:
 %endmacro
 
-%endif	; GOT_SYMBOL == _MACHO_PIC_ ----------------
+%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------
 
-%imacro pushpic	1.nolist
-	push	%1
+%imacro pushpic 1.nolist
+        push    %1
 %endmacro
-%imacro poppic	1.nolist
-	pop	%1
+%imacro poppic  1.nolist
+        pop     %1
 %endmacro
-%imacro movpic	2.nolist
-	mov	%1,%2
+%imacro movpic  2.nolist
+        mov     %1,%2
 %endmacro
 
-%else	; !PIC -----------------------------------------
+%else   ; !PIC -----------------------------------------
 
 %define GOTOFF(got,sym) (sym)
 
-%imacro get_GOT	1.nolist
+%imacro get_GOT 1.nolist
 %endmacro
-%imacro pushpic	1.nolist
+%imacro pushpic 1.nolist
 %endmacro
-%imacro poppic	1.nolist
+%imacro poppic  1.nolist
 %endmacro
-%imacro movpic	2.nolist
+%imacro movpic  2.nolist
 %endmacro
 
-%endif	;  PIC -----------------------------------------
+%endif  ;  PIC -----------------------------------------
 
 ; --------------------------------------------------------------------------
 ;  Align the next instruction on {2,4,8,16,..}-byte boundary.
@@ -281,28 +278,28 @@
 %define FILLB(b,n)  (($$-(b)) & ((n)-1))
 
 %imacro alignx 1-2.nolist 0xFFFF
-%%bs:	times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
-	       db 0x90                               ; nop
-	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
-	       db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
-	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
-	       db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
-	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
-	       db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
-	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
-	       db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
-	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
-	       db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
-	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
-	       db 0x8B,0xED                          ; mov ebp,ebp
-	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
-	       db 0x90                               ; nop
+%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
+               db 0x90                               ; nop
+        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
+               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
+        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
+               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
+               db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
+        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
+               db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
+        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
+               db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
+        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
+               db 0x8B,0xED                          ; mov ebp,ebp
+        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
+               db 0x90                               ; nop
 %endmacro
 
 ; Align the next data on {2,4,8,16,..}-byte boundary.
 ;
 %imacro alignz 1.nolist
-	align %1, db 0		; filling zeros
+        align %1, db 0          ; filling zeros
 %endmacro
 
 %ifdef __x86_64__
@@ -310,61 +307,61 @@
 %ifdef WIN64
 
 %imacro collect_args 0
-	push r12
-	push r13
-	push r14
-	push r15
-	mov r10, rcx
-	mov r11, rdx
-	mov r12, r8
-	mov r13, r9
-	mov r14, [rax+48]
-	mov r15, [rax+56]
-	push rsi
-	push rdi
-	sub     rsp, SIZEOF_XMMWORD
-	movaps  XMMWORD [rsp], xmm6
-	sub     rsp, SIZEOF_XMMWORD
-	movaps  XMMWORD [rsp], xmm7
+        push r12
+        push r13
+        push r14
+        push r15
+        mov r10, rcx
+        mov r11, rdx
+        mov r12, r8
+        mov r13, r9
+        mov r14, [rax+48]
+        mov r15, [rax+56]
+        push rsi
+        push rdi
+        sub     rsp, SIZEOF_XMMWORD
+        movaps  XMMWORD [rsp], xmm6
+        sub     rsp, SIZEOF_XMMWORD
+        movaps  XMMWORD [rsp], xmm7
 %endmacro
 
 %imacro uncollect_args 0
-	movaps  xmm7, XMMWORD [rsp]
-	add     rsp, SIZEOF_XMMWORD
-	movaps  xmm6, XMMWORD [rsp]
-	add     rsp, SIZEOF_XMMWORD
-	pop rdi
-	pop rsi
-	pop r15
-	pop r14
-	pop r13
-	pop r12
+        movaps  xmm7, XMMWORD [rsp]
+        add     rsp, SIZEOF_XMMWORD
+        movaps  xmm6, XMMWORD [rsp]
+        add     rsp, SIZEOF_XMMWORD
+        pop rdi
+        pop rsi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
 %endmacro
 
 %else
 
 %imacro collect_args 0
-	push r10
-	push r11
-	push r12
-	push r13
-	push r14
-	push r15
-	mov r10, rdi
-	mov r11, rsi
-	mov r12, rdx
-	mov r13, rcx
-	mov r14, r8
-	mov r15, r9
+        push r10
+        push r11
+        push r12
+        push r13
+        push r14
+        push r15
+        mov r10, rdi
+        mov r11, rsi
+        mov r12, rdx
+        mov r13, rcx
+        mov r14, r8
+        mov r15, r9
 %endmacro
 
 %imacro uncollect_args 0
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop r11
-	pop r10
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        pop r11
+        pop r10
 %endmacro
 
 %endif
@@ -376,14 +373,4 @@
 ;
 %include "jsimdcfg.inc"
 
-; Begin chromium edits
-%ifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
-%define PRIVATE :private_extern
-%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
-%define PRIVATE :hidden
-%else
-%define PRIVATE
-%endif
-; End chromium edits
-
 ; --------------------------------------------------------------------------
diff --git a/simd/nasm_lt.sh b/simd/nasm_lt.sh
new file mode 100644
index 0000000..817be16
--- /dev/null
+++ b/simd/nasm_lt.sh
@@ -0,0 +1,60 @@
+#! /bin/sh
+command=""      # command line to run, rebuilt from the filtered arguments
+infile=""       # last *.asm argument seen; used to derive a default -o name
+o_opt=no        # becomes "yes" once an -o option has been seen
+pic=no          # becomes "yes" once -DPIC has been added (avoids duplicates)
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --silent)
+            exec > /dev/null    # stdout of this script and of the exec'd command is discarded
+            ;;
+        -DPIC|-fPIC|-fpic|-Kpic|-KPIC)    # every PIC spelling collapses to a single -DPIC
+            if [ "$pic" != "yes" ] ; then
+                command="$command -DPIC"
+                pic=yes
+            fi
+            ;;
+        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
+        -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
+            # An output-format specifier for nasm: pass it through unchanged.
+            command="$command $1"
+            ;;
+        -f*)
+            # Any other -f* is presumably a gcc code-generation flag: drop it.
+            ;;
+        -[Ii]*)    # include directory, written either as -Idir or as "-I dir"
+            incdir=`echo "$1" | sed 's/^-[Ii]//'`
+            if [ "x$incdir" = x -a "x$2" != x ] ; then
+                case "$2" in
+                    -*) ;;    # next word is another option, not a directory
+                    *) incdir="$2"; shift;;    # consume the separate directory word
+                esac
+            fi
+            if [ "x$incdir" != x ] ; then
+                # NASM needs a trailing slash; normalize to exactly one.
+                incdir=`echo "$incdir" | sed 's%/*$%/%'`
+                command="$command -I$incdir"
+            fi
+            ;;
+        -o*)    # explicit output option: remember it and pass it through
+            o_opt=yes
+            command="$command $1"
+            ;;
+        *.asm)    # assembly source: remember its name and pass it through
+            infile=$1
+            command="$command $1"
+            ;;
+        *)    # anything else is passed through unchanged
+            command="$command $1"
+            ;;
+    esac
+    shift
+done
+if [ "$o_opt" != yes ] ; then
+    # No -o was given.  NASM would by default write next to the input file, so
+    # name the output explicitly: input basename, extension replaced by ".o".
+    outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
+    command="$command $outfile"
+fi
+echo $command    # show the final command (goes to /dev/null after --silent)
+exec $command    # unquoted so the string splits back into separate words
diff --git a/structure.txt b/structure.txt
new file mode 100644
index 0000000..296d125
--- /dev/null
+++ b/structure.txt
@@ -0,0 +1,906 @@
+IJG JPEG LIBRARY:  SYSTEM ARCHITECTURE
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+It was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+This file provides an overview of the architecture of the IJG JPEG software;
+that is, the functions of the various modules in the system and the interfaces
+between modules.  For more precise details about any data structure or calling
+convention, see the include files and comments in the source code.
+
+We assume that the reader is already somewhat familiar with the JPEG standard.
+The README.ijg file includes references for learning about JPEG.  The file
+libjpeg.txt describes the library from the viewpoint of an application
+programmer using the library; it's best to read that file before this one.
+Also, the file coderules.txt describes the coding style conventions we use.
+
+In this document, JPEG-specific terminology follows the JPEG standard:
+  A "component" means a color channel, e.g., Red or Luminance.
+  A "sample" is a single component value (i.e., one number in the image data).
+  A "coefficient" is a frequency coefficient (a DCT transform output number).
+  A "block" is an 8x8 group of samples or coefficients.
+  An "MCU" (minimum coded unit) is an interleaved set of blocks of size
+        determined by the sampling factors, or a single block in a
+        noninterleaved scan.
+We do not use the terms "pixel" and "sample" interchangeably.  When we say
+pixel, we mean an element of the full-size image, while a sample is an element
+of the downsampled image.  Thus the number of samples may vary across
+components while the number of pixels does not.  (This terminology is not used
+rigorously throughout the code, but it is used in places where confusion would
+otherwise result.)
+
+
+*** System features ***
+
+The IJG distribution contains two parts:
+  * A subroutine library for JPEG compression and decompression.
+  * cjpeg/djpeg, two sample applications that use the library to transform
+    JFIF JPEG files to and from several other image formats.
+cjpeg/djpeg are of no great intellectual complexity: they merely add a simple
+command-line user interface and I/O routines for several uncompressed image
+formats.  This document concentrates on the library itself.
+
+We desire the library to be capable of supporting all JPEG baseline, extended
+sequential, and progressive DCT processes.  Hierarchical processes are not
+supported.
+
+The library does not support the lossless (spatial) JPEG process.  Lossless
+JPEG shares little or no code with lossy JPEG, and would normally be used
+without the extensive pre- and post-processing provided by this library.
+We feel that lossless JPEG is better handled by a separate library.
+
+Within these limits, any set of compression parameters allowed by the JPEG
+spec should be readable for decompression.  (We can be more restrictive about
+what formats we can generate.)  Although the system design allows for all
+parameter values, some uncommon settings are not yet implemented and may
+never be; nonintegral sampling ratios are the prime example.  Furthermore,
+we treat 8-bit vs. 12-bit data precision as a compile-time switch, not a
+run-time option, because most machines can store 8-bit pixels much more
+compactly than 12-bit.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, libtiff uses this
+library to implement JPEG compression within the TIFF file format.)
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  This code can be omitted if not needed.
+
+A wide range of quality vs. speed tradeoffs is possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+
+*** System overview ***
+
+The compressor and decompressor are each divided into two main sections:
+the JPEG compressor or decompressor proper, and the preprocessing or
+postprocessing functions.  The interface between these two sections is the
+image data that the official JPEG spec regards as its input or output: this
+data is in the colorspace to be used for compression, and it is downsampled
+to the sampling factors to be used.  The preprocessing and postprocessing
+steps are responsible for converting a normal image representation to or from
+this form.  (Those few applications that want to deal with YCbCr downsampled
+data can skip the preprocessing or postprocessing step.)
+
+Looking more closely, the compressor library contains the following main
+elements:
+
+  Preprocessing:
+    * Color space conversion (e.g., RGB to YCbCr).
+    * Edge expansion and downsampling.  Optionally, this step can do simple
+      smoothing --- this is often helpful for low-quality source data.
+  JPEG proper:
+    * MCU assembly, DCT, quantization.
+    * Entropy coding (sequential or progressive, Huffman or arithmetic).
+
+In addition to these modules we need overall control, marker generation,
+and support code (memory management & error handling).  There is also a
+module responsible for physically writing the output data --- typically
+this is just an interface to fwrite(), but some applications may need to
+do something else with the data.
+
+The decompressor library contains the following main elements:
+
+  JPEG proper:
+    * Entropy decoding (sequential or progressive, Huffman or arithmetic).
+    * Dequantization, inverse DCT, MCU disassembly.
+  Postprocessing:
+    * Upsampling.  Optionally, this step may be able to do more general
+      rescaling of the image.
+    * Color space conversion (e.g., YCbCr to RGB).  This step may also
+      provide gamma adjustment [ currently it does not ].
+    * Optional color quantization (e.g., reduction to 256 colors).
+    * Optional color precision reduction (e.g., 24-bit to 15-bit color).
+      [This feature is not currently implemented.]
+
+We also need overall control, marker parsing, and a data source module.
+The support code (memory management & error handling) can be shared with
+the compression half of the library.
+
+There may be several implementations of each of these elements, particularly
+in the decompressor, where a wide range of speed/quality tradeoffs is very
+useful.  It must be understood that some of the best speedups involve
+merging adjacent steps in the pipeline.  For example, upsampling, color space
+conversion, and color quantization might all be done at once when using a
+low-quality ordered-dither technique.  The system architecture is designed to
+allow such merging where appropriate.
+
+
+Note: it is convenient to regard edge expansion (padding to block boundaries)
+as a preprocessing/postprocessing function, even though the JPEG spec includes
+it in compression/decompression.  We do this because downsampling/upsampling
+can be simplified a little if they work on padded data: it's not necessary to
+have special cases at the right and bottom edges.  Therefore the interface
+buffer is always an integral number of blocks wide and high, and we expect
+compression preprocessing to pad the source data properly.  Padding will occur
+only to the next block (8-sample) boundary.  In an interleaved-scan situation,
+additional dummy blocks may be used to fill out MCUs, but the MCU assembly and
+disassembly logic will create or discard these blocks internally.  (This is
+advantageous for speed reasons, since we avoid DCTing the dummy blocks.
+It also permits a small reduction in file size, because the compressor can
+choose dummy block contents so as to minimize their size in compressed form.
+Finally, it makes the interface buffer specification independent of whether
+the file is actually interleaved or not.)  Applications that wish to deal
+directly with the downsampled data must provide similar buffering and padding
+for odd-sized images.
+
+
+*** Poor man's object-oriented programming ***
+
+It should be clear by now that we have a lot of quasi-independent processing
+steps, many of which have several possible behaviors.  To avoid cluttering the
+code with lots of switch statements, we use a simple form of object-style
+programming to separate out the different possibilities.
+
+For example, two different color quantization algorithms could be implemented
+as two separate modules that present the same external interface; at runtime,
+the calling code will access the proper module indirectly through an "object".
+
+We can get the limited features we need while staying within portable C.
+The basic tool is a function pointer.  An "object" is just a struct
+containing one or more function pointer fields, each of which corresponds to
+a method name in real object-oriented languages.  During initialization we
+fill in the function pointers with references to whichever module we have
+determined we need to use in this run.  Then invocation of the module is done
+by indirecting through a function pointer; on most machines this is no more
+expensive than a switch statement, which would be the only other way of
+making the required run-time choice.  The really significant benefit, of
+course, is keeping the source code clean and well structured.
+
+We can also arrange to have private storage that varies between different
+implementations of the same kind of object.  We do this by making all the
+module-specific object structs be separately allocated entities, which will
+be accessed via pointers in the master compression or decompression struct.
+The "public" fields or methods for a given kind of object are specified by
+a commonly known struct.  But a module's initialization code can allocate
+a larger struct that contains the common struct as its first member, plus
+additional private fields.  With appropriate pointer casting, the module's
+internal functions can access these private fields.  (For a simple example,
+see jdatadst.c, which implements the external interface specified by struct
+jpeg_destination_mgr, but adds extra fields.)
+
+(Of course this would all be a lot easier if we were using C++, but we are
+not yet prepared to assume that everyone has a C++ compiler.)
+
+An important benefit of this scheme is that it is easy to provide multiple
+versions of any method, each tuned to a particular case.  While a lot of
+precalculation might be done to select an optimal implementation of a method,
+the cost per invocation is constant.  For example, the upsampling step might
+have a "generic" method, plus one or more "hardwired" methods for the most
+popular sampling factors; the hardwired methods would be faster because they'd
+use straight-line code instead of for-loops.  The cost to determine which
+method to use is paid only once, at startup, and the selection criteria are
+hidden from the callers of the method.
+
+This plan differs a little bit from usual object-oriented structures, in that
+only one instance of each object class will exist during execution.  The
+reason for having the class structure is that on different runs we may create
+different instances (choose to execute different modules).  You can think of
+the term "method" as denoting the common interface presented by a particular
+set of interchangeable functions, and "object" as denoting a group of related
+methods, or the total shared interface behavior of a group of modules.
+
+
+*** Overall control structure ***
+
+We previously mentioned the need for overall control logic in the compression
+and decompression libraries.  In IJG implementations prior to v5, overall
+control was mostly provided by "pipeline control" modules, which proved to be
+large, unwieldy, and hard to understand.  To improve the situation, the
+control logic has been subdivided into multiple modules.  The control modules
+consist of:
+
+1. Master control for module selection and initialization.  This has two
+responsibilities:
+
+   1A.  Startup initialization at the beginning of image processing.
+        The individual processing modules to be used in this run are selected
+        and given initialization calls.
+
+   1B.  Per-pass control.  This determines how many passes will be performed
+        and calls each active processing module to configure itself
+        appropriately at the beginning of each pass.  End-of-pass processing,
+        where necessary, is also invoked from the master control module.
+
+   Method selection is partially distributed, in that a particular processing
+   module may contain several possible implementations of a particular method,
+   which it will select among when given its initialization call.  The master
+   control code need only be concerned with decisions that affect more than
+   one module.
+
+2. Data buffering control.  A separate control module exists for each
+   inter-processing-step data buffer.  This module is responsible for
+   invoking the processing steps that write or read that data buffer.
+
+Each buffer controller sees the world as follows:
+
+input data => processing step A => buffer => processing step B => output data
+                      |              |               |
+              ------------------ controller ------------------
+
+The controller knows the dataflow requirements of steps A and B: how much data
+they want to accept in one chunk and how much they output in one chunk.  Its
+function is to manage its buffer and call A and B at the proper times.
+
+A data buffer control module may itself be viewed as a processing step by a
+higher-level control module; thus the control modules form a binary tree with
+elementary processing steps at the leaves of the tree.
+
+The control modules are objects.  A considerable amount of flexibility can
+be had by replacing implementations of a control module.  For example:
+* Merging of adjacent steps in the pipeline is done by replacing a control
+  module and its pair of processing-step modules with a single processing-
+  step module.  (Hence the possible merges are determined by the tree of
+  control modules.)
+* In some processing modes, a given interstep buffer need only be a "strip"
+  buffer large enough to accommodate the desired data chunk sizes.  In other
+  modes, a full-image buffer is needed and several passes are required.
+  The control module determines which kind of buffer is used and manipulates
+  virtual array buffers as needed.  One or both processing steps may be
+  unaware of the multi-pass behavior.
+
+In theory, we might be able to make all of the data buffer controllers
+interchangeable and provide just one set of implementations for all.  In
+practice, each one contains considerable special-case processing for its
+particular job.  The buffer controller concept should be regarded as an
+overall system structuring principle, not as a complete description of the
+task performed by any one controller.
+
+
+*** Compression object structure ***
+
+Here is a sketch of the logical structure of the JPEG compression library:
+
+                                                 |-- Colorspace conversion
+                  |-- Preprocessing controller --|
+                  |                              |-- Downsampling
+Main controller --|
+                  |                            |-- Forward DCT, quantize
+                  |-- Coefficient controller --|
+                                               |-- Entropy encoding
+
+This sketch also describes the flow of control (subroutine calls) during
+typical image data processing.  Each of the components shown in the diagram is
+an "object" which may have several different implementations available.  One
+or more source code files contain the actual implementation(s) of each object.
+
+The objects shown above are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the preprocessed input data.  This controller invokes preprocessing to
+  fill the subsampled-data buffer, and JPEG compression to empty it.  There is
+  usually no need for a full-image buffer here; a strip buffer is adequate.
+
+* Preprocessing controller: buffer controller for the downsampling input data
+  buffer, which lies between colorspace conversion and downsampling.  Note
+  that a unified conversion/downsampling module would probably replace this
+  controller entirely.
+
+* Colorspace conversion: converts application image data into the desired
+  JPEG color space; also changes the data from pixel-interleaved layout to
+  separate component planes.  Processes one pixel row at a time.
+
+* Downsampling: performs reduction of chroma components as required.
+  Optionally may perform pixel-level smoothing as well.  Processes a "row
+  group" at a time, where a row group is defined as Vmax pixel rows of each
+  component before downsampling, and Vk sample rows afterwards (remember Vk
+  differs across components).  Some downsampling or smoothing algorithms may
+  require context rows above and below the current row group; the
+  preprocessing controller is responsible for supplying these rows via proper
+  buffering.  The downsampler is responsible for edge expansion at the right
+  edge (i.e., extending each sample row to a multiple of 8 samples); but the
+  preprocessing controller is responsible for vertical edge expansion (i.e.,
+  duplicating the bottom sample row as needed to make a multiple of 8 rows).
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU assembly, including insertion of dummy DCT
+  blocks when needed at the right or bottom edge.  When performing
+  Huffman-code optimization or emitting a multiscan JPEG file, this
+  controller is responsible for buffering the full image.  The equivalent of
+  one fully interleaved MCU row of subsampled data is processed per call,
+  even when the JPEG file is noninterleaved.
+
+* Forward DCT and quantization: Perform DCT, quantize, and emit coefficients.
+  Works on one or more DCT blocks at a time.  (Note: the coefficients are now
+  emitted in normal array order, which the entropy encoder is expected to
+  convert to zigzag order as necessary.  Prior versions of the IJG code did
+  the conversion to zigzag order within the quantization step.)
+
+* Entropy encoding: Perform Huffman or arithmetic entropy coding and emit the
+  coded data to the data destination module.  Works on one MCU per call.
+  For progressive JPEG, the same DCT blocks are fed to the entropy coder
+  during each pass, and the coder must emit the appropriate subset of
+  coefficients.
+
+In addition to the above objects, the compression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.
+
+* Marker writing: generates JPEG markers (except for RSTn, which is emitted
+  by the entropy encoder when needed).
+
+* Data destination manager: writes the output JPEG datastream to its final
+  destination (e.g., a file).  The destination manager supplied with the
+  library knows how to write to a stdio stream or to a memory buffer;
+  for other behaviors, the surrounding application may provide its own
+  destination manager.
+
+* Memory manager: allocates and releases memory, controls virtual arrays
+  (with backing store management, where required).
+
+* Error handler: performs formatting and output of error and trace messages;
+  determines handling of nonfatal errors.  The surrounding application may
+  override some or all of this object's methods to change error handling.
+
+* Progress monitor: supports output of "percent-done" progress reports.
+  This object represents an optional callback to the surrounding application:
+  if wanted, it must be supplied by the application.
+
+The error handler, destination manager, and progress monitor objects are
+defined as separate objects in order to simplify application-specific
+customization of the JPEG library.  A surrounding application may override
+individual methods or supply its own all-new implementation of one of these
+objects.  The object interfaces for these objects are therefore treated as
+part of the application interface of the library, whereas the other objects
+are internal to the library.
+
+The error handler and memory manager are shared by JPEG compression and
+decompression; the progress monitor, if used, may be shared as well.
+
+
+*** Decompression object structure ***
+
+Here is a sketch of the logical structure of the JPEG decompression library:
+
+                                               |-- Entropy decoding
+                  |-- Coefficient controller --|
+                  |                            |-- Dequantize, Inverse DCT
+Main controller --|
+                  |                               |-- Upsampling
+                  |-- Postprocessing controller --|   |-- Colorspace conversion
+                                                  |-- Color quantization
+                                                  |-- Color precision reduction
+
+As before, this diagram also represents typical control flow.  The objects
+shown are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the output of JPEG decompression proper.  This controller's primary
+  task is to feed the postprocessing procedure.  Some upsampling algorithms
+  may require context rows above and below the current row group; when this
+  is true, the main controller is responsible for managing its buffer so as
+  to make context rows available.  In the current design, the main buffer is
+  always a strip buffer; a full-image buffer is never required.
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU disassembly, including deletion of any dummy
+  DCT blocks at the right or bottom edge.  When reading a multiscan JPEG
+  file, this controller is responsible for buffering the full image.
+  (Buffering DCT coefficients, rather than samples, is necessary to support
+  progressive JPEG.)  The equivalent of one fully interleaved MCU row of
+  subsampled data is processed per call, even when the source JPEG file is
+  noninterleaved.
+
+* Entropy decoding: Read coded data from the data source module and perform
+  Huffman or arithmetic entropy decoding.  Works on one MCU per call.
+  For progressive JPEG decoding, the coefficient controller supplies the prior
+  coefficients of each MCU (initially all zeroes), which the entropy decoder
+  modifies in each scan.
+
+* Dequantization and inverse DCT: like it says.  Note that the coefficients
+  buffered by the coefficient controller have NOT been dequantized; we
+  merge dequantization and inverse DCT into a single step for speed reasons.
+  When scaled-down output is asked for, simplified DCT algorithms may be used
+  that emit fewer samples per DCT block, not the full 8x8.  Works on one DCT
+  block at a time.
+
+* Postprocessing controller: buffer controller for the color quantization
+  input buffer, when quantization is in use.  (Without quantization, this
+  controller just calls the upsampler.)  For two-pass quantization, this
+  controller is responsible for buffering the full-image data.
+
+* Upsampling: restores chroma components to full size.  (May support more
+  general output rescaling, too.  Note that if undersized DCT outputs have
+  been emitted by the DCT module, this module must adjust so that properly
+  sized outputs are created.)  Works on one row group at a time.  This module
+  also calls the color conversion module, so its top level is effectively a
+  buffer controller for the upsampling->color conversion buffer.  However, in
+  all but the highest-quality operating modes, upsampling and color
+  conversion are likely to be merged into a single step.
+
+* Colorspace conversion: convert from JPEG color space to output color space,
+  and change data layout from separate component planes to pixel-interleaved.
+  Works on one pixel row at a time.
+
+* Color quantization: reduce the data to colormapped form, using either an
+  externally specified colormap or an internally generated one.  This module
+  is not used for full-color output.  Works on one pixel row at a time; may
+  require two passes to generate a color map.  Note that the output will
+  always be a single component representing colormap indexes.  In the current
+  design, the output values are JSAMPLEs, so an 8-bit compilation cannot
+  quantize to more than 256 colors.  This is unlikely to be a problem in
+  practice.
+
+* Color reduction: this module handles color precision reduction, e.g.,
+  generating 15-bit color (5 bits/primary) from JPEG's 24-bit output.
+  Not quite clear yet how this should be handled... should we merge it with
+  colorspace conversion???
+
+Note that some high-speed operating modes might condense the entire
+postprocessing sequence to a single module (upsample, color convert, and
+quantize in one step).
+
+In addition to the above objects, the decompression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.  This is subdivided into
+  input and output control: jdinput.c controls only input-side processing,
+  while jdmaster.c handles overall initialization and output-side control.
+
+* Marker reading: decodes JPEG markers (except for RSTn).
+
+* Data source manager: supplies the input JPEG datastream.  The source
+  manager supplied with the library knows how to read from a stdio stream
+  or from a memory buffer; for other behaviors, the surrounding application
+  may provide its own source manager.
+
+* Memory manager: same as for compression library.
+
+* Error handler: same as for compression library.
+
+* Progress monitor: same as for compression library.
+
+As with compression, the data source manager, error handler, and progress
+monitor are candidates for replacement by a surrounding application.
+
+
+*** Decompression input and output separation ***
+
+To support efficient incremental display of progressive JPEG files, the
+decompressor is divided into two sections that can run independently:
+
+1. Data input includes marker parsing, entropy decoding, and input into the
+   coefficient controller's DCT coefficient buffer.  Note that this
+   processing is relatively cheap and fast.
+
+2. Data output reads from the DCT coefficient buffer and performs the IDCT
+   and all postprocessing steps.
+
+For a progressive JPEG file, the data input processing is allowed to get
+arbitrarily far ahead of the data output processing.  (This occurs only
+if the application calls jpeg_consume_input(); otherwise input and output
+run in lockstep, since the input section is called only when the output
+section needs more data.)  In this way the application can avoid making
+extra display passes when data is arriving faster than the display pass
+can run.  Furthermore, it is possible to abort an output pass without
+losing anything, since the coefficient buffer is read-only as far as the
+output section is concerned.  See libjpeg.txt for more detail.
+
+A full-image coefficient array is only created if the JPEG file has multiple
+scans (or if the application specifies buffered-image mode anyway).  When
+reading a single-scan file, the coefficient controller normally creates only
+a one-MCU buffer, so input and output processing must run in lockstep in this
+case.  jpeg_consume_input() is effectively a no-op in this situation.
+
+The main impact of dividing the decompressor in this fashion is that we must
+be very careful with shared variables in the cinfo data structure.  Each
+variable that can change during the course of decompression must be
+classified as belonging to data input or data output, and each section must
+look only at its own variables.  For example, the data output section may not
+depend on any of the variables that describe the current scan in the JPEG
+file, because these may change as the data input section advances into a new
+scan.
+
+The progress monitor is (somewhat arbitrarily) defined to treat input of the
+file as one pass when buffered-image mode is not used, and to ignore data
+input work completely when buffered-image mode is used.  Note that the
+library has no reliable way to predict the number of passes when dealing
+with a progressive JPEG file, nor can it predict the number of output passes
+in buffered-image mode.  So the work estimate is inherently bogus anyway.
+
+No comparable division is currently made in the compression library, because
+there isn't any real need for it.
+
+
+*** Data formats ***
+
+Arrays of pixel sample values use the following data structure:
+
+    typedef something JSAMPLE;          a pixel component value, 0..MAXJSAMPLE
+    typedef JSAMPLE *JSAMPROW;          ptr to a row of samples
+    typedef JSAMPROW *JSAMPARRAY;       ptr to a list of rows
+    typedef JSAMPARRAY *JSAMPIMAGE;     ptr to a list of color-component arrays
+
+The basic element type JSAMPLE will typically be one of unsigned char,
+(signed) char, or short.  Short will be used if samples wider than 8 bits are
+to be supported (this is a compile-time option).  Otherwise, unsigned char is
+used if possible.  If the compiler only supports signed chars, then it is
+necessary to mask off the value when reading.  Thus, all reads of JSAMPLE
+values must be coded as "GETJSAMPLE(value)", where the macro will be defined
+as "((value) & 0xFF)" on signed-char machines and "((int) (value))" elsewhere.
+
+With these conventions, JSAMPLE values can be assumed to be >= 0.  This helps
+simplify correct rounding during downsampling, etc.  The JPEG standard's
+specification that sample values run from -128..127 is accommodated by
+subtracting 128 from the sample value in the DCT step.  Similarly, during
+decompression the output of the IDCT step will be immediately shifted back to
+0..255.  (NB: different values are required when 12-bit samples are in use.
+The code is written in terms of MAXJSAMPLE and CENTERJSAMPLE, which will be
+defined as 255 and 128 respectively in an 8-bit implementation, and as 4095
+and 2048 in a 12-bit implementation.)
+
+We use a pointer per row, rather than a two-dimensional JSAMPLE array.  This
+choice costs only a small amount of memory and has several benefits:
+* Code using the data structure doesn't need to know the allocated width of
+  the rows.  This simplifies edge expansion/compression, since we can work
+  in an array that's wider than the logical picture width.
+* Indexing doesn't require multiplication; this is a performance win on many
+  machines.
+* Arrays with more than 64K total elements can be supported even on machines
+  where malloc() cannot allocate chunks larger than 64K.
+* The rows forming a component array may be allocated at different times
+  without extra copying.  This trick allows some speedups in smoothing steps
+  that need access to the previous and next rows.
+
+Note that each color component is stored in a separate array; we don't use the
+traditional layout in which the components of a pixel are stored together.
+This simplifies coding of modules that work on each component independently,
+because they don't need to know how many components there are.  Furthermore,
+we can read or write each component to a temporary file independently, which
+is helpful when dealing with noninterleaved JPEG files.
+
+In general, a specific sample value is accessed by code such as
+        GETJSAMPLE(image[colorcomponent][row][col])
+where col is measured from the image left edge, but row is measured from the
+first sample row currently in memory.  Either of the first two indexings can
+be precomputed by copying the relevant pointer.
+
+
+Since most image-processing applications prefer to work on images in which
+the components of a pixel are stored together, the data passed to or from the
+surrounding application uses the traditional convention: a single pixel is
+represented by N consecutive JSAMPLE values, and an image row is an array of
+(# of color components)*(image width) JSAMPLEs.  One or more rows of data can
+be represented by a pointer of type JSAMPARRAY in this scheme.  This scheme is
+converted to component-wise storage inside the JPEG library.  (Applications
+that want to skip JPEG preprocessing or postprocessing will have to contend
+with component-wise storage.)
+
+
+Arrays of DCT-coefficient values use the following data structure:
+
+    typedef short JCOEF;                a 16-bit signed integer
+    typedef JCOEF JBLOCK[DCTSIZE2];     an 8x8 block of coefficients
+    typedef JBLOCK *JBLOCKROW;          ptr to one horizontal row of 8x8 blocks
+    typedef JBLOCKROW *JBLOCKARRAY;     ptr to a list of such rows
+    typedef JBLOCKARRAY *JBLOCKIMAGE;   ptr to a list of color component arrays
+
+The underlying type is at least a 16-bit signed integer; while "short" is big
+enough on all machines of interest, on some machines it is preferable to use
+"int" for speed reasons, despite the storage cost.  Coefficients are grouped
+into 8x8 blocks (but we always use #defines DCTSIZE and DCTSIZE2 rather than
+"8" and "64").
+
+The contents of a coefficient block may be in either "natural" or zigzagged
+order, and may be true values or divided by the quantization coefficients,
+depending on where the block is in the processing pipeline.  In the current
+library, coefficient blocks are kept in natural order everywhere; the entropy
+codecs zigzag or dezigzag the data as it is written or read.  The blocks
+contain quantized coefficients everywhere outside the DCT/IDCT subsystems.
+(This latter decision may need to be revisited to support variable
+quantization a la JPEG Part 3.)
+
+Notice that the allocation unit is now a row of 8x8 blocks, corresponding to
+eight rows of samples.  Otherwise the structure is much the same as for
+samples, and for the same reasons.
+
+
+*** Suspendable processing ***
+
+In some applications it is desirable to use the JPEG library as an
+incremental, memory-to-memory filter.  In this situation the data source or
+destination may be a limited-size buffer, and we can't rely on being able to
+empty or refill the buffer at arbitrary times.  Instead the application would
+like to have control return from the library at buffer overflow/underrun, and
+then resume compression or decompression at a later time.
+
+This scenario is supported for simple cases.  (For anything more complex, we
+recommend that the application "bite the bullet" and develop real multitasking
+capability.)  The libjpeg.txt file goes into more detail about the usage and
+limitations of this capability; here we address the implications for library
+structure.
+
+The essence of the problem is that the entropy codec (coder or decoder) must
+be prepared to stop at arbitrary times.  In turn, the controllers that call
+the entropy codec must be able to stop before having produced or consumed all
+the data that they normally would handle in one call.  That part is reasonably
+straightforward: we make the controller call interfaces include "progress
+counters" which indicate the number of data chunks successfully processed, and
+we require callers to test the counter rather than just assume all of the data
+was processed.
+
+Rather than trying to restart at an arbitrary point, the current Huffman
+codecs are designed to restart at the beginning of the current MCU after a
+suspension due to buffer overflow/underrun.  At the start of each call, the
+codec's internal state is loaded from permanent storage (in the JPEG object
+structures) into local variables.  On successful completion of the MCU, the
+permanent state is updated.  (This copying is not very expensive, and may even
+lead to *improved* performance if the local variables can be registerized.)
+If a suspension occurs, the codec simply returns without updating the state,
+thus effectively reverting to the start of the MCU.  Note that this implies
+leaving some data unprocessed in the source/destination buffer (i.e., the
+compressed partial MCU).  The data source/destination module interfaces are
+specified so as to make this possible.  This also implies that the data buffer
+must be large enough to hold a worst-case compressed MCU; a couple thousand
+bytes should be enough.
+
+In a successive-approximation AC refinement scan, the progressive Huffman
+decoder has to be able to undo assignments of newly nonzero coefficients if it
+suspends before the MCU is complete, since decoding requires distinguishing
+previously-zero and previously-nonzero coefficients.  This is a bit tedious
+but probably won't have much effect on performance.  Other variants of Huffman
+decoding need not worry about this, since they will just store the same values
+again if forced to repeat the MCU.
+
+This approach would probably not work for an arithmetic codec, since its
+modifiable state is quite large and couldn't be copied cheaply.  Instead it
+would have to suspend and resume exactly at the point of the buffer end.
+
+The JPEG marker reader is designed to cope with suspension at an arbitrary
+point.  It does so by backing up to the start of the marker parameter segment,
+so the data buffer must be big enough to hold the largest marker of interest.
+Again, a couple KB should be adequate.  (A special "skip" convention is used
+to bypass COM and APPn markers, so these can be larger than the buffer size
+without causing problems; otherwise a 64K buffer would be needed in the worst
+case.)
+
+The JPEG marker writer currently does *not* cope with suspension.
+We feel that this is not necessary; it is much easier simply to require
+the application to ensure there is enough buffer space before starting.  (An
+empty 2K buffer is more than sufficient for the header markers; and ensuring
+there are a dozen or two bytes available before calling jpeg_finish_compress()
+will suffice for the trailer.)  This would not work for writing multi-scan
+JPEG files, but we simply do not intend to support that capability with
+suspension.
+
+
+*** Memory manager services ***
+
+The JPEG library's memory manager controls allocation and deallocation of
+memory, and it manages large "virtual" data arrays on machines where the
+operating system does not provide virtual memory.  Note that the same
+memory manager serves both compression and decompression operations.
+
+In all cases, allocated objects are tied to a particular compression or
+decompression master record, and they will be released when that master
+record is destroyed.
+
+The memory manager does not provide explicit deallocation of objects.
+Instead, objects are created in "pools" of free storage, and a whole pool
+can be freed at once.  This approach helps prevent storage-leak bugs, and
+it speeds up operations whenever malloc/free are slow (as they often are).
+The pools can be regarded as lifetime identifiers for objects.  Two
+pools/lifetimes are defined:
+  * JPOOL_PERMANENT     lasts until master record is destroyed
+  * JPOOL_IMAGE         lasts until done with image (JPEG datastream)
+Permanent lifetime is used for parameters and tables that should be carried
+across from one datastream to another; this includes all application-visible
+parameters.  Image lifetime is used for everything else.  (A third lifetime,
+JPOOL_PASS = one processing pass, was originally planned.  However it was
+dropped as not being worthwhile.  The actual usage patterns are such that the
+peak memory usage would be about the same anyway; and having per-pass storage
+substantially complicates the virtual memory allocation rules --- see below.)
+
+The memory manager deals with three kinds of object:
+1. "Small" objects.  Typically these require no more than 10K-20K total.
+2. "Large" objects.  These may require tens to hundreds of K depending on
+   image size.  Semantically they behave the same as small objects, but we
+   distinguish them because pool allocation heuristics may differ for large and
+   small objects.  (Historically, large objects were also referenced by far
+   pointers on MS-DOS machines.)  Note that individual "large" objects cannot
+   exceed the size allowed by type size_t, which may be 64K or less on some
+   machines.
+3. "Virtual" objects.  These are large 2-D arrays of JSAMPLEs or JBLOCKs
+   (typically large enough for the entire image being processed).  The
+   memory manager provides stripwise access to these arrays.  On machines
+   without virtual memory, the rest of the array may be swapped out to a
+   temporary file.
+
+(Note: JSAMPARRAY and JBLOCKARRAY data structures are a combination of large
+objects for the data proper and small objects for the row pointers.  For
+convenience and speed, the memory manager provides single routines to create
+these structures.  Similarly, virtual arrays include a small control block
+and a JSAMPARRAY or JBLOCKARRAY working buffer, all created with one call.)
+
+In the present implementation, virtual arrays are only permitted to have image
+lifespan.  (Permanent lifespan would not be reasonable, and pass lifespan is
+not very useful since a virtual array's raison d'etre is to store data for
+multiple passes through the image.)  We also expect that only "small" objects
+will be given permanent lifespan, though this restriction is not required by
+the memory manager.
+
+In a non-virtual-memory machine, some performance benefit can be gained by
+making the in-memory buffers for virtual arrays be as large as possible.
+(For small images, the buffers might fit entirely in memory, so blind
+swapping would be very wasteful.)  The memory manager will adjust the height
+of the buffers to fit within a prespecified maximum memory usage.  In order
+to do this in a reasonably optimal fashion, the manager needs to allocate all
+of the virtual arrays at once.  Therefore, there isn't a one-step allocation
+routine for virtual arrays; instead, there is a "request" routine that simply
+allocates the control block, and a "realize" routine (called just once) that
+determines space allocation and creates all of the actual buffers.  The
+realize routine must allow for space occupied by non-virtual large objects.
+(We don't bother to factor in the space needed for small objects, on the
+grounds that it isn't worth the trouble.)
+
+To support all this, we establish the following protocol for doing business
+with the memory manager:
+  1. Modules must request virtual arrays (which may have only image lifespan)
+     during the initial setup phase, i.e., in their jinit_xxx routines.
+  2. All "large" objects (including JSAMPARRAYs and JBLOCKARRAYs) must also be
+     allocated during initial setup.
+  3. realize_virt_arrays will be called at the completion of initial setup.
+     The above conventions ensure that sufficient information is available
+     for it to choose a good size for virtual array buffers.
+Small objects of any lifespan may be allocated at any time.  We expect that
+the total space used for small objects will be small enough to be negligible
+in the realize_virt_arrays computation.
+
+In a virtual-memory machine, we simply pretend that the available space is
+infinite, thus causing realize_virt_arrays to decide that it can allocate all
+the virtual arrays as full-size in-memory buffers.  The overhead of the
+virtual-array access protocol is very small when no swapping occurs.
+
+A virtual array can be specified to be "pre-zeroed"; when this flag is set,
+never-yet-written sections of the array are set to zero before being made
+available to the caller.  If this flag is not set, never-written sections
+of the array contain garbage.  (This feature exists primarily because the
+equivalent logic would otherwise be needed in jdcoefct.c for progressive
+JPEG mode; we may as well make it available for possible other uses.)
+
+The first write pass on a virtual array is required to occur in top-to-bottom
+order; read passes, as well as any write passes after the first one, may
+access the array in any order.  This restriction exists partly to simplify
+the virtual array control logic, and partly because some file systems may not
+support seeking beyond the current end-of-file in a temporary file.  The main
+implication of this restriction is that rearrangement of rows (such as
+converting top-to-bottom data order to bottom-to-top) must be handled while
+reading data out of the virtual array, not while putting it in.
+
+
+*** Memory manager internal structure ***
+
+To isolate system dependencies as much as possible, we have broken the
+memory manager into two parts.  There is a reasonably system-independent
+"front end" (jmemmgr.c) and a "back end" that contains only the code
+likely to change across systems.  All of the memory management methods
+outlined above are implemented by the front end.  The back end provides
+the following routines for use by the front end (none of these routines
+are known to the rest of the JPEG code):
+
+jpeg_mem_init, jpeg_mem_term    system-dependent initialization/shutdown
+
+jpeg_get_small, jpeg_free_small interface to malloc and free library routines
+                                (or their equivalents)
+
+jpeg_get_large, jpeg_free_large historically was used to interface with
+                                FAR malloc/free on MS-DOS machines; now the
+                                same as jpeg_get_small/jpeg_free_small
+
+jpeg_mem_available              estimate available memory
+
+jpeg_open_backing_store         create a backing-store object
+
+read_backing_store,             manipulate a backing-store object
+write_backing_store,
+close_backing_store
+
+On some systems there will be more than one type of backing-store object
+(specifically, in MS-DOS a backing store file might be an area of extended
+memory as well as a disk file).  jpeg_open_backing_store is responsible for
+choosing how to implement a given object.  The read/write/close routines
+are method pointers in the structure that describes a given object; this
+lets them be different for different object types.
+
+It may be necessary to ensure that backing store objects are explicitly
+released upon abnormal program termination.  For example, MS-DOS won't free
+extended memory by itself.  To support this, we will expect the main program
+or surrounding application to arrange to call self_destruct (typically via
+jpeg_destroy) upon abnormal termination.  This may require a SIGINT signal
+handler or equivalent.  We don't want to have the back end module install its
+own signal handler, because that would pre-empt the surrounding application's
+ability to control signal handling.
+
+The IJG distribution includes several memory manager back end implementations.
+Usually the same back end should be suitable for all applications on a given
+system, but it is possible for an application to supply its own back end at
+need.
+
+
+*** Implications of DNL marker ***
+
+Some JPEG files may use a DNL marker to postpone definition of the image
+height (this would be useful for a fax-like scanner's output, for instance).
+In these files the SOF marker claims the image height is 0, and you only
+find out the true image height at the end of the first scan.
+
+We could read these files as follows:
+1. Upon seeing zero image height, replace it by 65535 (the maximum allowed).
+2. When the DNL is found, update the image height in the global image
+   descriptor.
+This implies that control modules must avoid making copies of the image
+height, and must re-test for termination after each MCU row.  This would
+be easy enough to do.
+
+In cases where image-size data structures are allocated, this approach will
+result in very inefficient use of virtual memory or much-larger-than-necessary
+temporary files.  This seems acceptable for something that probably won't be a
+mainstream usage.  People might have to forgo use of memory-hogging options
+(such as two-pass color quantization or noninterleaved JPEG files) if they
+want efficient conversion of such files.  (One could improve efficiency by
+demanding a user-supplied upper bound for the height, less than 65536; in most
+cases it could be much less.)
+
+The standard also permits the SOF marker to overestimate the image height,
+with a DNL to give the true, smaller height at the end of the first scan.
+This would solve the space problems if the overestimate wasn't too great.
+However, it implies that you don't even know whether DNL will be used.
+
+This leads to a couple of very serious objections:
+1. Testing for a DNL marker must occur in the inner loop of the decompressor's
+   Huffman decoder; this implies a speed penalty whether the feature is used
+   or not.
+2. There is no way to hide the last-minute change in image height from an
+   application using the decoder.  Thus *every* application using the IJG
+   library would suffer a complexity penalty whether it cared about DNL or
+   not.
+We currently do not support DNL because of these problems.
+
+A different approach is to insist that DNL-using files be preprocessed by a
+separate program that reads ahead to the DNL, then goes back and fixes the SOF
+marker.  This is a much simpler solution and is probably far more efficient.
+Even if one wants piped input, buffering the first scan of the JPEG file needs
+a much smaller temp file than is implied by the maximum-height method.  For
+this approach we'd simply treat DNL as a no-op in the decompressor (at most,
+check that it matches the SOF image height).
+
+We will not worry about making the compressor capable of outputting DNL.
+Something similar to the first scheme above could be applied if anyone ever
+wants to make that work.
diff --git a/tjbench.c b/tjbench.c
index f135da4..81b36f6 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -45,25 +45,40 @@
 #define _throwtj(m) _throw(m, tjGetErrorStr())
 #define _throwbmp(m) _throw(m, bmpgeterr())
 
-enum {YUVENCODE=1, YUVDECODE};
-int flags=TJFLAG_NOREALLOC, decomponly=0, yuv=0, quiet=0, dotile=0,
-	pf=TJPF_BGR;
+int flags=TJFLAG_NOREALLOC, componly=0, decomponly=0, doyuv=0, quiet=0,
+	dotile=0, pf=TJPF_BGR, yuvpad=1, warmup=1, dowrite=1;
 char *ext="ppm";
 const char *pixFormatStr[TJ_NUMPF]=
 {
-	"RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "GRAY"
+	"RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "GRAY", "", "", "", "", "CMYK"
 };
 const char *subNameLong[TJ_NUMSAMP]=
 {
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *subName[NUMSUBOPT]={"444", "422", "420", "GRAY", "440"};
+const char *csName[TJ_NUMCS]=
+{
+	"RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
+};
+const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
 tjscalingfactor *scalingfactors=NULL, sf={1, 1};  int nsf=0;
 int xformop=TJXOP_NONE, xformopt=0;
 int (*customFilter)(short *, tjregion, tjregion, int, int, tjtransform *);
 double benchtime=5.0;
 
 
+char *formatName(int subsamp, int cs, char *buf)  /* label for a cs/subsamp pair */
+{  /* returns a name-table string or buf; buf is written only in the YCCK case */
+	if(cs==TJCS_YCbCr) return (char *)subNameLong[subsamp];  /* subsampling name only */
+	else if(cs==TJCS_YCCK)
+	{  /* builds "<colorspace> <subsampling>"; snprintf writes at most 80 bytes to buf */
+		snprintf(buf, 80, "%s %s", csName[cs], subNameLong[subsamp]);
+		return buf;
+	}
+	else return (char *)csName[cs];  /* colorspace name only; points into a const table */
+}
+
+
 char *sigfig(double val, int figs, char *buf, int len)
 {
 	char format[80];
@@ -86,21 +101,20 @@
 
 
 /* Decompression test */
-int decomptest(unsigned char *srcbuf, unsigned char **jpegbuf,
+int decomp(unsigned char *srcbuf, unsigned char **jpegbuf,
 	unsigned long *jpegsize, unsigned char *dstbuf, int w, int h,
 	int subsamp, int jpegqual, char *filename, int tilew, int tileh)
 {
 	char tempstr[1024], sizestr[20]="\0", qualstr[6]="\0", *ptr;
 	FILE *file=NULL;  tjhandle handle=NULL;
-	int row, col, i, dstbufalloc=0, retval=0;
-	double start, elapsed;
+	int row, col, iter=0, dstbufalloc=0, retval=0;
+	double elapsed, elapsedDecode;
 	int ps=tjPixelSize[pf];
-	int yuvsize=tjBufSizeYUV(w, h, subsamp), bufsize;
-	int scaledw=(yuv==YUVDECODE)? w : TJSCALED(w, sf);
-	int scaledh=(yuv==YUVDECODE)? h : TJSCALED(h, sf);
+	int scaledw=TJSCALED(w, sf);
+	int scaledh=TJSCALED(h, sf);
 	int pitch=scaledw*ps;
 	int ntilesw=(w+tilew-1)/tilew, ntilesh=(h+tileh-1)/tileh;
-	unsigned char *dstptr, *dstptr2;
+	unsigned char *dstptr, *dstptr2, *yuvbuf=NULL;
 
 	if(jpegqual>0)
 	{
@@ -111,228 +125,176 @@
 	if((handle=tjInitDecompress())==NULL)
 		_throwtj("executing tjInitDecompress()");
 
-	bufsize=(yuv==YUVDECODE? yuvsize:pitch*scaledh);
 	if(dstbuf==NULL)
 	{
-		if((dstbuf=(unsigned char *)malloc(bufsize)) == NULL)
-			_throwunix("allocating image buffer");
+		if((dstbuf=(unsigned char *)malloc(pitch*scaledh))==NULL)
+			_throwunix("allocating destination buffer");
 		dstbufalloc=1;
 	}
 	/* Set the destination buffer to gray so we know whether the decompressor
 	   attempted to write to it */
-	memset(dstbuf, 127, bufsize);
+	memset(dstbuf, 127, pitch*scaledh);
 
-	/* Execute once to preload cache */
-	if(yuv==YUVDECODE)
+	if(doyuv)
 	{
-		if(tjDecompressToYUV(handle, jpegbuf[0], jpegsize[0], dstbuf, flags)==-1)
-			_throwtj("executing tjDecompressToYUV()");
+		int width=dotile? tilew:scaledw;
+		int height=dotile? tileh:scaledh;
+		int yuvsize=tjBufSizeYUV2(width, yuvpad, height, subsamp);
+		if((yuvbuf=(unsigned char *)malloc(yuvsize))==NULL)
+			_throwunix("allocating YUV buffer");
+		memset(yuvbuf, 127, yuvsize);
 	}
-	else if(tjDecompress2(handle, jpegbuf[0], jpegsize[0], dstbuf, scaledw,
-		pitch, scaledh, pf, flags)==-1)
-		_throwtj("executing tjDecompress2()");
 
 	/* Benchmark */
-	for(i=0, start=gettime(); (elapsed=gettime()-start)<benchtime; i++)
+	iter=-warmup;
+	elapsed=elapsedDecode=0.;
+	while(1)
 	{
 		int tile=0;
-		if(yuv==YUVDECODE)
-		{
-			if(tjDecompressToYUV(handle, jpegbuf[0], jpegsize[0], dstbuf, flags)==-1)
-				_throwtj("executing tjDecompressToYUV()");
-		}
-		else for(row=0, dstptr=dstbuf; row<ntilesh; row++, dstptr+=pitch*tileh)
+		double start=gettime();
+		for(row=0, dstptr=dstbuf; row<ntilesh; row++, dstptr+=pitch*tileh)
 		{
 			for(col=0, dstptr2=dstptr; col<ntilesw; col++, tile++, dstptr2+=ps*tilew)
 			{
 				int width=dotile? min(tilew, w-col*tilew):scaledw;
 				int height=dotile? min(tileh, h-row*tileh):scaledh;
-				if(tjDecompress2(handle, jpegbuf[tile], jpegsize[tile], dstptr2, width,
-					pitch, height, pf, flags)==-1)
-					_throwtj("executing tjDecompress2()");
+				if(doyuv)
+				{
+					double startDecode;
+					if(tjDecompressToYUV2(handle, jpegbuf[tile], jpegsize[tile], yuvbuf,
+						width, yuvpad, height, flags)==-1)
+						_throwtj("executing tjDecompressToYUV2()");
+					startDecode=gettime();
+					if(tjDecodeYUV(handle, yuvbuf, yuvpad, subsamp, dstptr2, width,
+						pitch, height, pf, flags)==-1)
+						_throwtj("executing tjDecodeYUV()");
+					if(iter>=0) elapsedDecode+=gettime()-startDecode;
+				}
+				else
+					if(tjDecompress2(handle, jpegbuf[tile], jpegsize[tile], dstptr2,
+						width, pitch, height, pf, flags)==-1)
+						_throwtj("executing tjDecompress2()");
 			}
 		}
+		iter++;
+		if(iter>=1)
+		{
+			elapsed+=gettime()-start;
+			if(elapsed>=benchtime) break;
+		}
 	}
+	if(doyuv) elapsed-=elapsedDecode;
 
 	if(tjDestroy(handle)==-1) _throwtj("executing tjDestroy()");
 	handle=NULL;
 
 	if(quiet)
 	{
-		printf("%s\n",
-			sigfig((double)(w*h)/1000000.*(double)i/elapsed, 4, tempstr, 1024));
+		printf("%-6s%s",
+			sigfig((double)(w*h)/1000000.*(double)iter/elapsed, 4, tempstr, 1024),
+			quiet==2? "\n":"  ");
+		if(doyuv)
+			printf("%s\n",
+				sigfig((double)(w*h)/1000000.*(double)iter/elapsedDecode, 4, tempstr,
+					1024));
+		else if(quiet!=2) printf("\n");
 	}
 	else
 	{
-		printf("D--> Frame rate:           %f fps\n", (double)i/elapsed);
-		printf("     Dest. throughput:     %f Megapixels/sec\n",
-			(double)(w*h)/1000000.*(double)i/elapsed);
-	}
-	if(yuv==YUVDECODE)
-	{
-		snprintf(tempstr, 1024, "%s_%s%s.yuv", filename, subName[subsamp],
-			qualstr);
-		if((file=fopen(tempstr, "wb"))==NULL)
-			_throwunix("opening YUV image for output");
-		if(fwrite(dstbuf, yuvsize, 1, file)!=1)
-			_throwunix("writing YUV image");
-		fclose(file);  file=NULL;
-	}
-	else
-	{
-		if(sf.num!=1 || sf.denom!=1)
-			snprintf(sizestr, 20, "%d_%d", sf.num, sf.denom);
-		else if(tilew!=w || tileh!=h)
-			snprintf(sizestr, 20, "%dx%d", tilew, tileh);
-		else snprintf(sizestr, 20, "full");
-		if(decomponly)
-			snprintf(tempstr, 1024, "%s_%s.%s", filename, sizestr, ext);
-		else
-			snprintf(tempstr, 1024, "%s_%s%s_%s.%s", filename, subName[subsamp],
-				qualstr, sizestr, ext);
-		if(savebmp(tempstr, dstbuf, scaledw, scaledh, pf,
-			(flags&TJFLAG_BOTTOMUP)!=0)==-1)
-			_throwbmp("saving bitmap");
-		ptr=strrchr(tempstr, '.');
-		snprintf(ptr, 1024-(ptr-tempstr), "-err.%s", ext);
-		if(srcbuf && sf.num==1 && sf.denom==1)
+		printf("%s --> Frame rate:         %f fps\n",
+			doyuv? "Decomp to YUV":"Decompress   ", (double)iter/elapsed);
+		printf("                  Throughput:         %f Megapixels/sec\n",
+			(double)(w*h)/1000000.*(double)iter/elapsed);
+		if(doyuv)
 		{
-			if(!quiet) printf("Compression error written to %s.\n", tempstr);
-			if(subsamp==TJ_GRAYSCALE)
-			{
-				int index, index2;
-				for(row=0, index=0; row<h; row++, index+=pitch)
-				{
-					for(col=0, index2=index; col<w; col++, index2+=ps)
-					{
-						int rindex=index2+tjRedOffset[pf];
-						int gindex=index2+tjGreenOffset[pf];
-						int bindex=index2+tjBlueOffset[pf];
-						int y=(int)((double)srcbuf[rindex]*0.299
-							+ (double)srcbuf[gindex]*0.587
-							+ (double)srcbuf[bindex]*0.114 + 0.5);
-						if(y>255) y=255;  if(y<0) y=0;
-						dstbuf[rindex]=abs(dstbuf[rindex]-y);
-						dstbuf[gindex]=abs(dstbuf[gindex]-y);
-						dstbuf[bindex]=abs(dstbuf[bindex]-y);
-					}
-				}
-			}		
-			else
-			{
-				for(row=0; row<h; row++)
-					for(col=0; col<w*ps; col++)
-						dstbuf[pitch*row+col]
-							=abs(dstbuf[pitch*row+col]-srcbuf[pitch*row+col]);
-			}
-			if(savebmp(tempstr, dstbuf, w, h, pf,
-				(flags&TJFLAG_BOTTOMUP)!=0)==-1)
-				_throwbmp("saving bitmap");
+			printf("YUV Decode    --> Frame rate:         %f fps\n",
+				(double)iter/elapsedDecode);
+			printf("                  Throughput:         %f Megapixels/sec\n",
+				(double)(w*h)/1000000.*(double)iter/elapsedDecode);
 		}
 	}
 
+	if (!dowrite) goto bailout;
+
+	if(sf.num!=1 || sf.denom!=1)
+		snprintf(sizestr, 20, "%d_%d", sf.num, sf.denom);
+	else if(tilew!=w || tileh!=h)
+		snprintf(sizestr, 20, "%dx%d", tilew, tileh);
+	else snprintf(sizestr, 20, "full");
+	if(decomponly)
+		snprintf(tempstr, 1024, "%s_%s.%s", filename, sizestr, ext);
+	else
+		snprintf(tempstr, 1024, "%s_%s%s_%s.%s", filename, subName[subsamp],
+			qualstr, sizestr, ext);
+
+	if(savebmp(tempstr, dstbuf, scaledw, scaledh, pf,
+		(flags&TJFLAG_BOTTOMUP)!=0)==-1)
+		_throwbmp("saving bitmap");
+	ptr=strrchr(tempstr, '.');
+	snprintf(ptr, 1024-(ptr-tempstr), "-err.%s", ext);
+	if(srcbuf && sf.num==1 && sf.denom==1)
+	{
+		if(!quiet) printf("Compression error written to %s.\n", tempstr);
+		if(subsamp==TJ_GRAYSCALE)
+		{
+			int index, index2;
+			for(row=0, index=0; row<h; row++, index+=pitch)
+			{
+				for(col=0, index2=index; col<w; col++, index2+=ps)
+				{
+					int rindex=index2+tjRedOffset[pf];
+					int gindex=index2+tjGreenOffset[pf];
+					int bindex=index2+tjBlueOffset[pf];
+					int y=(int)((double)srcbuf[rindex]*0.299
+						+ (double)srcbuf[gindex]*0.587
+						+ (double)srcbuf[bindex]*0.114 + 0.5);
+					if(y>255) y=255;  if(y<0) y=0;
+					dstbuf[rindex]=abs(dstbuf[rindex]-y);
+					dstbuf[gindex]=abs(dstbuf[gindex]-y);
+					dstbuf[bindex]=abs(dstbuf[bindex]-y);
+				}
+			}
+		}
+		else
+		{
+			for(row=0; row<h; row++)
+				for(col=0; col<w*ps; col++)
+					dstbuf[pitch*row+col]
+						=abs(dstbuf[pitch*row+col]-srcbuf[pitch*row+col]);
+		}
+		if(savebmp(tempstr, dstbuf, w, h, pf,
+			(flags&TJFLAG_BOTTOMUP)!=0)==-1)
+			_throwbmp("saving bitmap");
+	}
+
 	bailout:
-	if(file) {fclose(file);  file=NULL;}
-	if(handle) {tjDestroy(handle);  handle=NULL;}
-	if(dstbuf && dstbufalloc) {free(dstbuf);  dstbuf=NULL;}
+	if(file) fclose(file);
+	if(handle) tjDestroy(handle);
+	if(dstbuf && dstbufalloc) free(dstbuf);
+	if(yuvbuf) free(yuvbuf);
 	return retval;
 }
 
 
-void dotestyuv(unsigned char *srcbuf, int w, int h, int subsamp,
+int fullTest(unsigned char *srcbuf, int w, int h, int subsamp, int jpegqual,
 	char *filename)
 {
 	char tempstr[1024], tempstr2[80];
 	FILE *file=NULL;  tjhandle handle=NULL;
-	unsigned char *dstbuf=NULL;
-	double start, elapsed;
-	int i, retval=0, ps=tjPixelSize[pf];
-	int yuvsize=0;
-
-	yuvsize=tjBufSizeYUV(w, h, subsamp);
-	if((dstbuf=(unsigned char *)malloc(yuvsize)) == NULL)
-		_throwunix("allocating image buffer");
-
-	if(!quiet)
-		printf(">>>>>  %s (%s) <--> YUV %s  <<<<<\n", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down", subNameLong[subsamp]);
-
-	if(quiet==1)
-		printf("%s\t%s\t%s\tN/A\t", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp]);
-
-	if((handle=tjInitCompress())==NULL)
-		_throwtj("executing tjInitCompress()");
-
-	/* Execute once to preload cache */
-	if(tjEncodeYUV2(handle, srcbuf, w, 0, h, pf, dstbuf, subsamp, flags)==-1)
-		_throwtj("executing tjEncodeYUV2()");
-
-	/* Benchmark */
-	for(i=0, start=gettime(); (elapsed=gettime()-start)<benchtime; i++)
-	{
-		if(tjEncodeYUV2(handle, srcbuf, w, 0, h, pf, dstbuf, subsamp, flags)==-1)
-			_throwtj("executing tjEncodeYUV2()");
-	}
-
-	if(tjDestroy(handle)==-1) _throwtj("executing tjDestroy()");
-	handle=NULL;
-
-	if(quiet==1) printf("%-4d  %-4d\t", w, h);
-	if(quiet)
-	{
-		printf("%s%c%s%c",
-			sigfig((double)(w*h)/1000000.*(double)i/elapsed, 4, tempstr, 1024),
-			quiet==2? '\n':'\t',
-			sigfig((double)(w*h*ps)/(double)yuvsize, 4, tempstr2, 80),
-			quiet==2? '\n':'\t');
-	}
-	else
-	{
-		printf("\n%s size: %d x %d\n", "Image", w, h);
-		printf("C--> Frame rate:           %f fps\n", (double)i/elapsed);
-		printf("     Output image size:    %d bytes\n", yuvsize);
-		printf("     Compression ratio:    %f:1\n",
-			(double)(w*h*ps)/(double)yuvsize);
-		printf("     Source throughput:    %f Megapixels/sec\n",
-			(double)(w*h)/1000000.*(double)i/elapsed);
-		printf("     Output bit stream:    %f Megabits/sec\n",
-			(double)yuvsize*8./1000000.*(double)i/elapsed);
-	}
-	snprintf(tempstr, 1024, "%s_%s.yuv", filename, subName[subsamp]);
-	if((file=fopen(tempstr, "wb"))==NULL)
-		_throwunix("opening reference image");
-	if(fwrite(dstbuf, yuvsize, 1, file)!=1)
-		_throwunix("writing reference image");
-	fclose(file);  file=NULL;
-	if(!quiet) printf("Reference image written to %s\n", tempstr);
-
-	bailout:
-	if(file) {fclose(file);  file=NULL;}
-	if(dstbuf) {free(dstbuf);  dstbuf=NULL;}
-	if(handle) {tjDestroy(handle);  handle=NULL;}
-	return;
-}
-
-
-void dotest(unsigned char *srcbuf, int w, int h, int subsamp, int jpegqual,
-	char *filename)
-{
-	char tempstr[1024], tempstr2[80];
-	FILE *file=NULL;  tjhandle handle=NULL;
-	unsigned char **jpegbuf=NULL, *tmpbuf=NULL, *srcptr, *srcptr2;
-	double start, elapsed;
+	unsigned char **jpegbuf=NULL, *yuvbuf=NULL, *tmpbuf=NULL, *srcptr, *srcptr2;
+	double start, elapsed, elapsedEncode;
 	int totaljpegsize=0, row, col, i, tilew=w, tileh=h, retval=0;
+	int iter, yuvsize=0;
 	unsigned long *jpegsize=NULL;
-	int ps=tjPixelSize[pf], ntilesw=1, ntilesh=1, pitch=w*ps;
-
-	if(yuv==YUVENCODE) {dotestyuv(srcbuf, w, h, subsamp, filename);  return;}
+	int ps=tjPixelSize[pf];
+	int ntilesw=1, ntilesh=1, pitch=w*ps;
+	const char *pfStr=pixFormatStr[pf];
 
 	if((tmpbuf=(unsigned char *)malloc(pitch*h)) == NULL)
 		_throwunix("allocating temporary image buffer");
 
 	if(!quiet)
-		printf(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pixFormatStr[pf],
+		printf(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
 			(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down", subNameLong[subsamp],
 			jpegqual);
 
@@ -353,30 +315,36 @@
 		if((flags&TJFLAG_NOREALLOC)!=0)
 			for(i=0; i<ntilesw*ntilesh; i++)
 			{
-				if((jpegbuf[i]=(unsigned char *)malloc(tjBufSize(tilew, tileh,
+				if((jpegbuf[i]=(unsigned char *)tjAlloc(tjBufSize(tilew, tileh,
 					subsamp)))==NULL)
 					_throwunix("allocating JPEG tiles");
 			}
 
 		/* Compression test */
 		if(quiet==1)
-			printf("%s\t%s\t%s\t%d\t", pixFormatStr[pf],
+			printf("%-4s (%s)  %-5s    %-3d   ", pfStr,
 				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp], jpegqual);
 		for(i=0; i<h; i++)
 			memcpy(&tmpbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
 		if((handle=tjInitCompress())==NULL)
 			_throwtj("executing tjInitCompress()");
 
-		/* Execute once to preload cache */
-		if(tjCompress2(handle, srcbuf, tilew, pitch, tileh, pf, &jpegbuf[0],
-			&jpegsize[0], subsamp, jpegqual, flags)==-1)
-			_throwtj("executing tjCompress2()");
+		if(doyuv)
+		{
+			yuvsize=tjBufSizeYUV2(tilew, yuvpad, tileh, subsamp);
+			if((yuvbuf=(unsigned char *)malloc(yuvsize))==NULL)
+				_throwunix("allocating YUV buffer");
+			memset(yuvbuf, 127, yuvsize);
+		}
 
 		/* Benchmark */
-		for(i=0, start=gettime(); (elapsed=gettime()-start)<benchtime; i++)
+		iter=-warmup;
+		elapsed=elapsedEncode=0.;
+		while(1)
 		{
 			int tile=0;
 			totaljpegsize=0;
+			start=gettime();
 			for(row=0, srcptr=srcbuf; row<ntilesh; row++, srcptr+=pitch*tileh)
 			{
 				for(col=0, srcptr2=srcptr; col<ntilesw; col++, tile++,
@@ -384,40 +352,80 @@
 				{
 					int width=min(tilew, w-col*tilew);
 					int height=min(tileh, h-row*tileh);
-					if(tjCompress2(handle, srcptr2, width, pitch, height, pf,
-						&jpegbuf[tile], &jpegsize[tile], subsamp, jpegqual, flags)==-1)
-						_throwtj("executing tjCompress()2");
+					if(doyuv)
+					{
+						double startEncode=gettime();
+						if(tjEncodeYUV3(handle, srcptr2, width, pitch, height, pf, yuvbuf,
+							yuvpad, subsamp, flags)==-1)
+							_throwtj("executing tjEncodeYUV3()");
+						if(iter>=0) elapsedEncode+=gettime()-startEncode;
+						if(tjCompressFromYUV(handle, yuvbuf, width, yuvpad, height,
+							subsamp, &jpegbuf[tile], &jpegsize[tile], jpegqual, flags)==-1)
+							_throwtj("executing tjCompressFromYUV()");
+					}
+					else
+					{
+						if(tjCompress2(handle, srcptr2, width, pitch, height, pf,
+							&jpegbuf[tile], &jpegsize[tile], subsamp, jpegqual, flags)==-1)
+							_throwtj("executing tjCompress2()");
+					}
 					totaljpegsize+=jpegsize[tile];
 				}
 			}
+			iter++;
+			if(iter>=1)
+			{
+				elapsed+=gettime()-start;
+				if(elapsed>=benchtime) break;
+			}
 		}
+		if(doyuv) elapsed-=elapsedEncode;
 
 		if(tjDestroy(handle)==-1) _throwtj("executing tjDestroy()");
 		handle=NULL;
 
-		if(quiet==1) printf("%-4d  %-4d\t", tilew, tileh);
+		if(quiet==1) printf("%-5d  %-5d   ", tilew, tileh);
 		if(quiet)
 		{
-			printf("%s%c%s%c",
-				sigfig((double)(w*h)/1000000.*(double)i/elapsed, 4, tempstr, 1024),
-				quiet==2? '\n':'\t',
+			if(doyuv)
+				printf("%-6s%s",
+					sigfig((double)(w*h)/1000000.*(double)iter/elapsedEncode, 4, tempstr,
+						1024), quiet==2? "\n":"  ");
+			printf("%-6s%s",
+				sigfig((double)(w*h)/1000000.*(double)iter/elapsed, 4,	tempstr, 1024),
+				quiet==2? "\n":"  ");
+			printf("%-6s%s",
 				sigfig((double)(w*h*ps)/(double)totaljpegsize, 4, tempstr2, 80),
-				quiet==2? '\n':'\t');
+				quiet==2? "\n":"  ");
 		}
 		else
 		{
 			printf("\n%s size: %d x %d\n", dotile? "Tile":"Image", tilew,
 				tileh);
-			printf("C--> Frame rate:           %f fps\n", (double)i/elapsed);
-			printf("     Output image size:    %d bytes\n", totaljpegsize);
-			printf("     Compression ratio:    %f:1\n",
+			if(doyuv)
+			{
+				printf("Encode YUV    --> Frame rate:         %f fps\n",
+					(double)iter/elapsedEncode);
+				printf("                  Output image size:  %d bytes\n", yuvsize);
+				printf("                  Compression ratio:  %f:1\n",
+					(double)(w*h*ps)/(double)yuvsize);
+				printf("                  Throughput:         %f Megapixels/sec\n",
+					(double)(w*h)/1000000.*(double)iter/elapsedEncode);
+				printf("                  Output bit stream:  %f Megabits/sec\n",
+					(double)yuvsize*8./1000000.*(double)iter/elapsedEncode);
+			}
+			printf("%s --> Frame rate:         %f fps\n",
+				doyuv? "Comp from YUV":"Compress     ", (double)iter/elapsed);
+			printf("                  Output image size:  %d bytes\n",
+				totaljpegsize);
+			printf("                  Compression ratio:  %f:1\n",
 				(double)(w*h*ps)/(double)totaljpegsize);
-			printf("     Source throughput:    %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)i/elapsed);
-			printf("     Output bit stream:    %f Megabits/sec\n",
-				(double)totaljpegsize*8./1000000.*(double)i/elapsed);
+			printf("                  Throughput:         %f Megapixels/sec\n",
+				(double)(w*h)/1000000.*(double)iter/elapsed);
+			printf("                  Output bit stream:  %f Megabits/sec\n",
+				(double)totaljpegsize*8./1000000.*(double)iter/elapsed);
 		}
-		if(tilew==w && tileh==h)
+		if(tilew==w && tileh==h && dowrite)
 		{
 			snprintf(tempstr, 1024, "%s_%s_Q%d.jpg", filename, subName[subsamp],
 				jpegqual);
@@ -430,16 +438,23 @@
 		}
 
 		/* Decompression test */
-		if(decomptest(srcbuf, jpegbuf, jpegsize, tmpbuf, w, h, subsamp, jpegqual,
-			filename, tilew, tileh)==-1)
-			goto bailout;
+		if(!componly)
+		{
+			if(decomp(srcbuf, jpegbuf, jpegsize, tmpbuf, w, h, subsamp, jpegqual,
+				filename, tilew, tileh)==-1)
+				goto bailout;
+		}
 
 		for(i=0; i<ntilesw*ntilesh; i++)
 		{
-			if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;
+			if(jpegbuf[i]) tjFree(jpegbuf[i]);  jpegbuf[i]=NULL;
 		}
 		free(jpegbuf);  jpegbuf=NULL;
 		free(jpegsize);  jpegsize=NULL;
+		if(doyuv)
+		{
+			free(yuvbuf);  yuvbuf=NULL;
+		}
 
 		if(tilew==w && tileh==h) break;
 	}
@@ -450,27 +465,28 @@
 	{
 		for(i=0; i<ntilesw*ntilesh; i++)
 		{
-			if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;
+			if(jpegbuf[i]) tjFree(jpegbuf[i]);  jpegbuf[i]=NULL;
 		}
 		free(jpegbuf);  jpegbuf=NULL;
 	}
+	if(yuvbuf) {free(yuvbuf);  yuvbuf=NULL;}
 	if(jpegsize) {free(jpegsize);  jpegsize=NULL;}
 	if(tmpbuf) {free(tmpbuf);  tmpbuf=NULL;}
 	if(handle) {tjDestroy(handle);  handle=NULL;}
-	return;
+	return retval;
 }
 
 
-void dodecomptest(char *filename)
+int decompTest(char *filename)
 {
 	FILE *file=NULL;  tjhandle handle=NULL;
 	unsigned char **jpegbuf=NULL, *srcbuf=NULL;
 	unsigned long *jpegsize=NULL, srcsize, totaljpegsize;
 	tjtransform *t=NULL;
-	int w=0, h=0, subsamp=-1, _w, _h, _tilew, _tileh,
+	int w=0, h=0, subsamp=-1, cs=-1, _w, _h, _tilew, _tileh,
 		_ntilesw, _ntilesh, _subsamp;
 	char *temp=NULL, tempstr[80], tempstr2[80];
-	int row, col, i, tilew, tileh, ntilesw=1, ntilesh=1, retval=0;
+	int row, col, i, iter, tilew, tileh, ntilesw=1, ntilesh=1, retval=0;
 	double start, elapsed;
 	int ps=tjPixelSize[pf], tile;
 
@@ -491,21 +507,28 @@
 
 	if((handle=tjInitTransform())==NULL)
 		_throwtj("executing tjInitTransform()");
-	if(tjDecompressHeader2(handle, srcbuf, srcsize, &w, &h, &subsamp)==-1)
-		_throwtj("executing tjDecompressHeader2()");
+	if(tjDecompressHeader3(handle, srcbuf, srcsize, &w, &h, &subsamp, &cs)==-1)
+		_throwtj("executing tjDecompressHeader3()");
+	if(cs==TJCS_YCCK || cs==TJCS_CMYK)
+	{
+		pf=TJPF_CMYK;  ps=tjPixelSize[pf];
+	}
 
 	if(quiet==1)
 	{
 		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap\tBitmap\tJPEG\t%s %s \tXform\tComp\tDecomp\n",
+		printf("Bitmap     JPEG   JPEG     %s  %s   Xform   Comp    Decomp  ",
 			dotile? "Tile ":"Image", dotile? "Tile ":"Image");
-		printf("Format\tOrder\tSubsamp\tWidth Height\tPerf \tRatio\tPerf\n\n");
+		if(doyuv) printf("Decode");
+		printf("\n");
+		printf("Format     CS     Subsamp  Width  Height  Perf    Ratio   Perf    ");
+		if(doyuv) printf("Perf");
+		printf("\n\n");
 	}
 	else if(!quiet)
-	{
-		printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n", subNameLong[subsamp],
-			pixFormatStr[pf], (flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down");
-	}
+		printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n",
+			formatName(subsamp, cs, tempstr), pixFormatStr[pf],
+			(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down");
 
 	for(tilew=dotile? 16:w, tileh=dotile? 16:h; ; tilew*=2, tileh*=2)
 	{
@@ -524,7 +547,7 @@
 		if((flags&TJFLAG_NOREALLOC)!=0 || !dotile)
 			for(i=0; i<ntilesw*ntilesh; i++)
 			{
-				if((jpegbuf[i]=(unsigned char *)malloc(tjBufSize(tilew, tileh,
+				if((jpegbuf[i]=(unsigned char *)tjAlloc(tjBufSize(tilew, tileh,
 					subsamp)))==NULL)
 					_throwunix("allocating JPEG tiles");
 			}
@@ -540,9 +563,9 @@
 		}
 		else if(quiet==1)
 		{
-			printf("%s\t%s\t%s\t", pixFormatStr[pf],
-				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp]);
-			printf("%-4d  %-4d\t", tilew, tileh);
+			printf("%-4s (%s)  %-5s  %-5s    ", pixFormatStr[pf],
+				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", csName[cs], subNameLong[subsamp]);
+			printf("%-5d  %-5d   ", tilew, tileh);
 		}
 
 		_subsamp=subsamp;
@@ -570,6 +593,13 @@
 			_ntilesw=(_w+_tilew-1)/_tilew;
 			_ntilesh=(_h+_tileh-1)/_tileh;
 
+			if(xformop==TJXOP_TRANSPOSE || xformop==TJXOP_TRANSVERSE
+				|| xformop==TJXOP_ROT90 || xformop==TJXOP_ROT270)
+			{
+				if(_subsamp==TJSAMP_422) _subsamp=TJSAMP_440;
+				else if(_subsamp==TJSAMP_440) _subsamp=TJSAMP_422;
+			}
+
 			for(row=0, tile=0; row<_ntilesh; row++)
 			{
 				for(col=0; col<_ntilesw; col++, tile++)
@@ -583,16 +613,26 @@
 					t[tile].customFilter=customFilter;
 					if(t[tile].options&TJXOPT_NOOUTPUT && jpegbuf[tile])
 					{
-						free(jpegbuf[tile]);  jpegbuf[tile]=NULL;
+						tjFree(jpegbuf[tile]);  jpegbuf[tile]=NULL;
 					}
 				}
 			}
 
-			start=gettime();
-			if(tjTransform(handle, srcbuf, srcsize, _ntilesw*_ntilesh, jpegbuf,
-				jpegsize, t, flags)==-1)
-				_throwtj("executing tjTransform()");
-			elapsed=gettime()-start;
+			iter=-warmup;
+			elapsed=0.;
+			while(1)
+			{
+				start=gettime();
+				if(tjTransform(handle, srcbuf, srcsize, _ntilesw*_ntilesh, jpegbuf,
+					jpegsize, t, flags)==-1)
+					_throwtj("executing tjTransform()");
+				iter++;
+				if(iter>=1)
+				{
+					elapsed+=gettime()-start;
+					if(elapsed>=benchtime) break;
+				}
+			}
 
 			free(t);  t=NULL;
 
@@ -601,27 +641,27 @@
 
 			if(quiet)
 			{
-				printf("%s%c%s%c",
+				printf("%-6s%s%-6s%s",
 					sigfig((double)(w*h)/1000000./elapsed, 4, tempstr, 80),
-					quiet==2? '\n':'\t',
+					quiet==2? "\n":"  ",
 					sigfig((double)(w*h*ps)/(double)totaljpegsize, 4, tempstr2, 80),
-					quiet==2? '\n':'\t');
+					quiet==2? "\n":"  ");
 			}
 			else if(!quiet)
 			{
-				printf("X--> Frame rate:           %f fps\n", 1.0/elapsed);
-				printf("     Output image size:    %lu bytes\n", totaljpegsize);
-				printf("     Compression ratio:    %f:1\n",
+				printf("Transform     --> Frame rate:         %f fps\n", 1.0/elapsed);
+				printf("                  Output image size:  %lu bytes\n", totaljpegsize);
+				printf("                  Compression ratio:  %f:1\n",
 					(double)(w*h*ps)/(double)totaljpegsize);
-				printf("     Source throughput:    %f Megapixels/sec\n",
+				printf("                  Throughput:         %f Megapixels/sec\n",
 					(double)(w*h)/1000000./elapsed);
-				printf("     Output bit stream:    %f Megabits/sec\n",
+				printf("                  Output bit stream:  %f Megabits/sec\n",
 					(double)totaljpegsize*8./1000000./elapsed);
 			}
 		}
 		else
 		{
-			if(quiet==1) printf("N/A\tN/A\t");
+			if(quiet==1) printf("N/A     N/A     ");
 			jpegsize[0]=srcsize;
 			memcpy(jpegbuf[0], srcbuf, srcsize);
 		}
@@ -630,7 +670,7 @@
 		if(h==tileh) _tileh=_h;
 		if(!(xformopt&TJXOPT_NOOUTPUT))
 		{
-			if(decomptest(NULL, jpegbuf, jpegsize, NULL, _w, _h, _subsamp, 0,
+			if(decomp(NULL, jpegbuf, jpegsize, NULL, _w, _h, _subsamp, 0,
 				filename, _tilew, _tileh)==-1)
 				goto bailout;
 		}
@@ -638,7 +678,7 @@
 
 		for(i=0; i<ntilesw*ntilesh; i++)
 		{
-			free(jpegbuf[i]);  jpegbuf[i]=NULL;
+			tjFree(jpegbuf[i]);  jpegbuf[i]=NULL;
 		}
 		free(jpegbuf);  jpegbuf=NULL;
 		if(jpegsize) {free(jpegsize);  jpegsize=NULL;}
@@ -652,7 +692,7 @@
 	{
 		for(i=0; i<ntilesw*ntilesh; i++)
 		{
-			if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;
+			if(jpegbuf[i]) tjFree(jpegbuf[i]);  jpegbuf[i]=NULL;
 		}
 		free(jpegbuf);  jpegbuf=NULL;
 	}
@@ -660,7 +700,7 @@
 	if(srcbuf) {free(srcbuf);  srcbuf=NULL;}
 	if(t) {free(t);  t=NULL;}
 	if(handle) {tjDestroy(handle);  handle=NULL;}
-	return;
+	return retval;
 }
 
 
@@ -673,14 +713,15 @@
 	printf("       <Inputfile (JPG)> [options]\n\n");
 	printf("Options:\n\n");
 	printf("-alloc = Dynamically allocate JPEG image buffers\n");
-	printf("-bmp = Generate output images in Windows Bitmap format (default=PPM)\n");
+	printf("-bmp = Generate output images in Windows Bitmap format (default = PPM)\n");
 	printf("-bottomup = Test bottom-up compression/decompression\n");
 	printf("-tile = Test performance of the codec when the image is encoded as separate\n");
 	printf("     tiles of varying sizes.\n");
-	printf("-forcemmx, -forcesse, -forcesse2, -forcesse3 =\n");
-	printf("     Force MMX, SSE, SSE2, or SSE3 code paths in the underlying codec\n");
 	printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
-	printf("     Test the specified color conversion path in the codec (default: BGR)\n");
+	printf("     Test the specified color conversion path in the codec (default = BGR)\n");
+	printf("-cmyk = Indirectly test YCCK JPEG compression/decompression (the source\n");
+	printf("     and destination bitmaps are still RGB.  The conversion is done\n");
+	printf("     internally prior to compression or after decompression.)\n");
 	printf("-fastupsample = Use the fastest chrominance upsampling algorithm available in\n");
 	printf("     the underlying codec\n");
 	printf("-fastdct = Use the fastest DCT/IDCT algorithms available in the underlying\n");
@@ -688,12 +729,15 @@
 	printf("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the\n");
 	printf("     underlying codec\n");
 	printf("-subsamp <s> = When testing JPEG compression, this option specifies the level\n");
-	printf("     of chrominance subsampling to use (<s> = 444, 422, 440, 420, or GRAY).\n");
-	printf("     The default is to test Grayscale, 4:2:0, 4:2:2, and 4:4:4 in sequence.\n");
+	printf("     of chrominance subsampling to use (<s> = 444, 422, 440, 420, 411, or\n");
+	printf("     GRAY).  The default is to test Grayscale, 4:2:0, 4:2:2, and 4:4:4 in\n");
+	printf("     sequence.\n");
 	printf("-quiet = Output results in tabular rather than verbose format\n");
-	printf("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG\n");
-	printf("-yuvdecode = Decode JPEG image to planar YUV rather than RGB\n");
-	printf("-scale M/N = scale down the width/height of the decompressed JPEG image by a\n");
+	printf("-yuv = Test YUV encoding/decoding functions\n");
+	printf("-yuvpad <p> = If testing YUV encoding/decoding, this specifies the number of\n");
+	printf("     bytes to which each row of each plane in the intermediate YUV image is\n");
+	printf("     padded (default = 1)\n");
+	printf("-scale M/N = Scale down the width/height of the decompressed JPEG image by a\n");
 	printf("     factor of M/N (M/N = ");
 	for(i=0; i<nsf; i++)
 	{
@@ -712,7 +756,12 @@
 	printf("     decompression (these options are mutually exclusive)\n");
 	printf("-grayscale = Perform lossless grayscale conversion prior to decompression\n");
 	printf("     test (can be combined with the other transforms above)\n");
-	printf("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n\n");
+	printf("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n");
+	printf("-warmup <w> = Execute each benchmark <w> times to prime the cache before\n");
+	printf("     taking performance measurements (default = 1)\n");
+	printf("-componly = Stop after running compression tests.  Do not test decompression.\n");
+	printf("-nowrite = Do not write reference or output images (improves consistency of\n");
+	printf("     performance measurements.)\n\n");
 	printf("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate\n");
 	printf("test will be performed for all quality values in the range.\n\n");
 	exit(1);
@@ -721,7 +770,7 @@
 
 int main(int argc, char *argv[])
 {
-	unsigned char *srcbuf=NULL;  int w, h, i, j;
+	unsigned char *srcbuf=NULL;  int w=0, h=0, i, j;
 	int minqual=-1, maxqual=-1;  char *temp;
 	int minarg=2, retval=0, subsamp=-1;
 
@@ -739,24 +788,7 @@
 
 	printf("\n");
 
-	if(argc>minarg)
-	{
-		for(i=minarg; i<argc; i++)
-		{
-			if(!strcasecmp(argv[i], "-yuvencode"))
-			{
-				printf("Testing YUV planar encoding\n\n");
-				yuv=YUVENCODE;  maxqual=minqual=100;
-			}
-			if(!strcasecmp(argv[i], "-yuvdecode"))
-			{
-				printf("Testing YUV planar decoding\n\n");
-				yuv=YUVDECODE;
-			}
-		}
-	}
-
-	if(!decomponly && yuv!=YUVENCODE)
+	if(!decomponly)
 	{
 		minarg=3;
 		if(argc<minarg) usage(argv[0]);
@@ -779,26 +811,6 @@
 			{
 				dotile=1;  xformopt|=TJXOPT_CROP;
 			}
-			if(!strcasecmp(argv[i], "-forcesse3"))
-			{
-				printf("Forcing SSE3 code\n\n");
-				flags|=TJFLAG_FORCESSE3;
-			}
-			if(!strcasecmp(argv[i], "-forcesse2"))
-			{
-				printf("Forcing SSE2 code\n\n");
-				flags|=TJFLAG_FORCESSE2;
-			}
-			if(!strcasecmp(argv[i], "-forcesse"))
-			{
-				printf("Forcing SSE code\n\n");
-				flags|=TJFLAG_FORCESSE;
-			}
-			if(!strcasecmp(argv[i], "-forcemmx"))
-			{
-				printf("Forcing MMX code\n\n");
-				flags|=TJFLAG_FORCEMMX;
-			}
 			if(!strcasecmp(argv[i], "-fastupsample"))
 			{
 				printf("Using fast upsampling code\n\n");
@@ -820,6 +832,7 @@
 			if(!strcasecmp(argv[i], "-bgrx")) pf=TJPF_BGRX;
 			if(!strcasecmp(argv[i], "-xbgr")) pf=TJPF_XBGR;
 			if(!strcasecmp(argv[i], "-xrgb")) pf=TJPF_XRGB;
+			if(!strcasecmp(argv[i], "-cmyk")) pf=TJPF_CMYK;
 			if(!strcasecmp(argv[i], "-bottomup")) flags|=TJFLAG_BOTTOMUP;
 			if(!strcasecmp(argv[i], "-quiet")) quiet=1;
 			if(!strcasecmp(argv[i], "-qq")) quiet=2;
@@ -857,9 +870,29 @@
 				if(temp>0.0) benchtime=temp;
 				else usage(argv[0]);
 			}
+			if(!strcasecmp(argv[i], "-warmup") && i<argc-1)
+			{
+				int temp=atoi(argv[++i]);
+				if(temp>=0)
+				{
+					warmup=temp;
+					printf("Warmup runs = %d\n\n", warmup);
+				}
+				else usage(argv[0]);
+			}
 			if(!strcmp(argv[i], "-?")) usage(argv[0]);
 			if(!strcasecmp(argv[i], "-alloc")) flags&=(~TJFLAG_NOREALLOC);
 			if(!strcasecmp(argv[i], "-bmp")) ext="bmp";
+			if(!strcasecmp(argv[i], "-yuv"))
+			{
+				printf("Testing YUV planar encoding/decoding\n\n");
+				doyuv=1;
+			}
+			if(!strcasecmp(argv[i], "-yuvpad") && i<argc-1)
+			{
+				int temp=atoi(argv[++i]);
+				if(temp>=1) yuvpad=temp;
+			}
 			if(!strcasecmp(argv[i], "-subsamp") && i<argc-1)
 			{
 				i++;
@@ -873,9 +906,12 @@
 						case 422:  subsamp=TJSAMP_422;  break;
 						case 440:  subsamp=TJSAMP_440;  break;
 						case 420:  subsamp=TJSAMP_420;  break;
+						case 411:  subsamp=TJSAMP_411;  break;
 					}
 				}
 			}
+			if(!strcasecmp(argv[i], "-componly")) componly=1;
+			if(!strcasecmp(argv[i], "-nowrite")) dowrite=0;
 		}
 	}
 
@@ -886,10 +922,10 @@
 		dotile=0;
 	}
 
-	if(yuv && dotile)
+	if((flags&TJFLAG_NOREALLOC)==0 && dotile)
 	{
 		printf("Disabling tiled compression/decompression tests, because those tests do not\n");
-		printf("work when YUV encoding or decoding is enabled.\n\n");
+		printf("work when dynamic JPEG buffer allocation is enabled.\n\n");
 		dotile=0;
 	}
 
@@ -904,36 +940,47 @@
 	if(quiet==1 && !decomponly)
 	{
 		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap\tBitmap\tJPEG\tJPEG\t%s %s \tComp\tComp\tDecomp\n",
+		printf("Bitmap     JPEG     JPEG  %s  %s   ",
 			dotile? "Tile ":"Image", dotile? "Tile ":"Image");
-		printf("Format\tOrder\tSubsamp\tQual\tWidth Height\tPerf \tRatio\tPerf\n\n");
+		if(doyuv) printf("Encode  ");
+		printf("Comp    Comp    Decomp  ");
+		if(doyuv) printf("Decode");
+		printf("\n");
+		printf("Format     Subsamp  Qual  Width  Height  ");
+		if(doyuv) printf("Perf    ");
+		printf("Perf    Ratio   Perf    ");
+		if(doyuv) printf("Perf");
+		printf("\n\n");
 	}
 
 	if(decomponly)
 	{
-		dodecomptest(argv[1]);
+		decompTest(argv[1]);
 		printf("\n");
 		goto bailout;
 	}
 	if(subsamp>=0 && subsamp<TJ_NUMSAMP)
 	{
 		for(i=maxqual; i>=minqual; i--)
-			dotest(srcbuf, w, h, subsamp, i, argv[1]);
+			fullTest(srcbuf, w, h, subsamp, i, argv[1]);
 		printf("\n");
 	}
 	else
 	{
+		if(pf!=TJPF_CMYK)
+		{
+			for(i=maxqual; i>=minqual; i--)
+				fullTest(srcbuf, w, h, TJSAMP_GRAY, i, argv[1]);
+			printf("\n");
+		}
 		for(i=maxqual; i>=minqual; i--)
-			dotest(srcbuf, w, h, TJSAMP_GRAY, i, argv[1]);
+			fullTest(srcbuf, w, h, TJSAMP_420, i, argv[1]);
 		printf("\n");
 		for(i=maxqual; i>=minqual; i--)
-			dotest(srcbuf, w, h, TJSAMP_420, i, argv[1]);
+			fullTest(srcbuf, w, h, TJSAMP_422, i, argv[1]);
 		printf("\n");
 		for(i=maxqual; i>=minqual; i--)
-			dotest(srcbuf, w, h, TJSAMP_422, i, argv[1]);
-		printf("\n");
-		for(i=maxqual; i>=minqual; i--)
-			dotest(srcbuf, w, h, TJSAMP_444, i, argv[1]);
+			fullTest(srcbuf, w, h, TJSAMP_444, i, argv[1]);
 		printf("\n");
 	}
 
diff --git a/tjbenchtest.in b/tjbenchtest.in
new file mode 100644
index 0000000..ef11b24
--- /dev/null
+++ b/tjbenchtest.in
@@ -0,0 +1,249 @@
+#!/bin/bash
+
+set -u
+set -e
+trap onexit INT
+trap onexit TERM
+trap onexit EXIT
+
+onexit()
+{
+	if [ -d "${OUTDIR:-}" ]; then	# OUTDIR is still unset if mktemp failed; avoid a set -u error in the trap
+		rm -rf "$OUTDIR"
+	fi
+}
+
+runme()
+{
+	echo "*** $*"	# log the command line before running it
+	"$@"	# run it as given, without splitting or globbing the arguments a second time
+}
+
+EXT=bmp
+IMAGES="vgl_5674_0098.${EXT} vgl_6434_0018a.${EXT} vgl_6548_0026a.${EXT} nightshot_iso_100.${EXT}"
+IMGDIR=@srcdir@/testimages
+OUTDIR=`mktemp -d /tmp/__tjbenchtest_output.XXXXXX`
+EXEDIR=.
+BMPARG=
+NSARG=
+YUVARG=
+ALLOC=0
+ALLOCARG=
+if [ "$EXT" = "bmp" ]; then BMPARG=-bmp; fi
+
+if [ -d $OUTDIR ]; then
+	rm -rf $OUTDIR
+fi
+mkdir -p $OUTDIR
+
+exec >$EXEDIR/tjbenchtest.log
+
+if [ $# -gt 0 ]; then
+	if [ "$1" = "-yuv" ]; then
+		NSARG=-nosmooth
+		YUVARG=-yuv
+
+# NOTE: The combination of tjEncodeYUV*() and tjCompressFromYUV*() does not
+# always produce bitwise-identical results to tjCompress*() if subsampling is
+# enabled.  In both cases, if the image width or height are not evenly
+# divisible by the MCU width/height, then the bottom and/or right edge are
+# expanded.  However, the libjpeg code performs this expansion prior to
+# downsampling, and TurboJPEG performs it in tjCompressFromYUV*(), which is
+# after downsampling.  Thus, the two will agree only if the width/height along
+# each downsampled dimension is an odd number or is evenly divisible by the MCU
+# width/height.  This disagreement basically amounts to a round-off error, but
+# there is no easy way around it, so for now, we just test the only image that
+# works.  (NOTE: nightshot_iso_100 does not suffer from the above issue, but
+# it suffers from an unrelated problem whereby the combination of
+# tjDecompressToYUV*() and tjDecodeYUV*() does not produce bitwise-identical
+# results to tjDecompress*() if decompression scaling is enabled.  This latter
+# phenomenon is not yet fully understood but is also believed to be some sort
+# of round-off error.)
+		IMAGES="vgl_6548_0026a.${EXT}"
+	fi
+	if [ "$1" = "-alloc" ]; then
+		ALLOCARG=-alloc
+		ALLOC=1
+	fi
+fi
+
+# Standard tests
+for image in $IMAGES; do
+
+	cp $IMGDIR/$image $OUTDIR
+	basename=`basename $image .${EXT}`
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -grayscale -outfile $OUTDIR/${basename}_GRAY_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x2 -outfile $OUTDIR/${basename}_420_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x1 -outfile $OUTDIR/${basename}_422_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 1x1 -outfile $OUTDIR/${basename}_444_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int -grayscale -outfile $OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x2 -outfile $OUTDIR/${basename}_420_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x1 -outfile $OUTDIR/${basename}_422_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 1x1 -outfile $OUTDIR/${basename}_444_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	for samp in GRAY 420 422 444; do
+		runme $EXEDIR/djpeg -rgb $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_default_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct fast -rgb $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_fast_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct int -rgb $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_accurate_djpeg.${EXT} $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg
+	done
+	for samp in 420 422; do
+		runme $EXEDIR/djpeg -nosmooth $BMPARG -outfile $OUTDIR/${basename}_${samp}_default_nosmooth_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct fast -nosmooth $BMPARG -outfile $OUTDIR/${basename}_${samp}_fast_nosmooth_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct int -nosmooth $BMPARG -outfile $OUTDIR/${basename}_${samp}_accurate_nosmooth_djpeg.${EXT} $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg
+	done
+
+	# Compression
+	for dct in accurate fast; do
+		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -quiet -benchtime 0.01 -warmup 0 -${dct}dct $YUVARG $ALLOCARG
+		for samp in GRAY 420 422 444; do
+			runme cmp $OUTDIR/${basename}_${samp}_Q95.jpg $OUTDIR/${basename}_${samp}_${dct}_cjpeg.jpg
+		done
+	done
+
+	for dct in fast accurate default; do
+		dctarg=-${dct}dct
+		if [ "${dct}" = "default" ]; then
+			dctarg=
+		fi
+
+		# Tiled compression & decompression
+		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $ALLOCARG
+		for samp in GRAY 444; do
+			if [ $ALLOC = 1 ]; then
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_djpeg.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
+			else
+				for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].${EXT} \
+					$OUTDIR/${basename}_${samp}_Q95_full.${EXT}; do
+					runme cmp $i $OUTDIR/${basename}_${samp}_${dct}_djpeg.${EXT}
+					rm $i
+				done
+			fi
+		done
+		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $ALLOCARG
+		for samp in 420 422; do
+			if [ $ALLOC = 1 ]; then
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
+			else
+				for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].${EXT} \
+					$OUTDIR/${basename}_${samp}_Q95_full.${EXT}; do
+					runme cmp $i $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.${EXT}
+					rm $i
+				done
+			fi
+		done
+
+		# Tiled decompression
+		for samp in GRAY 444; do
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $ALLOCARG
+			if [ $ALLOC = 1 ]; then
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_djpeg.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
+			else
+				for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].${EXT} \
+					$OUTDIR/${basename}_${samp}_Q95_full.${EXT}; do
+					runme cmp $i $OUTDIR/${basename}_${samp}_${dct}_djpeg.${EXT}
+					rm $i
+				done
+			fi
+		done
+		for samp in 420 422; do
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $ALLOCARG
+			if [ $ALLOC = 1 ]; then
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
+			else
+				for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].${EXT} \
+					$OUTDIR/${basename}_${samp}_Q95_full.${EXT}; do
+					runme cmp $i $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.${EXT}
+					rm $i
+				done
+			fi
+		done
+	done
+
+	# Scaled decompression
+	for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+		scalearg=`echo $scale | sed s@_@/@g`
+		for samp in GRAY 420 422 444; do
+			runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_${scale}_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG
+			runme cmp $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT} $OUTDIR/${basename}_${samp}_${scale}_djpeg.${EXT}
+			rm $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT}
+		done
+	done
+
+	# Transforms
+	for samp in GRAY 420 422 444; do
+		runme $EXEDIR/jpegtran -flip horizontal -trim -outfile $OUTDIR/${basename}_${samp}_hflip_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -flip vertical -trim -outfile $OUTDIR/${basename}_${samp}_vflip_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -transpose -trim -outfile $OUTDIR/${basename}_${samp}_transpose_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -transverse -trim -outfile $OUTDIR/${basename}_${samp}_transverse_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -rotate 90 -trim -outfile $OUTDIR/${basename}_${samp}_rot90_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -rotate 180 -trim -outfile $OUTDIR/${basename}_${samp}_rot180_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -rotate 270 -trim -outfile $OUTDIR/${basename}_${samp}_rot270_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+	done
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444; do
+			runme $EXEDIR/djpeg -rgb $BMPARG -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG
+			if [ $ALLOC = 1 ]; then
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
+			else
+				for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].${EXT} \
+					$OUTDIR/${basename}_${samp}_Q95_full.${EXT}; do
+					runme cmp $i $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT}
+					rm $i
+				done
+			fi
+		done
+		for samp in 420 422; do
+			runme $EXEDIR/djpeg -nosmooth -rgb $BMPARG -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample $YUVARG $ALLOCARG
+			if [ $ALLOC = 1 ]; then
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
+			else
+				for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].${EXT} \
+					$OUTDIR/${basename}_${samp}_Q95_full.${EXT}; do
+					runme cmp $i $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT}
+					rm $i
+				done
+			fi
+		done
+	done
+
+	# Grayscale transform
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 -grayscale $YUVARG $ALLOCARG
+			if [ $ALLOC = 1 ]; then
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_GRAY_${xform}_jpegtran.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
+			else
+				for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].${EXT} \
+					$OUTDIR/${basename}_${samp}_Q95_full.${EXT}; do
+					runme cmp $i $OUTDIR/${basename}_GRAY_${xform}_jpegtran.${EXT}
+					rm $i
+				done
+			fi
+		done
+	done
+
+	# Transforms with scaling
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+				scalearg=`echo $scale | sed s@_@/@g`
+				runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+				runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG
+				runme cmp $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT} $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.${EXT}
+				rm $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT}
+			done
+		done
+	done
+
+done
+
+echo SUCCESS!
diff --git a/tjbenchtest.java.in b/tjbenchtest.java.in
new file mode 100644
index 0000000..acdabd0
--- /dev/null
+++ b/tjbenchtest.java.in
@@ -0,0 +1,207 @@
+#!/bin/bash
+
+set -u
+set -e
+trap onexit INT
+trap onexit TERM
+trap onexit EXIT
+
+onexit()
+{
+	if [ -d "${OUTDIR:-}" ]; then	# OUTDIR is still unset if mktemp failed; avoid a set -u error in the trap
+		rm -rf "$OUTDIR"
+	fi
+}
+
+runme()
+{
+	echo "*** $*"	# log the command line before running it
+	"$@"	# run it as given, without splitting or globbing the arguments a second time
+}
+
+IMAGES="vgl_5674_0098.bmp vgl_6434_0018a.bmp vgl_6548_0026a.bmp nightshot_iso_100.bmp"
+IMGDIR=@srcdir@/testimages
+OUTDIR=`mktemp -d /tmp/__tjbenchtest_java_output.XXXXXX`
+EXEDIR=.
+JAVA="@JAVA@ -cp java/turbojpeg.jar -Djava.library.path=.libs"
+BMPARG=
+NSARG=
+YUVARG=
+
+if [ -d $OUTDIR ]; then
+	rm -rf $OUTDIR
+fi
+mkdir -p $OUTDIR
+
+exec >$EXEDIR/tjbenchtest-java.log
+
+if [ $# -gt 0 ]; then
+	if [ "$1" = "-yuv" ]; then
+		NSARG=-nosmooth
+		YUVARG=-yuv
+
+# NOTE: The combination of tjEncodeYUV*() and tjCompressFromYUV*() does not
+# always produce bitwise-identical results to tjCompress*() if subsampling is
+# enabled.  In both cases, if the image width or height are not evenly
+# divisible by the MCU width/height, then the bottom and/or right edge are
+# expanded.  However, the libjpeg code performs this expansion prior to
+# downsampling, and TurboJPEG performs it in tjCompressFromYUV*(), which is
+# after downsampling.  Thus, the two will agree only if the width/height along
+# each downsampled dimension is an odd number or is evenly divisible by the MCU
+# width/height.  This disagreement basically amounts to a round-off error, but
+# there is no easy way around it, so for now, we just test the only image that
+# works.  (NOTE: nightshot_iso_100 does not suffer from the above issue, but
+# it suffers from an unrelated problem whereby the combination of
+# tjDecompressToYUV*() and tjDecodeYUV*() does not produce bitwise-identical
+# results to tjDecompress*() if decompression scaling is enabled.  This latter
+# phenomenon is not yet fully understood but is also believed to be some sort
+# of round-off error.)
+		IMAGES="vgl_6548_0026a.bmp"
+	fi
+fi
+
+# Standard tests
+for image in $IMAGES; do
+
+	cp $IMGDIR/$image $OUTDIR
+	basename=`basename $image .bmp`
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -grayscale -outfile $OUTDIR/${basename}_GRAY_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x2 -outfile $OUTDIR/${basename}_420_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x1 -outfile $OUTDIR/${basename}_422_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 1x1 -outfile $OUTDIR/${basename}_444_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -grayscale -outfile $OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x2 -outfile $OUTDIR/${basename}_420_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x1 -outfile $OUTDIR/${basename}_422_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 1x1 -outfile $OUTDIR/${basename}_444_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	for samp in GRAY 420 422 444; do
+		runme $EXEDIR/djpeg -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_default_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct fast -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_fast_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct int -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_accurate_djpeg.bmp $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg
+	done
+	for samp in 420 422; do
+		runme $EXEDIR/djpeg -nosmooth -bmp -outfile $OUTDIR/${basename}_${samp}_default_nosmooth_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct fast -nosmooth -bmp -outfile $OUTDIR/${basename}_${samp}_fast_nosmooth_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct int -nosmooth -bmp -outfile $OUTDIR/${basename}_${samp}_accurate_nosmooth_djpeg.bmp $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg
+	done
+
+	# Compression
+	for dct in accurate fast; do
+		runme $JAVA TJBench $OUTDIR/$image 95 -rgb -quiet -benchtime 0.01 -warmup 0 -${dct}dct $YUVARG
+		for samp in GRAY 420 422 444; do
+			runme cmp $OUTDIR/${basename}_${samp}_Q95.jpg $OUTDIR/${basename}_${samp}_${dct}_cjpeg.jpg
+		done
+	done
+
+	for dct in fast accurate default; do
+		dctarg=-${dct}dct
+		if [ "${dct}" = "default" ]; then
+			dctarg=
+		fi
+
+		# Tiled compression & decompression
+		runme $JAVA TJBench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG
+		for samp in GRAY 444; do
+			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
+				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
+				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${dct}_djpeg.bmp
+				rm $i
+			done
+		done
+		runme $JAVA TJBench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG
+		for samp in 420 422; do
+			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
+				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
+				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.bmp
+				rm $i
+			done
+		done
+
+		# Tiled decompression
+		for samp in GRAY 444; do
+			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG
+			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
+				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
+				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${dct}_djpeg.bmp
+				rm $i
+			done
+		done
+		for samp in 420 422; do	# fast-upsampled output is checked against the djpeg -nosmooth reference
+			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG
+			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
+				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
+				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.bmp
+				rm $i
+			done
+		done
+	done
+
+	# Scaled decompression
+	for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+		scalearg=`echo $scale | sed s@_@/@g`
+		for samp in GRAY 420 422 444; do
+			runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG -bmp -outfile $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp
+			rm $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp
+		done
+	done
+
+	# Transforms
+	for samp in GRAY 420 422 444; do
+		runme $EXEDIR/jpegtran -flip horizontal -trim -outfile $OUTDIR/${basename}_${samp}_hflip_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -flip vertical -trim -outfile $OUTDIR/${basename}_${samp}_vflip_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -transpose -trim -outfile $OUTDIR/${basename}_${samp}_transpose_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -transverse -trim -outfile $OUTDIR/${basename}_${samp}_transverse_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -rotate 90 -trim -outfile $OUTDIR/${basename}_${samp}_rot90_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -rotate 180 -trim -outfile $OUTDIR/${basename}_${samp}_rot180_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+		runme $EXEDIR/jpegtran -rotate 270 -trim -outfile $OUTDIR/${basename}_${samp}_rot270_jpegtran.jpg $OUTDIR/${basename}_${samp}_Q95.jpg
+	done
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444; do
+			runme $EXEDIR/djpeg -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 $YUVARG
+			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
+				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
+				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+				rm $i
+			done
+		done
+		for samp in 420 422; do
+			runme $EXEDIR/djpeg -nosmooth -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample $YUVARG
+			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
+				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
+				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+				rm $i
+			done
+		done
+	done
+
+	# Grayscale transform
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 -grayscale $YUVARG
+			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
+				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
+				runme cmp -i 54:54 $i $OUTDIR/${basename}_GRAY_${xform}_jpegtran.bmp
+				rm $i
+			done
+		done
+	done
+
+	# Transforms with scaling
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+				scalearg=`echo $scale | sed s@_@/@g`
+				runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+				runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG
+				runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp
+				rm $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp
+			done
+		done
+	done
+
+done
+
+echo SUCCESS!
diff --git a/tjexampletest.in b/tjexampletest.in
new file mode 100644
index 0000000..40b342e
--- /dev/null
+++ b/tjexampletest.in
@@ -0,0 +1,150 @@
+#!/bin/bash
+
+set -u
+set -e
+trap onexit INT
+trap onexit TERM
+trap onexit EXIT
+
+onexit()
+{
+	if [ -d "${OUTDIR:-}" ]; then	# the trap is installed before OUTDIR is assigned; tolerate it being unset under set -u
+		rm -rf "$OUTDIR"
+	fi
+}
+
+runme()
+{
+	echo "*** $*"	# log the command line before running it
+	"$@"	# run it as given, without splitting or globbing the arguments a second time
+}
+
+IMAGES="vgl_5674_0098.bmp vgl_6434_0018a.bmp vgl_6548_0026a.bmp nightshot_iso_100.bmp"
+IMGDIR=@srcdir@/testimages
+OUTDIR=__tjexampletest_output
+EXEDIR=.
+JAVA="@JAVA@ -cp java/turbojpeg.jar -Djava.library.path=.libs"
+
+if [ -d $OUTDIR ]; then
+	rm -rf $OUTDIR
+fi
+mkdir -p $OUTDIR
+
+exec >$EXEDIR/tjexampletest.log
+
+for image in $IMAGES; do
+
+	cp $IMGDIR/$image $OUTDIR
+	basename=`basename $image .bmp`
+	$EXEDIR/cjpeg -quality 95 -dct fast -grayscale $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_GRAY_fast_cjpeg.jpg
+	$EXEDIR/cjpeg -quality 95 -dct fast -sample 2x2 $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_420_fast_cjpeg.jpg
+	$EXEDIR/cjpeg -quality 95 -dct fast -sample 2x1 $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_422_fast_cjpeg.jpg
+	$EXEDIR/cjpeg -quality 95 -dct fast -sample 1x1 $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_444_fast_cjpeg.jpg
+	$EXEDIR/cjpeg -quality 95 -dct int -grayscale $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg
+	$EXEDIR/cjpeg -quality 95 -dct int -sample 2x2 $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_420_accurate_cjpeg.jpg
+	$EXEDIR/cjpeg -quality 95 -dct int -sample 2x1 $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_422_accurate_cjpeg.jpg
+	$EXEDIR/cjpeg -quality 95 -dct int -sample 1x1 $IMGDIR/${basename}.bmp >$OUTDIR/${basename}_444_accurate_cjpeg.jpg
+	for samp in GRAY 420 422 444; do
+		$EXEDIR/djpeg -rgb -bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg >$OUTDIR/${basename}_${samp}_default_djpeg.bmp
+		$EXEDIR/djpeg -dct fast -rgb -bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg >$OUTDIR/${basename}_${samp}_fast_djpeg.bmp
+		$EXEDIR/djpeg -dct int -rgb -bmp $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg >$OUTDIR/${basename}_${samp}_accurate_djpeg.bmp
+	done
+	for samp in 420 422; do
+		$EXEDIR/djpeg -nosmooth -bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg >$OUTDIR/${basename}_${samp}_default_nosmooth_djpeg.bmp
+		$EXEDIR/djpeg -dct fast -nosmooth -bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg >$OUTDIR/${basename}_${samp}_fast_nosmooth_djpeg.bmp
+		$EXEDIR/djpeg -dct int -nosmooth -bmp $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg >$OUTDIR/${basename}_${samp}_accurate_nosmooth_djpeg.bmp
+	done
+
+	# Compression
+	for dct in fast accurate; do
+		for samp in GRAY 420 422 444; do
+			runme $JAVA TJExample $OUTDIR/$image $OUTDIR/${basename}_${samp}_${dct}.jpg -q 95 -samp ${samp} -${dct}dct
+			runme cmp $OUTDIR/${basename}_${samp}_${dct}.jpg $OUTDIR/${basename}_${samp}_${dct}_cjpeg.jpg
+		done
+	done
+
+	# Decompression
+	for dct in fast accurate default; do
+		srcdct=${dct}
+		dctarg=-${dct}dct
+		if [ "${dct}" = "default" ]; then
+			srcdct=fast
+			dctarg=
+		fi
+		for samp in GRAY 420 422 444; do
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}.bmp ${dctarg}
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${dct}.bmp $OUTDIR/${basename}_${samp}_${dct}_djpeg.bmp
+			rm $OUTDIR/${basename}_${samp}_${dct}.bmp
+		done
+		for samp in 420 422; do
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp -fastupsample ${dctarg}
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.bmp
+			rm $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp
+		done
+	done
+
+	# Scaled decompression
+	for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+		scalearg=`echo $scale | sed s@_@/@g`
+		for samp in GRAY 420 422 444; do
+			$EXEDIR/djpeg -rgb -bmp -scale ${scalearg} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg >$OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${scale}.bmp -scale ${scalearg}
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${scale}.bmp $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp
+			rm $OUTDIR/${basename}_${samp}_${scale}.bmp
+		done
+	done
+
+	# Transforms
+	for samp in GRAY 420 422 444; do
+		$EXEDIR/jpegtran -crop 70x60+16+16 -flip horizontal -trim $OUTDIR/${basename}_${samp}_fast.jpg >$OUTDIR/${basename}_${samp}_hflip_jpegtran.jpg
+		$EXEDIR/jpegtran -crop 70x60+16+16 -flip vertical -trim $OUTDIR/${basename}_${samp}_fast.jpg >$OUTDIR/${basename}_${samp}_vflip_jpegtran.jpg
+		$EXEDIR/jpegtran -crop 70x60+16+16 -transpose -trim $OUTDIR/${basename}_${samp}_fast.jpg >$OUTDIR/${basename}_${samp}_transpose_jpegtran.jpg
+		$EXEDIR/jpegtran -crop 70x60+16+16 -transverse -trim $OUTDIR/${basename}_${samp}_fast.jpg >$OUTDIR/${basename}_${samp}_transverse_jpegtran.jpg
+		$EXEDIR/jpegtran -crop 70x60+16+16 -rotate 90 -trim $OUTDIR/${basename}_${samp}_fast.jpg >$OUTDIR/${basename}_${samp}_rot90_jpegtran.jpg
+		$EXEDIR/jpegtran -crop 70x60+16+16 -rotate 180 -trim $OUTDIR/${basename}_${samp}_fast.jpg >$OUTDIR/${basename}_${samp}_rot180_jpegtran.jpg
+		$EXEDIR/jpegtran -crop 70x60+16+16 -rotate 270 -trim $OUTDIR/${basename}_${samp}_fast.jpg >$OUTDIR/${basename}_${samp}_rot270_jpegtran.jpg
+	done
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 420 422 444; do
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -crop 16,16,70x60
+			runme cmp $OUTDIR/${basename}_${samp}_${xform}.jpg $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			$EXEDIR/djpeg -rgb -bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg >$OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 16,16,70x60
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
+		done
+		for samp in 420 422; do
+			$EXEDIR/djpeg -nosmooth -rgb -bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg >$OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 16,16,70x60 -fastupsample
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
+		done
+	done
+
+	# Grayscale transform
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -grayscale -crop 16,16,70x60
+			runme cmp $OUTDIR/${basename}_${samp}_${xform}.jpg $OUTDIR/${basename}_GRAY_${xform}_jpegtran.jpg
+			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -grayscale -crop 16,16,70x60
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_GRAY_${xform}_jpegtran.bmp
+			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
+		done
+	done
+
+	# Transforms with scaling
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+				scalearg=`echo $scale | sed s@_@/@g`
+				$EXEDIR/djpeg -rgb -bmp -scale ${scalearg} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg >$OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp
+				runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp -$xform -scale ${scalearg} -crop 16,16,70x60
+				runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp
+				rm $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp
+			done
+		done
+	done
+
+done
+
+echo SUCCESS!
diff --git a/tjunittest.c b/tjunittest.c
index 3bb194d..6a4022f 100644
--- a/tjunittest.c
+++ b/tjunittest.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2012, 2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,8 @@
 	printf("\nUSAGE: %s [options]\n", progName);
 	printf("Options:\n");
 	printf("-yuv = test YUV encoding/decoding support\n");
+	printf("-noyuvpad = do not pad each line of each Y, U, and V plane to the nearest\n");
+	printf("            4-byte boundary\n");
 	printf("-alloc = test automatic buffer allocation\n");
 	exit(1);
 }
@@ -59,25 +61,25 @@
 
 const char *subNameLong[TJ_NUMSAMP]=
 {
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0"
+	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440"};
+const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
 
 const char *pixFormatStr[TJ_NUMPF]=
 {
 	"RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
-	"RGBA", "BGRA", "ABGR", "ARGB"
+	"RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
 };
 
-const int alphaOffset[TJ_NUMPF] = {-1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0};
+const int alphaOffset[TJ_NUMPF] = {-1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1};
 
 const int _3byteFormats[]={TJPF_RGB, TJPF_BGR};
-const int _4byteFormats[]={TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB};
+const int _4byteFormats[]={TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB,
+	TJPF_CMYK};
 const int _onlyGray[]={TJPF_GRAY};
 const int _onlyRGB[]={TJPF_RGB};
 
-enum {YUVENCODE=1, YUVDECODE};
-int yuv=0, alloc=0;
+int doyuv=0, alloc=0, pad=4;
 
 int exitStatus=0;
 #define bailout() {exitStatus=-1;  goto bailout;}
@@ -91,9 +93,9 @@
 	int ps=tjPixelSize[pf];
 	int index, row, col, halfway=16;
 
-	memset(buf, 0, w*h*ps);
 	if(pf==TJPF_GRAY)
 	{
+		memset(buf, 0, w*h*ps);
 		for(row=0; row<h; row++)
 		{
 			for(col=0; col<w; col++)
@@ -105,8 +107,30 @@
 			}
 		}
 	}
+	else if(pf==TJPF_CMYK)
+	{
+		memset(buf, 255, w*h*ps);
+		for(row=0; row<h; row++)
+		{
+			for(col=0; col<w; col++)
+			{
+				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
+				else index=row*w+col;
+				if(((row/8)+(col/8))%2==0)
+				{
+					if(row>=halfway) buf[index*ps+3]=0;
+				}
+				else
+				{
+					buf[index*ps+2]=0;
+					if(row<halfway) buf[index*ps+1]=0;
+				}
+			}
+		}
+	}
 	else
 	{
+		memset(buf, 0, w*h*ps);
 		for(row=0; row<h; row++)
 		{
 			for(col=0; col<w; col++)
@@ -165,6 +189,36 @@
 	int halfway=16*sf.num/sf.denom;
 	int blocksize=8*sf.num/sf.denom;
 
+	if(pf==TJPF_CMYK)
+	{
+		for(row=0; row<h; row++)
+		{
+			for(col=0; col<w; col++)
+			{
+				unsigned char c, m, y, k;
+				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
+				else index=row*w+col;
+				c=buf[index*ps];
+				m=buf[index*ps+1];
+				y=buf[index*ps+2];
+				k=buf[index*ps+3];
+				if(((row/blocksize)+(col/blocksize))%2==0)
+				{
+					checkval255(c);  checkval255(m);  checkval255(y);
+					if(row<halfway) checkval255(k)
+					else checkval0(k)
+				}
+				else
+				{
+					checkval255(c);  checkval0(y);  checkval255(k);
+					if(row<halfway) checkval0(m)
+					else checkval255(m)
+				}
+			}
+		}
+		return 1;
+	}
+
 	for(row=0; row<h; row++)
 	{
 		for(col=0; col<w; col++)
@@ -223,8 +277,13 @@
 		{
 			for(col=0; col<w; col++)
 			{
-				printf("%.3d/%.3d/%.3d ", buf[(row*w+col)*ps+roffset],
-					buf[(row*w+col)*ps+goffset], buf[(row*w+col)*ps+boffset]);
+				if(pf==TJPF_CMYK)
+					printf("%.3d/%.3d/%.3d/%.3d ", buf[(row*w+col)*ps],
+						buf[(row*w+col)*ps+1], buf[(row*w+col)*ps+2],
+						buf[(row*w+col)*ps+3]);
+				else
+					printf("%.3d/%.3d/%.3d ", buf[(row*w+col)*ps+roffset],
+						buf[(row*w+col)*ps+goffset], buf[(row*w+col)*ps+boffset]);
 			}
 			printf("\n");
 		}
@@ -235,22 +294,24 @@
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
 
-int checkBufYUV(unsigned char *buf, int w, int h, int subsamp)
+int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
+	tjscalingfactor sf)
 {
 	int row, col;
 	int hsf=tjMCUWidth[subsamp]/8, vsf=tjMCUHeight[subsamp]/8;
 	int pw=PAD(w, hsf), ph=PAD(h, vsf);
 	int cw=pw/hsf, ch=ph/vsf;
-	int ypitch=PAD(pw, 4), uvpitch=PAD(cw, 4);
+	int ypitch=PAD(pw, pad), uvpitch=PAD(cw, pad);
 	int retval=1;
-	int halfway=16;
+	int halfway=16*sf.num/sf.denom;
+	int blocksize=8*sf.num/sf.denom;
 
 	for(row=0; row<ph; row++)
 	{
 		for(col=0; col<pw; col++)
 		{
 			unsigned char y=buf[ypitch*row+col];
-			if(((row/8)+(col/8))%2==0)
+			if(((row/blocksize)+(col/blocksize))%2==0)
 			{
 				if(row<halfway) checkval255(y)  else checkval0(y);
 			}
@@ -262,14 +323,14 @@
 	}
 	if(subsamp!=TJSAMP_GRAY)
 	{
-		halfway=16/vsf;
+		int halfway=16/vsf*sf.num/sf.denom;
 		for(row=0; row<ch; row++)
 		{
 			for(col=0; col<cw; col++)
 			{
 				unsigned char u=buf[ypitch*ph + (uvpitch*row+col)],
 					v=buf[ypitch*ph + uvpitch*ch + (uvpitch*row+col)];
-				if(((row*vsf/8)+(col*hsf/8))%2==0)
+				if(((row*vsf/blocksize)+(col*hsf/blocksize))%2==0)
 				{
 					checkval(u, 128);  checkval(v, 128);
 				}
@@ -335,57 +396,57 @@
 	unsigned long *dstSize, int w, int h, int pf, char *basename,
 	int subsamp, int jpegQual, int flags)
 {
-	char tempStr[1024];  unsigned char *srcBuf=NULL;
-	double t;
-
-	if(yuv==YUVENCODE)
-		printf("%s %s -> %s YUV ... ", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ", subNameLong[subsamp]);
-	else
-		printf("%s %s -> %s Q%d ... ", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ", subNameLong[subsamp],
-			jpegQual);
+	char tempStr[1024];  unsigned char *srcBuf=NULL, *yuvBuf=NULL;
+	const char *pfStr=pixFormatStr[pf];
+	const char *buStrLong=(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ";
+	const char *buStr=(flags&TJFLAG_BOTTOMUP)? "BU":"TD";
 
 	if((srcBuf=(unsigned char *)malloc(w*h*tjPixelSize[pf]))==NULL)
 		_throw("Memory allocation failure");
 	initBuf(srcBuf, w, h, pf, flags);
+
 	if(*dstBuf && *dstSize>0) memset(*dstBuf, 0, *dstSize);
 
-	t=gettime();
-	if(yuv==YUVENCODE)
+
+	if(!alloc) flags|=TJFLAG_NOREALLOC;
+	if(doyuv)
 	{
-		_tj(tjEncodeYUV2(handle, srcBuf, w, 0, h, pf, *dstBuf, subsamp, flags));
+		unsigned long yuvSize=tjBufSizeYUV2(w, pad, h, subsamp);
+		tjscalingfactor sf={1, 1};
+		tjhandle handle2=tjInitCompress();
+		if(!handle2) _throwtj();
+
+		if((yuvBuf=(unsigned char *)malloc(yuvSize))==NULL)
+			_throw("Memory allocation failure");
+		memset(yuvBuf, 0, yuvSize);
+
+		printf("%s %s -> YUV %s ... ", pfStr, buStrLong, subNameLong[subsamp]);
+		_tj(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
+			flags));
+		tjDestroy(handle2);
+		if(checkBufYUV(yuvBuf, w, h, subsamp, sf)) printf("Passed.\n");
+		else printf("FAILED!\n");
+
+		printf("YUV %s %s -> JPEG Q%d ... ", subNameLong[subsamp], buStrLong,
+			jpegQual);
+		_tj(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf,
+			dstSize, jpegQual, flags));
 	}
 	else
 	{
-		if(!alloc)
-		{
-			flags|=TJFLAG_NOREALLOC;
-			*dstSize=(yuv==YUVENCODE? tjBufSizeYUV(w, h, subsamp)
-				: tjBufSize(w, h, subsamp));
-		}
+		printf("%s %s -> %s Q%d ... ", pfStr, buStrLong, subNameLong[subsamp],
+			jpegQual);
 		_tj(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
 			jpegQual, flags));
 	}
-	t=gettime()-t;
 
-	if(yuv==YUVENCODE)
-		snprintf(tempStr, 1024, "%s_enc_%s_%s_%s.yuv", basename, pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subName[subsamp]);
-	else
-		snprintf(tempStr, 1024, "%s_enc_%s_%s_%s_Q%d.jpg", basename,
-			pixFormatStr[pf], (flags&TJFLAG_BOTTOMUP)? "BU":"TD", subName[subsamp],
-			jpegQual);
+	snprintf(tempStr, 1024, "%s_enc_%s_%s_%s_Q%d.jpg", basename, pfStr, buStr,
+		subName[subsamp], jpegQual);
 	writeJPEG(*dstBuf, *dstSize, tempStr);
-	if(yuv==YUVENCODE)
-	{
-		if(checkBufYUV(*dstBuf, w, h, subsamp)) printf("Passed.");
-		else printf("FAILED!");
-	}
-	else printf("Done.");
-	printf("  %f ms\n  Result in %s\n", t*1000., tempStr);
+	printf("Done.\n  Result in %s\n", tempStr);
 
 	bailout:
+	if(yuvBuf) free(yuvBuf);
 	if(srcBuf) free(srcBuf);
 }
 
@@ -394,16 +455,49 @@
 	unsigned long jpegSize, int w, int h, int pf, char *basename, int subsamp,
 	int flags, tjscalingfactor sf)
 {
-	unsigned char *dstBuf=NULL;
-	int _hdrw=0, _hdrh=0, _hdrsubsamp=-1;  double t;
+	unsigned char *dstBuf=NULL, *yuvBuf=NULL;
+	int _hdrw=0, _hdrh=0, _hdrsubsamp=-1;
 	int scaledWidth=TJSCALED(w, sf);
 	int scaledHeight=TJSCALED(h, sf);
 	unsigned long dstSize=0;
 
-	if(yuv==YUVENCODE) return;
+	_tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
+		&_hdrsubsamp));
+	if(_hdrw!=w || _hdrh!=h || _hdrsubsamp!=subsamp)
+		_throw("Incorrect JPEG header");
 
-	if(yuv==YUVDECODE)
-		printf("JPEG -> YUV %s ... ", subNameLong[subsamp]);
+	dstSize=scaledWidth*scaledHeight*tjPixelSize[pf];
+	if((dstBuf=(unsigned char *)malloc(dstSize))==NULL)
+		_throw("Memory allocation failure");
+	memset(dstBuf, 0, dstSize);
+
+	if(doyuv)
+	{
+		unsigned long yuvSize=tjBufSizeYUV2(scaledWidth, pad, scaledHeight,
+			subsamp);
+		tjhandle handle2=tjInitDecompress();
+		if(!handle2) _throwtj();
+
+		if((yuvBuf=(unsigned char *)malloc(yuvSize))==NULL)
+			_throw("Memory allocation failure");
+		memset(yuvBuf, 0, yuvSize);
+
+		printf("JPEG -> YUV %s ", subNameLong[subsamp]);
+		if(sf.num!=1 || sf.denom!=1)
+			printf("%d/%d ... ", sf.num, sf.denom);
+		else printf("... ");
+		_tj(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth,
+			pad, scaledHeight, flags));
+		if(checkBufYUV(yuvBuf, scaledWidth, scaledHeight, subsamp, sf))
+			printf("Passed.\n");
+		else printf("FAILED!\n");
+
+		printf("YUV %s -> %s %s ... ", subNameLong[subsamp], pixFormatStr[pf],
+			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ");
+		_tj(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
+			scaledHeight, pf, flags));
+		tjDestroy(handle2);
+	}
 	else
 	{
 		printf("JPEG -> %s %s ", pixFormatStr[pf],
@@ -411,45 +505,17 @@
 		if(sf.num!=1 || sf.denom!=1)
 			printf("%d/%d ... ", sf.num, sf.denom);
 		else printf("... ");
-	}
-
-	_tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
-		&_hdrsubsamp));
-	if(_hdrw!=w || _hdrh!=h || _hdrsubsamp!=subsamp)
-		_throw("Incorrect JPEG header");
-
-	if(yuv==YUVDECODE) dstSize=tjBufSizeYUV(w, h, subsamp);
-	else dstSize=scaledWidth*scaledHeight*tjPixelSize[pf];
-	if((dstBuf=(unsigned char *)malloc(dstSize))==NULL)
-		_throw("Memory allocation failure");
-	memset(dstBuf, 0, dstSize);
-
-	t=gettime();
-	if(yuv==YUVDECODE)
-	{
-		_tj(tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags));
-	}
-	else
-	{
 		_tj(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
 			scaledHeight, pf, flags));
 	}
-	t=gettime()-t;
 
-	if(yuv==YUVDECODE)
-	{
-		if(checkBufYUV(dstBuf, w, h, subsamp)) printf("Passed.");
-		else printf("FAILED!");
-	}
-	else
-	{
-		if(checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
-			printf("Passed.");
-		else printf("FAILED!");
-	}
-	printf("  %f ms\n", t*1000.);
+	if(checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
+		printf("Passed.");
+	else printf("FAILED!");
+	printf("\n");
 
 	bailout:
+	if(yuvBuf) free(yuvBuf);
 	if(dstBuf) free(dstBuf);
 }
 
@@ -459,18 +525,19 @@
 	int flags)
 {
 	int i, n=0;
-	tjscalingfactor *sf=tjGetScalingFactors(&n), sf1={1, 1};
+	tjscalingfactor *sf=tjGetScalingFactors(&n);
 	if(!sf || !n) _throwtj();
 
-	if((subsamp==TJSAMP_444 || subsamp==TJSAMP_GRAY) && !yuv)
+	for(i=0; i<n; i++)
 	{
-		for(i=0; i<n; i++)
+		if(subsamp==TJSAMP_444 || subsamp==TJSAMP_GRAY ||
+			(subsamp==TJSAMP_411 && sf[i].num==1 &&
+				(sf[i].denom==2 || sf[i].denom==1)) ||
+			(subsamp!=TJSAMP_411 && sf[i].num==1 &&
+				(sf[i].denom==4 || sf[i].denom==2 || sf[i].denom==1)))
 			_decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp,
 				flags, sf[i]);
 	}
-	else
-		_decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp, flags,
-			sf1);
 
 	bailout:
 	return;
@@ -485,12 +552,10 @@
 	unsigned long size=0;  int pfi, pf, i;
 
 	if(!alloc)
-	{
-		size=(yuv==YUVENCODE? tjBufSizeYUV(w, h, subsamp)
-			: tjBufSize(w, h, subsamp));
+		size=tjBufSize(w, h, subsamp);
+	if(size!=0)
 		if((dstBuf=(unsigned char *)tjAlloc(size))==NULL)
 			_throw("Memory allocation failure.");
-	}
 
 	if((chandle=tjInitCompress())==NULL || (dhandle=tjInitDecompress())==NULL)
 		_throwtj();
@@ -500,13 +565,10 @@
 		for(i=0; i<2; i++)
 		{
 			int flags=0;
-			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440)
+			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440 ||
+				subsamp==TJSAMP_411)
 				flags|=TJFLAG_FASTUPSAMPLE;
-			if(i==1)
-			{
-				if(yuv==YUVDECODE) goto bailout;
-				else flags|=TJFLAG_BOTTOMUP;
-			}
+			if(i==1) flags|=TJFLAG_BOTTOMUP;
 			pf=formats[pfi];
 			compTest(chandle, &dstBuf, &size, w, h, pf, basename, subsamp, 100,
 				flags);
@@ -551,9 +613,9 @@
 				if(h%100==0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", w, h);
 				if((srcBuf=(unsigned char *)malloc(w*h*4))==NULL)
 					_throw("Memory allocation failure");
-				if(!alloc || yuv==YUVENCODE)
+				if(!alloc || doyuv)
 				{
-					if(yuv==YUVENCODE) dstSize=tjBufSizeYUV(w, h, subsamp);
+					if(doyuv) dstSize=tjBufSizeYUV2(w, pad, h, subsamp);
 					else dstSize=tjBufSize(w, h, subsamp);
 					if((dstBuf=(unsigned char *)tjAlloc(dstSize))==NULL)
 						_throw("Memory allocation failure");
@@ -565,10 +627,10 @@
 					else srcBuf[i]=255;
 				}
 
-				if(yuv==YUVENCODE)
+				if(doyuv)
 				{
-					_tj(tjEncodeYUV2(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, subsamp,
-						0));
+					_tj(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
+						subsamp, 0));
 				}
 				else
 				{
@@ -576,13 +638,16 @@
 						&dstSize, subsamp, 100, alloc? 0:TJFLAG_NOREALLOC));
 				}
 				free(srcBuf);  srcBuf=NULL;
-				tjFree(dstBuf);  dstBuf=NULL;
+				if(!alloc || doyuv)
+				{
+					tjFree(dstBuf);  dstBuf=NULL;
+				}
 
 				if((srcBuf=(unsigned char *)malloc(h*w*4))==NULL)
 					_throw("Memory allocation failure");
-				if(!alloc || yuv==YUVENCODE)
+				if(!alloc || doyuv)
 				{
-					if(yuv==YUVENCODE) dstSize=tjBufSizeYUV(h, w, subsamp);
+					if(doyuv) dstSize=tjBufSizeYUV2(h, pad, w, subsamp);
 					else dstSize=tjBufSize(h, w, subsamp);
 					if((dstBuf=(unsigned char *)tjAlloc(dstSize))==NULL)
 						_throw("Memory allocation failure");
@@ -594,10 +659,10 @@
 					else srcBuf[i]=255;
 				}
 
-				if(yuv==YUVENCODE)
+				if(doyuv)
 				{
-					_tj(tjEncodeYUV2(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, subsamp,
-						0));
+					_tj(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
+						subsamp, 0));
 				}
 				else
 				{
@@ -605,7 +670,10 @@
 						&dstSize, subsamp, 100, alloc? 0:TJFLAG_NOREALLOC));
 				}
 				free(srcBuf);  srcBuf=NULL;
-				tjFree(dstBuf);  dstBuf=NULL;
+				if(!alloc || doyuv)
+				{
+					tjFree(dstBuf);  dstBuf=NULL;
+				}
 			}
 		}
 	}
@@ -613,14 +681,14 @@
 
 	bailout:
 	if(srcBuf) free(srcBuf);
-	if(dstBuf) free(dstBuf);
+	if(dstBuf) tjFree(dstBuf);
 	if(handle) tjDestroy(handle);
 }
 
 
 int main(int argc, char *argv[])
 {
-	int doyuv=0, i;
+	int i, num4bf=5;
 	#ifdef _WIN32
 	srand((unsigned int)time(NULL));
 	#endif
@@ -629,41 +697,38 @@
 		for(i=1; i<argc; i++)
 		{
 			if(!strcasecmp(argv[i], "-yuv")) doyuv=1;
+			if(!strcasecmp(argv[i], "-noyuvpad")) pad=1;
 			if(!strcasecmp(argv[i], "-alloc")) alloc=1;
 			if(!strncasecmp(argv[i], "-h", 2) || !strcasecmp(argv[i], "-?"))
 				usage(argv[0]);
 		}
 	}
 	if(alloc) printf("Testing automatic buffer allocation\n");
-	if(doyuv) {yuv=YUVENCODE;  alloc=0;}
+	if(doyuv) num4bf=4;
 	doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
-	doTest(39, 41, _4byteFormats, 4, TJSAMP_444, "test");
+	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_444, "test");
 	doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
-	doTest(35, 39, _4byteFormats, 4, TJSAMP_422, "test");
+	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_422, "test");
 	doTest(39, 41, _3byteFormats, 2, TJSAMP_420, "test");
-	doTest(41, 35, _4byteFormats, 4, TJSAMP_420, "test");
+	doTest(41, 35, _4byteFormats, num4bf, TJSAMP_420, "test");
 	doTest(35, 39, _3byteFormats, 2, TJSAMP_440, "test");
-	doTest(39, 41, _4byteFormats, 4, TJSAMP_440, "test");
-	doTest(35, 39, _onlyGray, 1, TJSAMP_GRAY, "test");
-	doTest(39, 41, _3byteFormats, 2, TJSAMP_GRAY, "test");
-	doTest(41, 35, _4byteFormats, 4, TJSAMP_GRAY, "test");
+	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_440, "test");
+	doTest(41, 35, _3byteFormats, 2, TJSAMP_411, "test");
+	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_411, "test");
+	doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test");
+	doTest(41, 35, _3byteFormats, 2, TJSAMP_GRAY, "test");
+	doTest(35, 39, _4byteFormats, 4, TJSAMP_GRAY, "test");
 	bufSizeTest();
 	if(doyuv)
 	{
 		printf("\n--------------------\n\n");
-		yuv=YUVDECODE;
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_444, "test_yuv0");
-		doTest(35, 39, _onlyRGB, 1, TJSAMP_444, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_422, "test_yuv0");
-		doTest(39, 41, _onlyRGB, 1, TJSAMP_422, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_420, "test_yuv0");
-		doTest(41, 35, _onlyRGB, 1, TJSAMP_420, "test_yuv1");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_440, "test_yuv0");
-		doTest(35, 39, _onlyRGB, 1, TJSAMP_440, "test_yuv1");
+		doTest(48, 48, _onlyRGB, 1, TJSAMP_411, "test_yuv0");
 		doTest(48, 48, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv0");
-		doTest(35, 39, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv1");
 		doTest(48, 48, _onlyGray, 1, TJSAMP_GRAY, "test_yuv0");
-		doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test_yuv1");
 	}
 
 	return exitStatus;
diff --git a/transupp.c b/transupp.c
index a16b20a..d1c56c6 100644
--- a/transupp.c
+++ b/transupp.c
@@ -5,7 +5,8 @@
  * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains image transformation routines and other utility code
  * used by the jpegtran sample application.  These are NOT part of the core
@@ -21,9 +22,9 @@
 
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "transupp.h"		/* My own external interface */
+#include "transupp.h"           /* My own external interface */
 #include "jpegcomp.h"
-#include <ctype.h>		/* to declare isdigit() */
+#include <ctype.h>              /* to declare isdigit() */
 
 
 #if JPEG_LIB_VERSION >= 70
@@ -89,9 +90,9 @@
 
 LOCAL(void)
 do_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	 jvirt_barray_ptr *src_coef_arrays,
-	 jvirt_barray_ptr *dst_coef_arrays)
+         JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+         jvirt_barray_ptr *src_coef_arrays,
+         jvirt_barray_ptr *dst_coef_arrays)
 /* Crop.  This is only used when no rotate/flip is requested with the crop. */
 {
   JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks;
@@ -107,18 +108,18 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       src_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	 dst_blk_y + y_crop_blocks,
-	 (JDIMENSION) compptr->v_samp_factor, FALSE);
+        ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+         dst_blk_y + y_crop_blocks,
+         (JDIMENSION) compptr->v_samp_factor, FALSE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
-			dst_buffer[offset_y],
-			compptr->width_in_blocks);
+        jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
+                        dst_buffer[offset_y],
+                        compptr->width_in_blocks);
       }
     }
   }
@@ -127,8 +128,8 @@
 
 LOCAL(void)
 do_flip_h_no_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-		   JDIMENSION x_crop_offset,
-		   jvirt_barray_ptr *src_coef_arrays)
+                   JDIMENSION x_crop_offset,
+                   jvirt_barray_ptr *src_coef_arrays)
 /* Horizontal flip; done in-place, so no separate dest array is required.
  * NB: this only works when y_crop_offset is zero.
  */
@@ -153,39 +154,39 @@
     comp_width = MCU_cols * compptr->h_samp_factor;
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     for (blk_y = 0; blk_y < compptr->height_in_blocks;
-	 blk_y += compptr->v_samp_factor) {
+         blk_y += compptr->v_samp_factor) {
       buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, src_coef_arrays[ci], blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, src_coef_arrays[ci], blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	/* Do the mirroring */
-	for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) {
-	  ptr1 = buffer[offset_y][blk_x];
-	  ptr2 = buffer[offset_y][comp_width - blk_x - 1];
-	  /* this unrolled loop doesn't need to know which row it's on... */
-	  for (k = 0; k < DCTSIZE2; k += 2) {
-	    temp1 = *ptr1;	/* swap even column */
-	    temp2 = *ptr2;
-	    *ptr1++ = temp2;
-	    *ptr2++ = temp1;
-	    temp1 = *ptr1;	/* swap odd column with sign change */
-	    temp2 = *ptr2;
-	    *ptr1++ = -temp2;
-	    *ptr2++ = -temp1;
-	  }
-	}
-	if (x_crop_blocks > 0) {
-	  /* Now left-justify the portion of the data to be kept.
-	   * We can't use a single jcopy_block_row() call because that routine
-	   * depends on memcpy(), whose behavior is unspecified for overlapping
-	   * source and destination areas.  Sigh.
-	   */
-	  for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
-	    jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks,
-			    buffer[offset_y] + blk_x,
-			    (JDIMENSION) 1);
-	  }
-	}
+        /* Do the mirroring */
+        for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) {
+          ptr1 = buffer[offset_y][blk_x];
+          ptr2 = buffer[offset_y][comp_width - blk_x - 1];
+          /* this unrolled loop doesn't need to know which row it's on... */
+          for (k = 0; k < DCTSIZE2; k += 2) {
+            temp1 = *ptr1;      /* swap even column */
+            temp2 = *ptr2;
+            *ptr1++ = temp2;
+            *ptr2++ = temp1;
+            temp1 = *ptr1;      /* swap odd column with sign change */
+            temp2 = *ptr2;
+            *ptr1++ = -temp2;
+            *ptr2++ = -temp1;
+          }
+        }
+        if (x_crop_blocks > 0) {
+          /* Now left-justify the portion of the data to be kept.
+           * We can't use a single jcopy_block_row() call because that routine
+           * depends on memcpy(), whose behavior is unspecified for overlapping
+           * source and destination areas.  Sigh.
+           */
+          for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+            jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks,
+                            buffer[offset_y] + blk_x,
+                            (JDIMENSION) 1);
+          }
+        }
       }
     }
   }
@@ -194,9 +195,9 @@
 
 LOCAL(void)
 do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	   JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	   jvirt_barray_ptr *src_coef_arrays,
-	   jvirt_barray_ptr *dst_coef_arrays)
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           jvirt_barray_ptr *dst_coef_arrays)
 /* Horizontal flip in general cropping case */
 {
   JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y;
@@ -220,34 +221,34 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       src_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	 dst_blk_y + y_crop_blocks,
-	 (JDIMENSION) compptr->v_samp_factor, FALSE);
+        ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+         dst_blk_y + y_crop_blocks,
+         (JDIMENSION) compptr->v_samp_factor, FALSE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	dst_row_ptr = dst_buffer[offset_y];
-	src_row_ptr = src_buffer[offset_y];
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
-	  if (x_crop_blocks + dst_blk_x < comp_width) {
-	    /* Do the mirrorable blocks */
-	    dst_ptr = dst_row_ptr[dst_blk_x];
-	    src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
-	    /* this unrolled loop doesn't need to know which row it's on... */
-	    for (k = 0; k < DCTSIZE2; k += 2) {
-	      *dst_ptr++ = *src_ptr++;	 /* copy even column */
-	      *dst_ptr++ = - *src_ptr++; /* copy odd column with sign change */
-	    }
-	  } else {
-	    /* Copy last partial block(s) verbatim */
-	    jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
-			    dst_row_ptr + dst_blk_x,
-			    (JDIMENSION) 1);
-	  }
-	}
+        dst_row_ptr = dst_buffer[offset_y];
+        src_row_ptr = src_buffer[offset_y];
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+          if (x_crop_blocks + dst_blk_x < comp_width) {
+            /* Do the mirrorable blocks */
+            dst_ptr = dst_row_ptr[dst_blk_x];
+            src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+            /* this unrolled loop doesn't need to know which row it's on... */
+            for (k = 0; k < DCTSIZE2; k += 2) {
+              *dst_ptr++ = *src_ptr++;   /* copy even column */
+              *dst_ptr++ = - *src_ptr++; /* copy odd column with sign change */
+            }
+          } else {
+            /* Copy last partial block(s) verbatim */
+            jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
+                            dst_row_ptr + dst_blk_x,
+                            (JDIMENSION) 1);
+          }
+        }
       }
     }
   }
@@ -256,9 +257,9 @@
 
 LOCAL(void)
 do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	   JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	   jvirt_barray_ptr *src_coef_arrays,
-	   jvirt_barray_ptr *dst_coef_arrays)
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           jvirt_barray_ptr *dst_coef_arrays)
 /* Vertical flip */
 {
   JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y;
@@ -285,49 +286,49 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       if (y_crop_blocks + dst_blk_y < comp_height) {
-	/* Row is within the mirrorable area. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   comp_height - y_crop_blocks - dst_blk_y -
-	   (JDIMENSION) compptr->v_samp_factor,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
+        /* Row is within the mirrorable area. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+           comp_height - y_crop_blocks - dst_blk_y -
+           (JDIMENSION) compptr->v_samp_factor,
+           (JDIMENSION) compptr->v_samp_factor, FALSE);
       } else {
-	/* Bottom-edge blocks will be copied verbatim. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   dst_blk_y + y_crop_blocks,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
+        /* Bottom-edge blocks will be copied verbatim. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks,
+           (JDIMENSION) compptr->v_samp_factor, FALSE);
       }
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	if (y_crop_blocks + dst_blk_y < comp_height) {
-	  /* Row is within the mirrorable area. */
-	  dst_row_ptr = dst_buffer[offset_y];
-	  src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
-	  src_row_ptr += x_crop_blocks;
-	  for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	       dst_blk_x++) {
-	    dst_ptr = dst_row_ptr[dst_blk_x];
-	    src_ptr = src_row_ptr[dst_blk_x];
-	    for (i = 0; i < DCTSIZE; i += 2) {
-	      /* copy even row */
-	      for (j = 0; j < DCTSIZE; j++)
-		*dst_ptr++ = *src_ptr++;
-	      /* copy odd row with sign change */
-	      for (j = 0; j < DCTSIZE; j++)
-		*dst_ptr++ = - *src_ptr++;
-	    }
-	  }
-	} else {
-	  /* Just copy row verbatim. */
-	  jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
-			  dst_buffer[offset_y],
-			  compptr->width_in_blocks);
-	}
+        if (y_crop_blocks + dst_blk_y < comp_height) {
+          /* Row is within the mirrorable area. */
+          dst_row_ptr = dst_buffer[offset_y];
+          src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
+          src_row_ptr += x_crop_blocks;
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+               dst_blk_x++) {
+            dst_ptr = dst_row_ptr[dst_blk_x];
+            src_ptr = src_row_ptr[dst_blk_x];
+            for (i = 0; i < DCTSIZE; i += 2) {
+              /* copy even row */
+              for (j = 0; j < DCTSIZE; j++)
+                *dst_ptr++ = *src_ptr++;
+              /* copy odd row with sign change */
+              for (j = 0; j < DCTSIZE; j++)
+                *dst_ptr++ = - *src_ptr++;
+            }
+          }
+        } else {
+          /* Just copy row verbatim. */
+          jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
+                          dst_buffer[offset_y],
+                          compptr->width_in_blocks);
+        }
       }
     }
   }
@@ -336,9 +337,9 @@
 
 LOCAL(void)
 do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	      JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	      jvirt_barray_ptr *src_coef_arrays,
-	      jvirt_barray_ptr *dst_coef_arrays)
+              JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+              jvirt_barray_ptr *src_coef_arrays,
+              jvirt_barray_ptr *dst_coef_arrays)
 /* Transpose source into destination */
 {
   JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
@@ -357,25 +358,25 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  src_buffer = (*srcinfo->mem->access_virt_barray)
-	    ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	     dst_blk_x + x_crop_blocks,
-	     (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    src_ptr = src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks];
-	    for (i = 0; i < DCTSIZE; i++)
-	      for (j = 0; j < DCTSIZE; j++)
-		dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	  }
-	}
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          src_buffer = (*srcinfo->mem->access_virt_barray)
+            ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+             dst_blk_x + x_crop_blocks,
+             (JDIMENSION) compptr->h_samp_factor, FALSE);
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            src_ptr = src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks];
+            for (i = 0; i < DCTSIZE; i++)
+              for (j = 0; j < DCTSIZE; j++)
+                dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+          }
+        }
       }
     }
   }
@@ -384,9 +385,9 @@
 
 LOCAL(void)
 do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	   JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	   jvirt_barray_ptr *src_coef_arrays,
-	   jvirt_barray_ptr *dst_coef_arrays)
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           jvirt_barray_ptr *dst_coef_arrays)
 /* 90 degree rotation is equivalent to
  *   1. Transposing the image;
  *   2. Horizontal mirroring.
@@ -413,50 +414,50 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  if (x_crop_blocks + dst_blk_x < comp_width) {
-	    /* Block is within the mirrorable area. */
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       comp_width - x_crop_blocks - dst_blk_x -
-	       (JDIMENSION) compptr->h_samp_factor,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  } else {
-	    /* Edge blocks are transposed but not mirrored. */
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       dst_blk_x + x_crop_blocks,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  }
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    if (x_crop_blocks + dst_blk_x < comp_width) {
-	      /* Block is within the mirrorable area. */
-	      src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
-		[dst_blk_y + offset_y + y_crop_blocks];
-	      for (i = 0; i < DCTSIZE; i++) {
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		i++;
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-	      }
-	    } else {
-	      /* Edge blocks are transposed but not mirrored. */
-	      src_ptr = src_buffer[offset_x]
-		[dst_blk_y + offset_y + y_crop_blocks];
-	      for (i = 0; i < DCTSIZE; i++)
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	    }
-	  }
-	}
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          if (x_crop_blocks + dst_blk_x < comp_width) {
+            /* Block is within the mirrorable area. */
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+               comp_width - x_crop_blocks - dst_blk_x -
+               (JDIMENSION) compptr->h_samp_factor,
+               (JDIMENSION) compptr->h_samp_factor, FALSE);
+          } else {
+            /* Edge blocks are transposed but not mirrored. */
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+               dst_blk_x + x_crop_blocks,
+               (JDIMENSION) compptr->h_samp_factor, FALSE);
+          }
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            if (x_crop_blocks + dst_blk_x < comp_width) {
+              /* Block is within the mirrorable area. */
+              src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
+                [dst_blk_y + offset_y + y_crop_blocks];
+              for (i = 0; i < DCTSIZE; i++) {
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                i++;
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+              }
+            } else {
+              /* Edge blocks are transposed but not mirrored. */
+              src_ptr = src_buffer[offset_x]
+                [dst_blk_y + offset_y + y_crop_blocks];
+              for (i = 0; i < DCTSIZE; i++)
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+            }
+          }
+        }
       }
     }
   }
@@ -465,9 +466,9 @@
 
 LOCAL(void)
 do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	    JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	    jvirt_barray_ptr *src_coef_arrays,
-	    jvirt_barray_ptr *dst_coef_arrays)
+            JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+            jvirt_barray_ptr *src_coef_arrays,
+            jvirt_barray_ptr *dst_coef_arrays)
 /* 270 degree rotation is equivalent to
  *   1. Horizontal mirroring;
  *   2. Transposing the image.
@@ -494,40 +495,40 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  src_buffer = (*srcinfo->mem->access_virt_barray)
-	    ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	     dst_blk_x + x_crop_blocks,
-	     (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    if (y_crop_blocks + dst_blk_y < comp_height) {
-	      /* Block is within the mirrorable area. */
-	      src_ptr = src_buffer[offset_x]
-		[comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
-	      for (i = 0; i < DCTSIZE; i++) {
-		for (j = 0; j < DCTSIZE; j++) {
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		  j++;
-		  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		}
-	      }
-	    } else {
-	      /* Edge blocks are transposed but not mirrored. */
-	      src_ptr = src_buffer[offset_x]
-		[dst_blk_y + offset_y + y_crop_blocks];
-	      for (i = 0; i < DCTSIZE; i++)
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	    }
-	  }
-	}
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          src_buffer = (*srcinfo->mem->access_virt_barray)
+            ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+             dst_blk_x + x_crop_blocks,
+             (JDIMENSION) compptr->h_samp_factor, FALSE);
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            if (y_crop_blocks + dst_blk_y < comp_height) {
+              /* Block is within the mirrorable area. */
+              src_ptr = src_buffer[offset_x]
+                [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
+              for (i = 0; i < DCTSIZE; i++) {
+                for (j = 0; j < DCTSIZE; j++) {
+                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                  j++;
+                  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                }
+              }
+            } else {
+              /* Edge blocks are transposed but not mirrored. */
+              src_ptr = src_buffer[offset_x]
+                [dst_blk_y + offset_y + y_crop_blocks];
+              for (i = 0; i < DCTSIZE; i++)
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+            }
+          }
+        }
       }
     }
   }
@@ -536,9 +537,9 @@
 
 LOCAL(void)
 do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	    JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	    jvirt_barray_ptr *src_coef_arrays,
-	    jvirt_barray_ptr *dst_coef_arrays)
+            JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+            jvirt_barray_ptr *src_coef_arrays,
+            jvirt_barray_ptr *dst_coef_arrays)
 /* 180 degree rotation is equivalent to
  *   1. Vertical mirroring;
  *   2. Horizontal mirroring.
@@ -565,77 +566,77 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       if (y_crop_blocks + dst_blk_y < comp_height) {
-	/* Row is within the vertically mirrorable area. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   comp_height - y_crop_blocks - dst_blk_y -
-	   (JDIMENSION) compptr->v_samp_factor,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
+        /* Row is within the vertically mirrorable area. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+           comp_height - y_crop_blocks - dst_blk_y -
+           (JDIMENSION) compptr->v_samp_factor,
+           (JDIMENSION) compptr->v_samp_factor, FALSE);
       } else {
-	/* Bottom-edge rows are only mirrored horizontally. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   dst_blk_y + y_crop_blocks,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
+        /* Bottom-edge rows are only mirrored horizontally. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks,
+           (JDIMENSION) compptr->v_samp_factor, FALSE);
       }
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	dst_row_ptr = dst_buffer[offset_y];
-	if (y_crop_blocks + dst_blk_y < comp_height) {
-	  /* Row is within the mirrorable area. */
-	  src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
-	  for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
-	    dst_ptr = dst_row_ptr[dst_blk_x];
-	    if (x_crop_blocks + dst_blk_x < comp_width) {
-	      /* Process the blocks that can be mirrored both ways. */
-	      src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
-	      for (i = 0; i < DCTSIZE; i += 2) {
-		/* For even row, negate every odd column. */
-		for (j = 0; j < DCTSIZE; j += 2) {
-		  *dst_ptr++ = *src_ptr++;
-		  *dst_ptr++ = - *src_ptr++;
-		}
-		/* For odd row, negate every even column. */
-		for (j = 0; j < DCTSIZE; j += 2) {
-		  *dst_ptr++ = - *src_ptr++;
-		  *dst_ptr++ = *src_ptr++;
-		}
-	      }
-	    } else {
-	      /* Any remaining right-edge blocks are only mirrored vertically. */
-	      src_ptr = src_row_ptr[x_crop_blocks + dst_blk_x];
-	      for (i = 0; i < DCTSIZE; i += 2) {
-		for (j = 0; j < DCTSIZE; j++)
-		  *dst_ptr++ = *src_ptr++;
-		for (j = 0; j < DCTSIZE; j++)
-		  *dst_ptr++ = - *src_ptr++;
-	      }
-	    }
-	  }
-	} else {
-	  /* Remaining rows are just mirrored horizontally. */
-	  src_row_ptr = src_buffer[offset_y];
-	  for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
-	    if (x_crop_blocks + dst_blk_x < comp_width) {
-	      /* Process the blocks that can be mirrored. */
-	      dst_ptr = dst_row_ptr[dst_blk_x];
-	      src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
-	      for (i = 0; i < DCTSIZE2; i += 2) {
-		*dst_ptr++ = *src_ptr++;
-		*dst_ptr++ = - *src_ptr++;
-	      }
-	    } else {
-	      /* Any remaining right-edge blocks are only copied. */
-	      jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
-			      dst_row_ptr + dst_blk_x,
-			      (JDIMENSION) 1);
-	    }
-	  }
-	}
+        dst_row_ptr = dst_buffer[offset_y];
+        if (y_crop_blocks + dst_blk_y < comp_height) {
+          /* Row is within the mirrorable area. */
+          src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+            dst_ptr = dst_row_ptr[dst_blk_x];
+            if (x_crop_blocks + dst_blk_x < comp_width) {
+              /* Process the blocks that can be mirrored both ways. */
+              src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+              for (i = 0; i < DCTSIZE; i += 2) {
+                /* For even row, negate every odd column. */
+                for (j = 0; j < DCTSIZE; j += 2) {
+                  *dst_ptr++ = *src_ptr++;
+                  *dst_ptr++ = - *src_ptr++;
+                }
+                /* For odd row, negate every even column. */
+                for (j = 0; j < DCTSIZE; j += 2) {
+                  *dst_ptr++ = - *src_ptr++;
+                  *dst_ptr++ = *src_ptr++;
+                }
+              }
+            } else {
+              /* Any remaining right-edge blocks are only mirrored vertically. */
+              src_ptr = src_row_ptr[x_crop_blocks + dst_blk_x];
+              for (i = 0; i < DCTSIZE; i += 2) {
+                for (j = 0; j < DCTSIZE; j++)
+                  *dst_ptr++ = *src_ptr++;
+                for (j = 0; j < DCTSIZE; j++)
+                  *dst_ptr++ = - *src_ptr++;
+              }
+            }
+          }
+        } else {
+          /* Remaining rows are just mirrored horizontally. */
+          src_row_ptr = src_buffer[offset_y];
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+            if (x_crop_blocks + dst_blk_x < comp_width) {
+              /* Process the blocks that can be mirrored. */
+              dst_ptr = dst_row_ptr[dst_blk_x];
+              src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+              for (i = 0; i < DCTSIZE2; i += 2) {
+                *dst_ptr++ = *src_ptr++;
+                *dst_ptr++ = - *src_ptr++;
+              }
+            } else {
+              /* Any remaining right-edge blocks are only copied. */
+              jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
+                              dst_row_ptr + dst_blk_x,
+                              (JDIMENSION) 1);
+            }
+          }
+        }
       }
     }
   }
@@ -644,9 +645,9 @@
 
 LOCAL(void)
 do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	       JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	       jvirt_barray_ptr *src_coef_arrays,
-	       jvirt_barray_ptr *dst_coef_arrays)
+               JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+               jvirt_barray_ptr *src_coef_arrays,
+               jvirt_barray_ptr *dst_coef_arrays)
 /* Transverse transpose is equivalent to
  *   1. 180 degree rotation;
  *   2. Transposition;
@@ -676,81 +677,81 @@
     x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
     y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
+         dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION) compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  if (x_crop_blocks + dst_blk_x < comp_width) {
-	    /* Block is within the mirrorable area. */
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       comp_width - x_crop_blocks - dst_blk_x -
-	       (JDIMENSION) compptr->h_samp_factor,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  } else {
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       dst_blk_x + x_crop_blocks,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  }
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    if (y_crop_blocks + dst_blk_y < comp_height) {
-	      if (x_crop_blocks + dst_blk_x < comp_width) {
-		/* Block is within the mirrorable area. */
-		src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
-		  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
-		for (i = 0; i < DCTSIZE; i++) {
-		  for (j = 0; j < DCTSIZE; j++) {
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		    j++;
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		  }
-		  i++;
-		  for (j = 0; j < DCTSIZE; j++) {
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		    j++;
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		  }
-		}
-	      } else {
-		/* Right-edge blocks are mirrored in y only */
-		src_ptr = src_buffer[offset_x]
-		  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
-		for (i = 0; i < DCTSIZE; i++) {
-		  for (j = 0; j < DCTSIZE; j++) {
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		    j++;
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		  }
-		}
-	      }
-	    } else {
-	      if (x_crop_blocks + dst_blk_x < comp_width) {
-		/* Bottom-edge blocks are mirrored in x only */
-		src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
-		  [dst_blk_y + offset_y + y_crop_blocks];
-		for (i = 0; i < DCTSIZE; i++) {
-		  for (j = 0; j < DCTSIZE; j++)
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		  i++;
-		  for (j = 0; j < DCTSIZE; j++)
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		}
-	      } else {
-		/* At lower right corner, just transpose, no mirroring */
-		src_ptr = src_buffer[offset_x]
-		  [dst_blk_y + offset_y + y_crop_blocks];
-		for (i = 0; i < DCTSIZE; i++)
-		  for (j = 0; j < DCTSIZE; j++)
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	      }
-	    }
-	  }
-	}
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          if (x_crop_blocks + dst_blk_x < comp_width) {
+            /* Block is within the mirrorable area. */
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+               comp_width - x_crop_blocks - dst_blk_x -
+               (JDIMENSION) compptr->h_samp_factor,
+               (JDIMENSION) compptr->h_samp_factor, FALSE);
+          } else {
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+               dst_blk_x + x_crop_blocks,
+               (JDIMENSION) compptr->h_samp_factor, FALSE);
+          }
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            if (y_crop_blocks + dst_blk_y < comp_height) {
+              if (x_crop_blocks + dst_blk_x < comp_width) {
+                /* Block is within the mirrorable area. */
+                src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
+                  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
+                for (i = 0; i < DCTSIZE; i++) {
+                  for (j = 0; j < DCTSIZE; j++) {
+                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                    j++;
+                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                  }
+                  i++;
+                  for (j = 0; j < DCTSIZE; j++) {
+                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                    j++;
+                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                  }
+                }
+              } else {
+                /* Right-edge blocks are mirrored in y only */
+                src_ptr = src_buffer[offset_x]
+                  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
+                for (i = 0; i < DCTSIZE; i++) {
+                  for (j = 0; j < DCTSIZE; j++) {
+                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                    j++;
+                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                  }
+                }
+              }
+            } else {
+              if (x_crop_blocks + dst_blk_x < comp_width) {
+                /* Bottom-edge blocks are mirrored in x only */
+                src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
+                  [dst_blk_y + offset_y + y_crop_blocks];
+                for (i = 0; i < DCTSIZE; i++) {
+                  for (j = 0; j < DCTSIZE; j++)
+                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                  i++;
+                  for (j = 0; j < DCTSIZE; j++)
+                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                }
+              } else {
+                /* At lower right corner, just transpose, no mirroring */
+                src_ptr = src_buffer[offset_x]
+                  [dst_blk_y + offset_y + y_crop_blocks];
+                for (i = 0; i < DCTSIZE; i++)
+                  for (j = 0; j < DCTSIZE; j++)
+                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+              }
+            }
+          }
+        }
       }
     }
   }
@@ -763,9 +764,9 @@
  */
 
 LOCAL(boolean)
-jt_read_integer (const char ** strptr, JDIMENSION * result)
+jt_read_integer (const char **strptr, JDIMENSION *result)
 {
-  const char * ptr = *strptr;
+  const char *ptr = *strptr;
   JDIMENSION val = 0;
 
   for (; isdigit(*ptr); ptr++) {
@@ -773,7 +774,7 @@
   }
   *result = val;
   if (ptr == *strptr)
-    return FALSE;		/* oops, no digits */
+    return FALSE;               /* oops, no digits */
   *strptr = ptr;
   return TRUE;
 }
@@ -783,7 +784,7 @@
  * The routine returns TRUE if the spec string is valid, FALSE if not.
  *
  * The crop spec string should have the format
- *	<width>[f]x<height>[f]{+-}<xoffset>{+-}<yoffset>
+ *      <width>[f]x<height>[f]{+-}<xoffset>{+-}<yoffset>
  * where width, height, xoffset, and yoffset are unsigned integers.
  * Each of the elements can be omitted to indicate a default value.
  * (A weakness of this style is that it is not possible to omit xoffset
@@ -888,7 +889,7 @@
 
 GLOBAL(boolean)
 jtransform_request_workspace (j_decompress_ptr srcinfo,
-			      jpeg_transform_info *info)
+                              jpeg_transform_info *info)
 {
   jvirt_barray_ptr *coef_arrays;
   boolean need_workspace, transpose_it;
@@ -921,18 +922,18 @@
   if (info->perfect) {
     if (info->num_components == 1) {
       if (!jtransform_perfect_transform(srcinfo->output_width,
-	  srcinfo->output_height,
-	  srcinfo->_min_DCT_h_scaled_size,
-	  srcinfo->_min_DCT_v_scaled_size,
-	  info->transform))
-	return FALSE;
+          srcinfo->output_height,
+          srcinfo->_min_DCT_h_scaled_size,
+          srcinfo->_min_DCT_v_scaled_size,
+          info->transform))
+        return FALSE;
     } else {
       if (!jtransform_perfect_transform(srcinfo->output_width,
-	  srcinfo->output_height,
-	  srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size,
-	  srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size,
-	  info->transform))
-	return FALSE;
+          srcinfo->output_height,
+          srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size,
+          srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size,
+          info->transform))
+        return FALSE;
     }
   }
 
@@ -953,9 +954,9 @@
       info->iMCU_sample_height = srcinfo->_min_DCT_h_scaled_size;
     } else {
       info->iMCU_sample_width =
-	srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size;
+        srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size;
       info->iMCU_sample_height =
-	srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size;
+        srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size;
     }
     break;
   default:
@@ -966,9 +967,9 @@
       info->iMCU_sample_height = srcinfo->_min_DCT_v_scaled_size;
     } else {
       info->iMCU_sample_width =
-	srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size;
+        srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size;
       info->iMCU_sample_height =
-	srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size;
+        srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size;
     }
     break;
   }
@@ -979,11 +980,11 @@
   if (info->crop) {
     /* Insert default values for unset crop parameters */
     if (info->crop_xoffset_set == JCROP_UNSET)
-      info->crop_xoffset = 0;	/* default to +0 */
+      info->crop_xoffset = 0;   /* default to +0 */
     if (info->crop_yoffset_set == JCROP_UNSET)
-      info->crop_yoffset = 0;	/* default to +0 */
+      info->crop_yoffset = 0;   /* default to +0 */
     if (info->crop_xoffset >= info->output_width ||
-	info->crop_yoffset >= info->output_height)
+        info->crop_yoffset >= info->output_height)
       ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
     if (info->crop_width_set == JCROP_UNSET)
       info->crop_width = info->output_width - info->crop_xoffset;
@@ -991,9 +992,9 @@
       info->crop_height = info->output_height - info->crop_yoffset;
     /* Ensure parameters are valid */
     if (info->crop_width <= 0 || info->crop_width > info->output_width ||
-	info->crop_height <= 0 || info->crop_height > info->output_height ||
-	info->crop_xoffset > info->output_width - info->crop_width ||
-	info->crop_yoffset > info->output_height - info->crop_height)
+        info->crop_height <= 0 || info->crop_height > info->output_height ||
+        info->crop_xoffset > info->output_width - info->crop_width ||
+        info->crop_yoffset > info->output_height - info->crop_height)
       ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
     /* Convert negative crop offsets into regular offsets */
     if (info->crop_xoffset_set == JCROP_NEG)
@@ -1093,30 +1094,30 @@
   if (need_workspace) {
     coef_arrays = (jvirt_barray_ptr *)
       (*srcinfo->mem->alloc_small) ((j_common_ptr) srcinfo, JPOOL_IMAGE,
-		SIZEOF(jvirt_barray_ptr) * info->num_components);
+                sizeof(jvirt_barray_ptr) * info->num_components);
     width_in_iMCUs = (JDIMENSION)
       jdiv_round_up((long) info->output_width,
-		    (long) info->iMCU_sample_width);
+                    (long) info->iMCU_sample_width);
     height_in_iMCUs = (JDIMENSION)
       jdiv_round_up((long) info->output_height,
-		    (long) info->iMCU_sample_height);
+                    (long) info->iMCU_sample_height);
     for (ci = 0; ci < info->num_components; ci++) {
       compptr = srcinfo->comp_info + ci;
       if (info->num_components == 1) {
-	/* we're going to force samp factors to 1x1 in this case */
-	h_samp_factor = v_samp_factor = 1;
+        /* we're going to force samp factors to 1x1 in this case */
+        h_samp_factor = v_samp_factor = 1;
       } else if (transpose_it) {
-	h_samp_factor = compptr->v_samp_factor;
-	v_samp_factor = compptr->h_samp_factor;
+        h_samp_factor = compptr->v_samp_factor;
+        v_samp_factor = compptr->h_samp_factor;
       } else {
-	h_samp_factor = compptr->h_samp_factor;
-	v_samp_factor = compptr->v_samp_factor;
+        h_samp_factor = compptr->h_samp_factor;
+        v_samp_factor = compptr->v_samp_factor;
       }
       width_in_blocks = width_in_iMCUs * h_samp_factor;
       height_in_blocks = height_in_iMCUs * v_samp_factor;
       coef_arrays[ci] = (*srcinfo->mem->request_virt_barray)
-	((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE,
-	 width_in_blocks, height_in_blocks, (JDIMENSION) v_samp_factor);
+        ((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE,
+         width_in_blocks, height_in_blocks, (JDIMENSION) v_samp_factor);
     }
     info->workspace_coef_arrays = coef_arrays;
   } else
@@ -1160,11 +1161,11 @@
     qtblptr = dstinfo->quant_tbl_ptrs[tblno];
     if (qtblptr != NULL) {
       for (i = 0; i < DCTSIZE; i++) {
-	for (j = 0; j < i; j++) {
-	  qtemp = qtblptr->quantval[i*DCTSIZE+j];
-	  qtblptr->quantval[i*DCTSIZE+j] = qtblptr->quantval[j*DCTSIZE+i];
-	  qtblptr->quantval[j*DCTSIZE+i] = qtemp;
-	}
+        for (j = 0; j < i; j++) {
+          qtemp = qtblptr->quantval[i*DCTSIZE+j];
+          qtblptr->quantval[i*DCTSIZE+j] = qtblptr->quantval[j*DCTSIZE+i];
+          qtblptr->quantval[j*DCTSIZE+i] = qtemp;
+        }
       }
     }
   }
@@ -1178,8 +1179,8 @@
 
 #if JPEG_LIB_VERSION >= 70
 LOCAL(void)
-adjust_exif_parameters (JOCTET FAR * data, unsigned int length,
-			JDIMENSION new_width, JDIMENSION new_height)
+adjust_exif_parameters (JOCTET *data, unsigned int length,
+                        JDIMENSION new_width, JDIMENSION new_height)
 {
   boolean is_motorola; /* Flag for byte order */
   unsigned int number_of_tags, tagnum;
@@ -1296,31 +1297,31 @@
     }
     if (tagnum == 0xA002 || tagnum == 0xA003) {
       if (tagnum == 0xA002)
-	new_value = new_width; /* ExifImageWidth Tag */
+        new_value = new_width; /* ExifImageWidth Tag */
       else
-	new_value = new_height; /* ExifImageHeight Tag */
+        new_value = new_height; /* ExifImageHeight Tag */
       if (is_motorola) {
-	data[offset+2] = 0; /* Format = unsigned long (4 octets) */
-	data[offset+3] = 4;
-	data[offset+4] = 0; /* Number Of Components = 1 */
-	data[offset+5] = 0;
-	data[offset+6] = 0;
-	data[offset+7] = 1;
-	data[offset+8] = 0;
-	data[offset+9] = 0;
-	data[offset+10] = (JOCTET)((new_value >> 8) & 0xFF);
-	data[offset+11] = (JOCTET)(new_value & 0xFF);
+        data[offset+2] = 0; /* Format = unsigned long (4 octets) */
+        data[offset+3] = 4;
+        data[offset+4] = 0; /* Number Of Components = 1 */
+        data[offset+5] = 0;
+        data[offset+6] = 0;
+        data[offset+7] = 1;
+        data[offset+8] = 0;
+        data[offset+9] = 0;
+        data[offset+10] = (JOCTET)((new_value >> 8) & 0xFF);
+        data[offset+11] = (JOCTET)(new_value & 0xFF);
       } else {
-	data[offset+2] = 4; /* Format = unsigned long (4 octets) */
-	data[offset+3] = 0;
-	data[offset+4] = 1; /* Number Of Components = 1 */
-	data[offset+5] = 0;
-	data[offset+6] = 0;
-	data[offset+7] = 0;
-	data[offset+8] = (JOCTET)(new_value & 0xFF);
-	data[offset+9] = (JOCTET)((new_value >> 8) & 0xFF);
-	data[offset+10] = 0;
-	data[offset+11] = 0;
+        data[offset+2] = 4; /* Format = unsigned long (4 octets) */
+        data[offset+3] = 0;
+        data[offset+4] = 1; /* Number Of Components = 1 */
+        data[offset+5] = 0;
+        data[offset+6] = 0;
+        data[offset+7] = 0;
+        data[offset+8] = (JOCTET)(new_value & 0xFF);
+        data[offset+9] = (JOCTET)((new_value >> 8) & 0xFF);
+        data[offset+10] = 0;
+        data[offset+11] = 0;
       }
     }
     offset += 12;
@@ -1342,9 +1343,9 @@
 
 GLOBAL(jvirt_barray_ptr *)
 jtransform_adjust_parameters (j_decompress_ptr srcinfo,
-			      j_compress_ptr dstinfo,
-			      jvirt_barray_ptr *src_coef_arrays,
-			      jpeg_transform_info *info)
+                              j_compress_ptr dstinfo,
+                              jvirt_barray_ptr *src_coef_arrays,
+                              jpeg_transform_info *info)
 {
   /* If force-to-grayscale is requested, adjust destination parameters */
   if (info->force_grayscale) {
@@ -1354,11 +1355,11 @@
      * isn't worth extra code space.  But we check it to avoid crashing.)
      */
     if (((dstinfo->jpeg_color_space == JCS_YCbCr &&
-	  dstinfo->num_components == 3) ||
-	 (dstinfo->jpeg_color_space == JCS_GRAYSCALE &&
-	  dstinfo->num_components == 1)) &&
-	srcinfo->comp_info[0].h_samp_factor == srcinfo->max_h_samp_factor &&
-	srcinfo->comp_info[0].v_samp_factor == srcinfo->max_v_samp_factor) {
+          dstinfo->num_components == 3) ||
+         (dstinfo->jpeg_color_space == JCS_GRAYSCALE &&
+          dstinfo->num_components == 1)) &&
+        srcinfo->comp_info[0].h_samp_factor == srcinfo->max_h_samp_factor &&
+        srcinfo->comp_info[0].v_samp_factor == srcinfo->max_v_samp_factor) {
       /* We use jpeg_set_colorspace to make sure subsidiary settings get fixed
        * properly.  Among other things, it sets the target h_samp_factor &
        * v_samp_factor to 1, which typically won't match the source.
@@ -1423,11 +1424,11 @@
 #if JPEG_LIB_VERSION >= 70
     /* Adjust Exif image parameters */
     if (dstinfo->jpeg_width != srcinfo->image_width ||
-	dstinfo->jpeg_height != srcinfo->image_height)
+        dstinfo->jpeg_height != srcinfo->image_height)
       /* Align data segment to start of TIFF structure for parsing */
       adjust_exif_parameters(srcinfo->marker_list->data + 6,
-	srcinfo->marker_list->data_length - 6,
-	dstinfo->jpeg_width, dstinfo->jpeg_height);
+        srcinfo->marker_list->data_length - 6,
+        dstinfo->jpeg_width, dstinfo->jpeg_height);
 #endif
   }
 
@@ -1449,9 +1450,9 @@
 
 GLOBAL(void)
 jtransform_execute_transform (j_decompress_ptr srcinfo,
-			      j_compress_ptr dstinfo,
-			      jvirt_barray_ptr *src_coef_arrays,
-			      jpeg_transform_info *info)
+                              j_compress_ptr dstinfo,
+                              jvirt_barray_ptr *src_coef_arrays,
+                              jpeg_transform_info *info)
 {
   jvirt_barray_ptr *dst_coef_arrays = info->workspace_coef_arrays;
 
@@ -1462,39 +1463,39 @@
   case JXFORM_NONE:
     if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
       do_crop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	      src_coef_arrays, dst_coef_arrays);
+              src_coef_arrays, dst_coef_arrays);
     break;
   case JXFORM_FLIP_H:
     if (info->y_crop_offset != 0 || info->slow_hflip)
       do_flip_h(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-		src_coef_arrays, dst_coef_arrays);
+                src_coef_arrays, dst_coef_arrays);
     else
       do_flip_h_no_crop(srcinfo, dstinfo, info->x_crop_offset,
-			src_coef_arrays);
+                        src_coef_arrays);
     break;
   case JXFORM_FLIP_V:
     do_flip_v(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	      src_coef_arrays, dst_coef_arrays);
+              src_coef_arrays, dst_coef_arrays);
     break;
   case JXFORM_TRANSPOSE:
     do_transpose(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-		 src_coef_arrays, dst_coef_arrays);
+                 src_coef_arrays, dst_coef_arrays);
     break;
   case JXFORM_TRANSVERSE:
     do_transverse(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-		  src_coef_arrays, dst_coef_arrays);
+                  src_coef_arrays, dst_coef_arrays);
     break;
   case JXFORM_ROT_90:
     do_rot_90(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	      src_coef_arrays, dst_coef_arrays);
+              src_coef_arrays, dst_coef_arrays);
     break;
   case JXFORM_ROT_180:
     do_rot_180(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	       src_coef_arrays, dst_coef_arrays);
+               src_coef_arrays, dst_coef_arrays);
     break;
   case JXFORM_ROT_270:
     do_rot_270(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	       src_coef_arrays, dst_coef_arrays);
+               src_coef_arrays, dst_coef_arrays);
     break;
   }
 }
@@ -1522,8 +1523,8 @@
 
 GLOBAL(boolean)
 jtransform_perfect_transform(JDIMENSION image_width, JDIMENSION image_height,
-			     int MCU_width, int MCU_height,
-			     JXFORM_CODE transform)
+                             int MCU_width, int MCU_height,
+                             JXFORM_CODE transform)
 {
   boolean result = TRUE; /* initialize TRUE */
 
@@ -1586,7 +1587,7 @@
 
 GLOBAL(void)
 jcopy_markers_execute (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-		       JCOPY_OPTION option)
+                       JCOPY_OPTION option)
 {
   jpeg_saved_marker_ptr marker;
 
@@ -1597,34 +1598,24 @@
    */
   for (marker = srcinfo->marker_list; marker != NULL; marker = marker->next) {
     if (dstinfo->write_JFIF_header &&
-	marker->marker == JPEG_APP0 &&
-	marker->data_length >= 5 &&
-	GETJOCTET(marker->data[0]) == 0x4A &&
-	GETJOCTET(marker->data[1]) == 0x46 &&
-	GETJOCTET(marker->data[2]) == 0x49 &&
-	GETJOCTET(marker->data[3]) == 0x46 &&
-	GETJOCTET(marker->data[4]) == 0)
-      continue;			/* reject duplicate JFIF */
+        marker->marker == JPEG_APP0 &&
+        marker->data_length >= 5 &&
+        GETJOCTET(marker->data[0]) == 0x4A &&
+        GETJOCTET(marker->data[1]) == 0x46 &&
+        GETJOCTET(marker->data[2]) == 0x49 &&
+        GETJOCTET(marker->data[3]) == 0x46 &&
+        GETJOCTET(marker->data[4]) == 0)
+      continue;                 /* reject duplicate JFIF */
     if (dstinfo->write_Adobe_marker &&
-	marker->marker == JPEG_APP0+14 &&
-	marker->data_length >= 5 &&
-	GETJOCTET(marker->data[0]) == 0x41 &&
-	GETJOCTET(marker->data[1]) == 0x64 &&
-	GETJOCTET(marker->data[2]) == 0x6F &&
-	GETJOCTET(marker->data[3]) == 0x62 &&
-	GETJOCTET(marker->data[4]) == 0x65)
-      continue;			/* reject duplicate Adobe */
-#ifdef NEED_FAR_POINTERS
-    /* We could use jpeg_write_marker if the data weren't FAR... */
-    {
-      unsigned int i;
-      jpeg_write_m_header(dstinfo, marker->marker, marker->data_length);
-      for (i = 0; i < marker->data_length; i++)
-	jpeg_write_m_byte(dstinfo, marker->data[i]);
-    }
-#else
+        marker->marker == JPEG_APP0+14 &&
+        marker->data_length >= 5 &&
+        GETJOCTET(marker->data[0]) == 0x41 &&
+        GETJOCTET(marker->data[1]) == 0x64 &&
+        GETJOCTET(marker->data[2]) == 0x6F &&
+        GETJOCTET(marker->data[3]) == 0x62 &&
+        GETJOCTET(marker->data[4]) == 0x65)
+      continue;                 /* reject duplicate Adobe */
     jpeg_write_marker(dstinfo, marker->marker,
-		      marker->data, marker->data_length);
-#endif
+                      marker->data, marker->data_length);
   }
 }
diff --git a/transupp.h b/transupp.h
index cfbaca4..bf3118a 100644
--- a/transupp.h
+++ b/transupp.h
@@ -1,9 +1,12 @@
 /*
  * transupp.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains declarations for image transformation routines and
  * other utility code used by the jpegtran sample application.  These are
@@ -19,7 +22,7 @@
 
 /* If you happen not to want the image transform support, disable it here */
 #ifndef TRANSFORMS_SUPPORTED
-#define TRANSFORMS_SUPPORTED 1		/* 0 disables transform code */
+#define TRANSFORMS_SUPPORTED 1          /* 0 disables transform code */
 #endif
 
 /*
@@ -77,32 +80,19 @@
  */
 
 
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jtransform_parse_crop_spec	jTrParCrop
-#define jtransform_request_workspace	jTrRequest
-#define jtransform_adjust_parameters	jTrAdjust
-#define jtransform_execute_transform	jTrExec
-#define jtransform_perfect_transform	jTrPerfect
-#define jcopy_markers_setup		jCMrkSetup
-#define jcopy_markers_execute		jCMrkExec
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
 /*
  * Codes for supported types of image transformations.
  */
 
 typedef enum {
-	JXFORM_NONE,		/* no transformation */
-	JXFORM_FLIP_H,		/* horizontal flip */
-	JXFORM_FLIP_V,		/* vertical flip */
-	JXFORM_TRANSPOSE,	/* transpose across UL-to-LR axis */
-	JXFORM_TRANSVERSE,	/* transpose across UR-to-LL axis */
-	JXFORM_ROT_90,		/* 90-degree clockwise rotation */
-	JXFORM_ROT_180,		/* 180-degree rotation */
-	JXFORM_ROT_270		/* 270-degree clockwise (or 90 ccw) */
+  JXFORM_NONE,            /* no transformation */
+  JXFORM_FLIP_H,          /* horizontal flip */
+  JXFORM_FLIP_V,          /* vertical flip */
+  JXFORM_TRANSPOSE,       /* transpose across UL-to-LR axis */
+  JXFORM_TRANSVERSE,      /* transpose across UR-to-LL axis */
+  JXFORM_ROT_90,          /* 90-degree clockwise rotation */
+  JXFORM_ROT_180,         /* 180-degree rotation */
+  JXFORM_ROT_270          /* 270-degree clockwise (or 90 ccw) */
 } JXFORM_CODE;
 
 /*
@@ -112,10 +102,10 @@
  */
 
 typedef enum {
-        JCROP_UNSET,
-        JCROP_POS,
-        JCROP_NEG,
-        JCROP_FORCE
+  JCROP_UNSET,
+  JCROP_POS,
+  JCROP_NEG,
+  JCROP_FORCE
 } JCROP_CODE;
 
 /*
@@ -126,11 +116,11 @@
 
 typedef struct {
   /* Options: set by caller */
-  JXFORM_CODE transform;	/* image transform operator */
-  boolean perfect;		/* if TRUE, fail if partial MCUs are requested */
-  boolean trim;			/* if TRUE, trim partial MCUs as needed */
-  boolean force_grayscale;	/* if TRUE, convert color image to grayscale */
-  boolean crop;			/* if TRUE, crop source image */
+  JXFORM_CODE transform;        /* image transform operator */
+  boolean perfect;              /* if TRUE, fail if partial MCUs are requested */
+  boolean trim;                 /* if TRUE, trim partial MCUs as needed */
+  boolean force_grayscale;      /* if TRUE, convert color image to grayscale */
+  boolean crop;                 /* if TRUE, crop source image */
   boolean slow_hflip;  /* For best performance, the JXFORM_FLIP_H transform
                           normally modifies the source coefficients in place.
                           Setting this to TRUE will instead use a slower,
@@ -142,23 +132,23 @@
   /* Crop parameters: application need not set these unless crop is TRUE.
    * These can be filled in by jtransform_parse_crop_spec().
    */
-  JDIMENSION crop_width;	/* Width of selected region */
-  JCROP_CODE crop_width_set;	/* (forced disables adjustment) */
-  JDIMENSION crop_height;	/* Height of selected region */
-  JCROP_CODE crop_height_set;	/* (forced disables adjustment) */
-  JDIMENSION crop_xoffset;	/* X offset of selected region */
-  JCROP_CODE crop_xoffset_set;	/* (negative measures from right edge) */
-  JDIMENSION crop_yoffset;	/* Y offset of selected region */
-  JCROP_CODE crop_yoffset_set;	/* (negative measures from bottom edge) */
+  JDIMENSION crop_width;        /* Width of selected region */
+  JCROP_CODE crop_width_set;    /* (forced disables adjustment) */
+  JDIMENSION crop_height;       /* Height of selected region */
+  JCROP_CODE crop_height_set;   /* (forced disables adjustment) */
+  JDIMENSION crop_xoffset;      /* X offset of selected region */
+  JCROP_CODE crop_xoffset_set;  /* (negative measures from right edge) */
+  JDIMENSION crop_yoffset;      /* Y offset of selected region */
+  JCROP_CODE crop_yoffset_set;  /* (negative measures from bottom edge) */
 
   /* Internal workspace: caller should not touch these */
-  int num_components;		/* # of components in workspace */
-  jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */
-  JDIMENSION output_width;	/* cropped destination dimensions */
+  int num_components;           /* # of components in workspace */
+  jvirt_barray_ptr *workspace_coef_arrays; /* workspace for transformations */
+  JDIMENSION output_width;      /* cropped destination dimensions */
   JDIMENSION output_height;
-  JDIMENSION x_crop_offset;	/* destination crop offsets measured in iMCUs */
+  JDIMENSION x_crop_offset;     /* destination crop offsets measured in iMCUs */
   JDIMENSION y_crop_offset;
-  int iMCU_sample_width;	/* destination iMCU size */
+  int iMCU_sample_width;        /* destination iMCU size */
   int iMCU_sample_height;
 } jpeg_transform_info;
 
@@ -167,34 +157,31 @@
 
 /* Parse a crop specification (written in X11 geometry style) */
 EXTERN(boolean) jtransform_parse_crop_spec
-	JPP((jpeg_transform_info *info, const char *spec));
+        (jpeg_transform_info *info, const char *spec);
 /* Request any required workspace */
 EXTERN(boolean) jtransform_request_workspace
-	JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info));
+        (j_decompress_ptr srcinfo, jpeg_transform_info *info);
 /* Adjust output image parameters */
 EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters
-	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	     jvirt_barray_ptr *src_coef_arrays,
-	     jpeg_transform_info *info));
+        (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+         jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info);
 /* Execute the actual transformation, if any */
 EXTERN(void) jtransform_execute_transform
-	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	     jvirt_barray_ptr *src_coef_arrays,
-	     jpeg_transform_info *info));
+        (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+         jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info);
 /* Determine whether lossless transformation is perfectly
  * possible for a specified image and transformation.
  */
 EXTERN(boolean) jtransform_perfect_transform
-	JPP((JDIMENSION image_width, JDIMENSION image_height,
-	     int MCU_width, int MCU_height,
-	     JXFORM_CODE transform));
+        (JDIMENSION image_width, JDIMENSION image_height, int MCU_width,
+         int MCU_height, JXFORM_CODE transform);
 
 /* jtransform_execute_transform used to be called
  * jtransform_execute_transformation, but some compilers complain about
  * routine names that long.  This macro is here to avoid breaking any
  * old source code that uses the original name...
  */
-#define jtransform_execute_transformation	jtransform_execute_transform
+#define jtransform_execute_transformation       jtransform_execute_transform
 
 #endif /* TRANSFORMS_SUPPORTED */
 
@@ -204,17 +191,17 @@
  */
 
 typedef enum {
-	JCOPYOPT_NONE,		/* copy no optional markers */
-	JCOPYOPT_COMMENTS,	/* copy only comment (COM) markers */
-	JCOPYOPT_ALL		/* copy all optional markers */
+  JCOPYOPT_NONE,          /* copy no optional markers */
+  JCOPYOPT_COMMENTS,      /* copy only comment (COM) markers */
+  JCOPYOPT_ALL            /* copy all optional markers */
 } JCOPY_OPTION;
 
-#define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS	/* recommended default */
+#define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS     /* recommended default */
 
 /* Setup decompression object to save desired markers in memory */
 EXTERN(void) jcopy_markers_setup
-	JPP((j_decompress_ptr srcinfo, JCOPY_OPTION option));
+        (j_decompress_ptr srcinfo, JCOPY_OPTION option);
 /* Copy markers saved in the given source object to the destination object */
 EXTERN(void) jcopy_markers_execute
-	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	     JCOPY_OPTION option));
+        (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+         JCOPY_OPTION option);
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index 634bedf..eaba670 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2013 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,46 +37,135 @@
 #include "java/org_libjpegturbo_turbojpeg_TJDecompressor.h"
 #include "java/org_libjpegturbo_turbojpeg_TJ.h"
 
-#define _throw(msg) {  \
-	jclass _exccls=(*env)->FindClass(env, "java/lang/Exception");  \
-	if(!_exccls) goto bailout;  \
+#define PAD(v, p) (((v)+(p)-1)&(~((p)-1)))  /* round v up to a multiple of p (mask form: correct only when p is a power of 2) */
+
+#define _throw(msg, exceptionClass) {  \
+	jclass _exccls=(*env)->FindClass(env, exceptionClass);  \
+	if(!_exccls || (*env)->ExceptionCheck(env)) goto bailout;  \
 	(*env)->ThrowNew(env, _exccls, msg);  \
 	goto bailout;  \
 }
 
-#define bailif0(f) {if(!(f)) {  \
-	char temps[80];  \
-	snprintf(temps, 80, "Unexpected NULL condition in line %d", __LINE__);  \
-	_throw(temps);  \
+#define _throwtj() _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException")
+
+#define _throwarg(msg) _throw(msg, "java/lang/IllegalArgumentException")
+
+#define _throwmem() _throw("Memory allocation failure", "java/lang/OutOfMemoryError")  /* no trailing ';' -- the caller supplies it, as with _throwtj()/_throwarg() */
+
+#define bailif0(f) {if(!(f) || (*env)->ExceptionCheck(env)) {  \
+	goto bailout;  \
 }}
 
 #define gethandle()  \
 	jclass _cls=(*env)->GetObjectClass(env, obj);  \
 	jfieldID _fid;  \
-	if(!_cls) goto bailout;  \
+	if(!_cls || (*env)->ExceptionCheck(env)) goto bailout;  \
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "handle", "J"));  \
-	handle=(tjhandle)(jlong)(*env)->GetLongField(env, obj, _fid);  \
+	handle=(tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);  \
 
+#ifdef _WIN32
+#define setenv(envvar, value, dummy) _putenv_s(envvar, value)
+#endif
+
+#define prop2env(property, envvar)  \
+{  \
+	if((jName=(*env)->NewStringUTF(env, property))!=NULL  \
+		&& (jValue=(*env)->CallStaticObjectMethod(env, cls, mid, jName))!=NULL)  \
+	{  \
+		if((value=(*env)->GetStringUTFChars(env, jValue, 0))!=NULL)  \
+		{  \
+			setenv(envvar, value, 1);  \
+			(*env)->ReleaseStringUTFChars(env, jValue, value);  \
+		}  \
+	}  \
+}
+
+int ProcessSystemProperties(JNIEnv *env)  /* copy turbojpeg.* Java system properties into TJ_* environment variables */
+{
+	jclass cls;  jmethodID mid;  /* java.lang.System and its static getProperty() method */
+	jstring jName, jValue;  /* scratch variables referenced by the prop2env() macro */
+	const char *value;  /* UTF-8 view of jValue, also referenced by prop2env() */
+
+	bailif0(cls=(*env)->FindClass(env, "java/lang/System"));
+	bailif0(mid=(*env)->GetStaticMethodID(env, cls, "getProperty",
+		"(Ljava/lang/String;)Ljava/lang/String;"));
+
+	prop2env("turbojpeg.optimize", "TJ_OPTIMIZE");
+	prop2env("turbojpeg.arithmetic", "TJ_ARITHMETIC");
+	prop2env("turbojpeg.restart", "TJ_RESTART");
+	prop2env("turbojpeg.progressive", "TJ_PROGRESSIVE");
+	return 0;  /* success; a property for which getProperty() yields NULL is skipped by prop2env() */
+
+	bailout:
+	return -1;  /* class/method lookup failed or a Java exception is pending */
+}
+
+/* TurboJPEG 1.2.x: TJ::bufSize() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
 	(JNIEnv *env, jclass cls, jint width, jint height, jint jpegSubsamp)
 {
 	jint retval=(jint)tjBufSize(width, height, jpegSubsamp);
-	if(retval==-1) _throw(tjGetErrorStr());
+	if(retval==-1) _throwarg(tjGetErrorStr());
 
 	bailout:
 	return retval;
 }
 
-JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV
+/* TurboJPEG 1.4.x: TJ::bufSizeYUV() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
+	(JNIEnv *env, jclass cls, jint width, jint pad, jint height, jint subsamp)
+{
+	jint retval=(jint)tjBufSizeYUV2(width, pad, height, subsamp);
+	if(retval==-1) _throwarg(tjGetErrorStr());
+
+	bailout:
+	return retval;
+}
+
+/* TurboJPEG 1.2.x: TJ::bufSizeYUV() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III
 	(JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
 {
-	jint retval=(jint)tjBufSizeYUV(width, height, subsamp);
-	if(retval==-1) _throw(tjGetErrorStr());
+	return Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII(env, cls, width,
+		4, height, subsamp);
+}
+
+/* TurboJPEG 1.4.x: TJ::planeSizeYUV() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII
+	(JNIEnv *env, jclass cls, jint componentID, jint width, jint stride,
+		jint height, jint subsamp)
+{
+	jint retval=(jint)tjPlaneSizeYUV(componentID, width, stride, height,
+		subsamp);
+	if(retval==-1) _throwarg(tjGetErrorStr());
 
 	bailout:
 	return retval;
 }
 
+/* TurboJPEG 1.4.x: TJ::planeWidth() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III
+	(JNIEnv *env, jclass cls, jint componentID, jint width, jint subsamp)
+{
+	jint retval=(jint)tjPlaneWidth(componentID, width, subsamp);
+	if(retval==-1) _throwarg(tjGetErrorStr());
+
+	bailout:
+	return retval;
+}
+
+/* TurboJPEG 1.4.x: TJ::planeHeight() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III
+	(JNIEnv *env, jclass cls, jint componentID, jint height, jint subsamp)
+{
+	jint retval=(jint)tjPlaneHeight(componentID, height, subsamp);
+	if(retval==-1) _throwarg(tjGetErrorStr());
+
+	bailout:
+	return retval;
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::init() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
 	(JNIEnv *env, jobject obj)
 {
@@ -85,20 +174,20 @@
 	tjhandle handle;
 
 	if((handle=tjInitCompress())==NULL)
-		_throw(tjGetErrorStr());
+		_throwtj();
 
 	bailif0(cls=(*env)->GetObjectClass(env, obj));
 	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
-	(*env)->SetLongField(env, obj, fid, (jlong)handle);
+	(*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 	bailout:
 	return;
 }
 
-JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
-		jint pitch, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
-		jint jpegQual, jint flags)
+static jint TJCompressor_compress
+	(JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
+		jint width, jint pitch, jint height, jint pf, jbyteArray dst,
+		jint jpegSubsamp, jint jpegQual, jint flags)
 {
 	tjhandle handle=0;
 	unsigned long jpegSize=0;
@@ -109,30 +198,27 @@
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
 		|| pitch<0)
-		_throw("Invalid argument in compress()");
+		_throwarg("Invalid argument in compress()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
-	arraySize=(y+height-1)*actualPitch + x+width;
-	if((*env)->GetArrayLength(env, src)<arraySize)
-		_throw("Source buffer is not large enough");
+	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
+	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
+		_throwarg("Source buffer is not large enough");
 	jpegSize=tjBufSize(width, height, jpegSubsamp);
 	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
+	if(ProcessSystemProperties(env)<0) goto bailout;
+
 	if(tjCompress2(handle, &srcBuf[y*actualPitch + x*tjPixelSize[pf]], width,
 		pitch, height, pf, &jpegBuf, &jpegSize, jpegSubsamp, jpegQual,
 		flags|TJFLAG_NOREALLOC)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-		jpegBuf=srcBuf=NULL;
-		_throw(tjGetErrorStr());
-	}
+		_throwtj();
 
 	bailout:
 	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
@@ -140,76 +226,249 @@
 	return (jint)jpegSize;
 }
 
+/* TurboJPEG 1.3.x: TJCompressor::compress() byte source */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+		jint pitch, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
+		jint jpegQual, jint flags)
+{
+	return TJCompressor_compress(env, obj, src, 1, x, y, width, pitch, height,
+		pf, dst, jpegSubsamp, jpegQual, flags);
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::compress() byte source */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
 		jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
 		jint flags)
 {
-	return Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII(
-		env, obj, src, 0, 0, width, pitch, height, pf, dst, jpegSubsamp, jpegQual,
-		flags);
+	return TJCompressor_compress(env, obj, src, 1, 0, 0, width, pitch, height,
+		pf, dst, jpegSubsamp, jpegQual, flags);
 }
 
+/* TurboJPEG 1.3.x: TJCompressor::compress() int source */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIIIII_3BIII
 	(JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
 		jint stride, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
 		jint jpegQual, jint flags)
 {
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
+		_throwarg("Invalid argument in compress()");
+	if(tjPixelSize[pf]!=sizeof(jint))
+		_throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+
+	return TJCompressor_compress(env, obj, src, sizeof(jint), x, y, width,
+		stride*sizeof(jint), height, pf, dst, jpegSubsamp, jpegQual, flags);
+
+	bailout:
+	return 0;
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::compress() int source */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIII_3BIII
+	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+		jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
+		jint flags)
+{
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
+		_throwarg("Invalid argument in compress()");
+	if(tjPixelSize[pf]!=sizeof(jint))
+		_throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+
+	return TJCompressor_compress(env, obj, src, sizeof(jint), 0, 0, width,
+		stride*sizeof(jint), height, pf, dst, jpegSubsamp, jpegQual, flags);
+
+	bailout:
+	return 0;
+}
+
+/* TurboJPEG 1.4.x: TJCompressor::compressFromYUV() -- compresses planar YUV data into dst; returns the JPEG size in bytes */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII
+	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+		jint width, jintArray jSrcStrides, jint height, jint subsamp,
+		jbyteArray dst, jint jpegQual, jint flags)
+{
 	tjhandle handle=0;
 	unsigned long jpegSize=0;
-	jsize arraySize=0, actualStride;
-	unsigned char *srcBuf=NULL, *jpegBuf=NULL;
+	jbyteArray jSrcPlanes[3]={NULL, NULL, NULL};
+	const unsigned char *srcPlanes[3]={NULL, NULL, NULL};  /* base+offset pointers handed to the codec */
+	unsigned char *jpegBuf=NULL, *srcBase[3]={NULL, NULL, NULL};  /* srcBase: exact pointers returned by GetPrimitiveArrayCritical() */
+	int *srcOffsets=NULL, *srcStrides=NULL;
+	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
+
+	gethandle();
+
+	if(subsamp<0 || subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+		_throwarg("Invalid argument in compressFromYUV()");
+	if(org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
+		_throwarg("Mismatch between Java and C API");
+
+	if((*env)->GetArrayLength(env, srcobjs)<nc)
+		_throwarg("Planes array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jSrcOffsets)<nc)
+		_throwarg("Offsets array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jSrcStrides)<nc)
+		_throwarg("Strides array is too small for the subsampling type");
+
+	jpegSize=tjBufSize(width, height, subsamp);
+	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
+		_throwarg("Destination buffer is not large enough");
+
+	bailif0(srcOffsets=(*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+	bailif0(srcStrides=(*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+	for(i=0; i<nc; i++)
+	{
+		int planeSize=tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
+		int pw=tjPlaneWidth(i, width, subsamp);
+
+		if(planeSize<0 || pw<0)
+			_throwarg(tjGetErrorStr());
+
+		if(srcOffsets[i]<0)
+			_throwarg("Invalid argument in compressFromYUV()");
+		if(srcStrides[i]<0 && srcOffsets[i]-planeSize+pw<0)
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+
+		bailif0(jSrcPlanes[i]=(*env)->GetObjectArrayElement(env, srcobjs, i));
+		if((*env)->GetArrayLength(env, jSrcPlanes[i])<srcOffsets[i]+planeSize)
+			_throwarg("Source plane is not large enough");
+
+		bailif0(srcBase[i]=(*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i],
+			0));
+		srcPlanes[i]=&srcBase[i][srcOffsets[i]];  /* keep srcBase[i] untouched so it can be released later */
+	}
+	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+	if(ProcessSystemProperties(env)<0) goto bailout;
+
+	if(tjCompressFromYUVPlanes(handle, srcPlanes, width, srcStrides, height,
+		subsamp, &jpegBuf, &jpegSize, jpegQual, flags|TJFLAG_NOREALLOC)==-1)
+		_throwtj();
+
+	bailout:
+	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
+	for(i=0; i<nc; i++)
+	{
+		if(srcBase[i] && jSrcPlanes[i])  /* NULL-initialized, so planes never acquired are skipped */
+			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
+				srcBase[i], 0);
+	}
+	if(srcStrides)
+		(*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
+	if(srcOffsets)
+		(*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
+	return (jint)jpegSize;
+}
+
+/* Shared by the TurboJPEG 1.4.x encodeYUV() entry points: validates the arguments, then calls tjEncodeYUVPlanes() */
+static void TJCompressor_encodeYUV
+	(JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
+		jint width, jint pitch, jint height, jint pf, jobjectArray dstobjs,
+		jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+{
+	tjhandle handle=0;
+	jsize arraySize=0, actualPitch;
+	jbyteArray jDstPlanes[3]={NULL, NULL, NULL};
+	unsigned char *srcBuf=NULL, *dstBase[3]={NULL, NULL, NULL}, *dstPlanes[3]={NULL, NULL, NULL};
+	int *dstOffsets=NULL, *dstStrides=NULL;
+	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
 
 	gethandle();
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
-		|| stride<0)
-		_throw("Invalid argument in compress()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when compressing from an integer buffer.");
+		|| pitch<0 || subsamp<0 || subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+		_throwarg("Invalid argument in encodeYUV()");
+	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF
+		|| org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
+		_throwarg("Mismatch between Java and C API");
 
-	actualStride=(stride==0)? width:stride;
-	arraySize=(y+height-1)*actualStride + x+width;
-	if((*env)->GetArrayLength(env, src)<arraySize)
-		_throw("Source buffer is not large enough");
-	jpegSize=tjBufSize(width, height, jpegSubsamp);
-	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
-		_throw("Destination buffer is not large enough");
+	if((*env)->GetArrayLength(env, dstobjs)<nc)
+		_throwarg("Planes array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jDstOffsets)<nc)
+		_throwarg("Offsets array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jDstStrides)<nc)
+		_throwarg("Strides array is too small for the subsampling type");
 
-	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
+	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
+	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)  /* elements -> bytes */
+		_throwarg("Source buffer is not large enough");
 
-	if(tjCompress2(handle, &srcBuf[(y*actualStride + x)*sizeof(int)], width,
-		stride*sizeof(jint), height, pf, &jpegBuf, &jpegSize, jpegSubsamp,
-		jpegQual, flags|TJFLAG_NOREALLOC)==-1)
+	bailif0(dstOffsets=(*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+	bailif0(dstStrides=(*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+	for(i=0; i<nc; i++)
 	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-		jpegBuf=srcBuf=NULL;
-		_throw(tjGetErrorStr());
+		int planeSize=tjPlaneSizeYUV(i, width, dstStrides[i], height, subsamp);
+		int pw=tjPlaneWidth(i, width, subsamp);
+
+		if(planeSize<0 || pw<0)
+			_throwarg(tjGetErrorStr());
+
+		if(dstOffsets[i]<0)
+			_throwarg("Invalid argument in encodeYUV()");
+		if(dstStrides[i]<0 && dstOffsets[i]-planeSize+pw<0)
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+		/* NOTE(review): these JNI calls run while the offset/stride arrays are held critical -- check against the JNI critical-region rules. */
+		bailif0(jDstPlanes[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
+		if((*env)->GetArrayLength(env, jDstPlanes[i])<dstOffsets[i]+planeSize)
+			_throwarg("Destination plane is not large enough");
+		/* dstBase[i] stays NULL until plane i is really acquired and keeps the pointer the JVM returned; bailout releases exactly that. */
+		bailif0(dstBase[i]=(*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i],
+			0));
+		dstPlanes[i]=&dstBase[i][dstOffsets[i]];
 	}
+	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+	if(tjEncodeYUVPlanes(handle, &srcBuf[y*actualPitch + x*tjPixelSize[pf]],
+		width, pitch, height, pf, dstPlanes, dstStrides, subsamp, flags)==-1)
+		_throwtj();
 
 	bailout:
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
 	if(srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-	return (jint)jpegSize;
+	for(i=0; i<nc; i++)
+	{
+		if(dstBase[i] && jDstPlanes[i])
+			(*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstBase[i],
+				0);
+	}
+	if(dstStrides)
+		(*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
+	if(dstOffsets)
+		(*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
 }
 
-JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIII_3BIII
-	(JNIEnv *env, jobject obj, jintArray src, jint width, jint pitch,
-		jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
-		jint flags)
+/* TurboJPEG 1.4.x: TJCompressor::encodeYUV() byte source */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III
+	(JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+		jint pitch, jint height, jint pf, jobjectArray dstobjs,
+		jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
 {
-	return Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIIIII_3BIII(
-		env, obj, src, 0, 0, width, pitch, height, pf, dst, jpegSubsamp, jpegQual,
-		flags);
+	TJCompressor_encodeYUV(env, obj, src, 1, x, y, width, pitch, height, pf,
+		dstobjs, jDstOffsets, jDstStrides, subsamp, flags);
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
-		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+/* TurboJPEG 1.4.x: TJCompressor::encodeYUV() int source (requires a pixel format whose size equals sizeof(jint)) */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIIIII_3_3B_3I_3III
+	(JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
+		jint stride, jint height, jint pf, jobjectArray dstobjs,
+		jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+{
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)  /* pf indexes tjPixelSize[] below */
+		_throwarg("Invalid argument in encodeYUV()");
+	if(tjPixelSize[pf]!=sizeof(jint))
+		_throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
+	/* stride counts ints, but the shared worker takes its pitch in bytes, hence the sizeof(jint) scaling. */
+	TJCompressor_encodeYUV(env, obj, src, sizeof(jint), x, y, width,
+		stride*sizeof(jint), height, pf, dstobjs, jDstOffsets, jDstStrides,
+		subsamp, flags);
+
+	bailout:  /* presumably where _throwarg() jumps (macro defined outside this view) */
+	return;
+}
+
+JNIEXPORT void JNICALL TJCompressor_encodeYUV_12
+	(JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint width,
+		jint pitch, jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
 	tjhandle handle=0;
 	jsize arraySize=0;
@@ -219,28 +478,23 @@
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
 		|| pitch<0)
-		_throw("Invalid argument in encodeYUV()");
+		_throwarg("Invalid argument in encodeYUV()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	arraySize=(pitch==0)? width*tjPixelSize[pf]*height:pitch*height;
-	if((*env)->GetArrayLength(env, src)<arraySize)
-		_throw("Source buffer is not large enough");
+	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
+		_throwarg("Source buffer is not large enough");
 	if((*env)->GetArrayLength(env, dst)
 		<(jsize)tjBufSizeYUV(width, height, subsamp))
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
 	if(tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
 		flags)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-		dstBuf=srcBuf=NULL;
-		_throw(tjGetErrorStr());
-	}
+		_throwtj();
 
 	bailout:
 	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
@@ -248,49 +502,33 @@
 	return;
 }
 
+/* TurboJPEG 1.2.x: TJCompressor::encodeYUV() byte source (forwards to the shared worker with srcElementSize 1) */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
+		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+{
+	TJCompressor_encodeYUV_12(env, obj, src, 1, width, pitch, height, pf, dst,
+		subsamp, flags);
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::encodeYUV() int source */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
 	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
 		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
-	tjhandle handle=0;
-	jsize arraySize=0;
-	unsigned char *srcBuf=NULL, *dstBuf=NULL;
-
-	gethandle();
-
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
-		|| stride<0)
-		_throw("Invalid argument in encodeYUV()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
+		_throwarg("Invalid argument in encodeYUV()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when encoding from an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
 
-	arraySize=(stride==0)? width*height:stride*height;
-	if((*env)->GetArrayLength(env, src)<arraySize)
-		_throw("Source buffer is not large enough");
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(width, height, subsamp))
-		_throw("Destination buffer is not large enough");
-
-	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
-
-	if(tjEncodeYUV2(handle, srcBuf, width, stride*sizeof(jint), height, pf,
-		dstBuf, subsamp, flags)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-		dstBuf=srcBuf=NULL;
-		_throw(tjGetErrorStr());
-	}
+	TJCompressor_encodeYUV_12(env, obj, src, sizeof(jint), width,
+		stride*sizeof(jint), height, pf, dst, subsamp, flags);
 
 	bailout:
-	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-	if(srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
 	return;
 }
 
+/* TurboJPEG 1.2.x: TJCompressor::destroy() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
 	(JNIEnv *env, jobject obj)
 {
@@ -298,13 +536,14 @@
 
 	gethandle();
 
-	if(tjDestroy(handle)==-1) _throw(tjGetErrorStr());
+	if(tjDestroy(handle)==-1) _throwtj();
 	(*env)->SetLongField(env, obj, _fid, 0);
 
 	bailout:
 	return;
 }
 
+/* TurboJPEG 1.2.x: TJDecompressor::init() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
 	(JNIEnv *env, jobject obj)
 {
@@ -312,26 +551,27 @@
 	jfieldID fid;
 	tjhandle handle;
 
-	if((handle=tjInitDecompress())==NULL) _throw(tjGetErrorStr());
+	if((handle=tjInitDecompress())==NULL) _throwtj();
 
 	bailif0(cls=(*env)->GetObjectClass(env, obj));
 	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
-	(*env)->SetLongField(env, obj, fid, (jlong)handle);
+	(*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 	bailout:
 	return;
 }
 
+/* TurboJPEG 1.2.x: TJ::getScalingFactors() */
 JNIEXPORT jobjectArray JNICALL Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors
 	(JNIEnv *env, jclass cls)
 {
-  jclass sfcls=NULL;  jfieldID fid=0;
+	jclass sfcls=NULL;  jfieldID fid=0;
 	tjscalingfactor *sf=NULL;  int n=0, i;
 	jobject sfobj=NULL;
 	jobjectArray sfjava=NULL;
 
 	if((sf=tjGetScalingFactors(&n))==NULL || n==0)
-		_throw(tjGetErrorStr());
+		_throwarg(tjGetErrorStr());
 
 	bailif0(sfcls=(*env)->FindClass(env, "org/libjpegturbo/turbojpeg/TJScalingFactor"));
 	bailif0(sfjava=(jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
@@ -350,42 +590,47 @@
 	return sfjava;
 }
 
+/* TurboJPEG 1.2.x: TJDecompressor::decompressHeader() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize)
 {
 	tjhandle handle=0;
 	unsigned char *jpegBuf=NULL;
-	int width=0, height=0, jpegSubsamp=-1;
+	int width=0, height=0, jpegSubsamp=-1, jpegColorspace=-1;
 
 	gethandle();
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 
-	if(tjDecompressHeader2(handle, jpegBuf, (unsigned long)jpegSize, 
-		&width, &height, &jpegSubsamp)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-		_throw(tjGetErrorStr());
-	}
+	if(tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize,
+		&width, &height, &jpegSubsamp, &jpegColorspace)==-1)
+		_throwtj();
+
 	(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);  jpegBuf=NULL;
 
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
 	(*env)->SetIntField(env, obj, _fid, jpegSubsamp);
+	if((_fid=(*env)->GetFieldID(env, _cls, "jpegColorspace", "I"))==0)
+		(*env)->ExceptionClear(env);
+	else
+		(*env)->SetIntField(env, obj, _fid, jpegColorspace);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
 	(*env)->SetIntField(env, obj, _fid, width);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
 	(*env)->SetIntField(env, obj, _fid, height);
 
 	bailout:
+	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
 	return;
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
-		jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
+static void TJDecompressor_decompress
+	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jarray dst,
+		jint dstElementSize, jint x, jint y, jint width, jint pitch, jint height,
+		jint pf, jint flags)
 {
 	tjhandle handle=0;
 	jsize arraySize=0, actualPitch;
@@ -394,16 +639,16 @@
 	gethandle();
 
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in decompress()");
+		_throwarg("Invalid argument in decompress()");
 	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Mismatch between Java and C API");
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
 	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
-	if((*env)->GetArrayLength(env, dst)<arraySize)
-		_throw("Destination buffer is not large enough");
+	if((*env)->GetArrayLength(env, dst)*dstElementSize<arraySize)
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
@@ -411,12 +656,7 @@
 	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
 		&dstBuf[y*actualPitch + x*tjPixelSize[pf]], width, pitch, height, pf,
 		flags)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-		dstBuf=jpegBuf=NULL;
-		_throw(tjGetErrorStr());
-	}
+		_throwtj();
 
 	bailout:
 	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
@@ -424,67 +664,150 @@
 	return;
 }
 
+/* TurboJPEG 1.3.x: TJDecompressor::decompress() byte destination (forwards x/y unchanged, with dstElementSize 1) */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+		jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
+{
+	TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, x, y, width,
+		pitch, height, pf, flags);
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::decompress() byte destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
 		jint width, jint pitch, jint height, jint pf, jint flags)
 {
-	Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII
-		(env, obj, src, jpegSize, dst, 0, 0, width, pitch, height, pf, flags);
+	TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, 0, 0, width,
+		pitch, height, pf, flags);
 }
 
+/* TurboJPEG 1.3.x: TJDecompressor::decompress() int destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
 		jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
 {
-	tjhandle handle=0;
-	jsize arraySize=0, actualStride;
-	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
-
-	gethandle();
-
 	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in decompress()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
+		_throwarg("Invalid argument in decompress()");
 	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when decompressing to an integer buffer.");
+		_throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
-	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
-	actualStride=(stride==0)? width:stride;
-	arraySize=(y+height-1)*actualStride + x+width;
-	if((*env)->GetArrayLength(env, dst)<arraySize)
-		_throw("Destination buffer is not large enough");
-
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
-
-	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
-		&dstBuf[(y*actualStride + x)*sizeof(int)], width, stride*sizeof(jint),
-		height, pf, flags)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-		dstBuf=jpegBuf=NULL;
-		_throw(tjGetErrorStr());
-	}
+	TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), x, y,
+		width, stride*sizeof(jint), height, pf, flags);
 
 	bailout:
-	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
 	return;
 }
 
+/* TurboJPEG 1.2.x: TJDecompressor::decompress() int destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
 		jint width, jint stride, jint height, jint pf, jint flags)
 {
-	Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII
-		(env, obj, src, jpegSize, dst, 0, 0, width, stride, height, pf, flags);
-	
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
+		_throwarg("Invalid argument in decompress()");
+	if(tjPixelSize[pf]!=sizeof(jint))
+		_throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
+
+	TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 0, 0,
+		width, stride*sizeof(jint), height, pf, flags);
+
+	bailout:
+	return;
+
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV
+/* TurboJPEG 1.4.x: TJDecompressor::decompressToYUV() -- output goes to separate plane arrays via tjDecompressToYUVPlanes() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3_3B_3II_3III
+	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize,
+		jobjectArray dstobjs, jintArray jDstOffsets, jint desiredWidth,
+		jintArray jDstStrides, jint desiredHeight, jint flags)
+{
+	tjhandle handle=0;
+	jbyteArray jDstPlanes[3]={NULL, NULL, NULL};
+	unsigned char *jpegBuf=NULL, *dstBase[3]={NULL, NULL, NULL}, *dstPlanes[3]={NULL, NULL, NULL};
+	int *dstOffsets=NULL, *dstStrides=NULL;
+	int jpegSubsamp=-1, jpegWidth=0, jpegHeight=0;
+	int nc=0, i, width, height, scaledWidth, scaledHeight, nsf=0;
+	tjscalingfactor *sf;
+
+	gethandle();
+
+	if((*env)->GetArrayLength(env, src)<jpegSize)
+		_throwarg("Source buffer is not large enough");
+	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+	jpegSubsamp=(int)(*env)->GetIntField(env, obj, _fid);
+	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
+	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
+
+	nc=(jpegSubsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3);
+	if((*env)->GetArrayLength(env, dstobjs)<nc) _throwarg("Planes array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jDstOffsets)<nc) _throwarg("Offsets array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jDstStrides)<nc) _throwarg("Strides array is too small for the subsampling type");
+
+	width=desiredWidth;  height=desiredHeight;
+	if(width==0) width=jpegWidth;  /* 0 selects the JPEG image's own width */
+	if(height==0) height=jpegHeight;  /* 0 selects the JPEG image's own height */
+	sf=tjGetScalingFactors(&nsf);
+	if(!sf || nsf<1) _throwarg(tjGetErrorStr());
+	for(i=0; i<nsf; i++)  /* stop at the first scaling factor that fits within width x height */
+	{
+		scaledWidth=TJSCALED(jpegWidth, sf[i]);
+		scaledHeight=TJSCALED(jpegHeight, sf[i]);
+		if(scaledWidth<=width && scaledHeight<=height)
+			break;
+	}
+	if(i>=nsf)
+		_throwarg("Could not scale down to desired image dimensions");
+
+	bailif0(dstOffsets=(*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+	bailif0(dstStrides=(*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+	for(i=0; i<nc; i++)
+	{
+		int planeSize=tjPlaneSizeYUV(i, scaledWidth, dstStrides[i], scaledHeight,
+			jpegSubsamp);
+		int pw=tjPlaneWidth(i, scaledWidth, jpegSubsamp);
+
+		if(planeSize<0 || pw<0)
+			_throwarg(tjGetErrorStr());
+
+		if(dstOffsets[i]<0)
+			_throwarg("Invalid argument in decompressToYUV()");
+		if(dstStrides[i]<0 && dstOffsets[i]-planeSize+pw<0)
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+		/* NOTE(review): these JNI calls run while the offset/stride arrays are held critical -- check against the JNI critical-region rules. */
+		bailif0(jDstPlanes[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
+		if((*env)->GetArrayLength(env, jDstPlanes[i])<dstOffsets[i]+planeSize)
+			_throwarg("Destination plane is not large enough");
+		/* dstBase[i] stays NULL until plane i is really acquired and keeps the pointer the JVM returned; bailout releases exactly that. */
+		bailif0(dstBase[i]=(*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i],
+			0));
+		dstPlanes[i]=&dstBase[i][dstOffsets[i]];
+	}
+	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+	if(tjDecompressToYUVPlanes(handle, jpegBuf, (unsigned long)jpegSize,
+		dstPlanes, desiredWidth, dstStrides, desiredHeight, flags)==-1)
+		_throwtj();
+
+	bailout:
+	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+	for(i=0; i<nc; i++)
+	{
+		if(dstBase[i] && jDstPlanes[i])
+			(*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstBase[i],
+				0);
+	}
+	if(dstStrides)
+		(*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
+	if(dstOffsets)
+		(*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::decompressToYUV() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
 		jint flags)
 {
@@ -495,7 +818,7 @@
 	gethandle();
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
 	jpegSubsamp=(int)(*env)->GetIntField(env, obj, _fid);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
@@ -504,19 +827,14 @@
 	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
 	if((*env)->GetArrayLength(env, dst)
 		<(jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
-		_throw("Destination buffer is not large enough");
+		_throwarg("Destination buffer is not large enough");
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
 	if(tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
 		flags)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-		dstBuf=jpegBuf=NULL;
-		_throw(tjGetErrorStr());
-	}
+		_throwtj();
 
 	bailout:
 	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
@@ -524,6 +842,115 @@
 	return;
 }
 
+/* Shared by the TurboJPEG 1.4.x decodeYUV() entry points: validates the arguments, then calls tjDecodeYUVPlanes() */
+static void TJDecompressor_decodeYUV
+	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+		jintArray jSrcStrides, jint subsamp, jarray dst, jint dstElementSize,
+		jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
+{
+	tjhandle handle=0;
+	jsize arraySize=0, actualPitch;
+	jbyteArray jSrcPlanes[3]={NULL, NULL, NULL};
+	const unsigned char *srcPlanes[3]={NULL, NULL, NULL};
+	unsigned char *dstBuf=NULL, *srcBase[3]={NULL, NULL, NULL};
+	int *srcOffsets=NULL, *srcStrides=NULL;
+	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
+
+	gethandle();
+
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || subsamp<0
+		|| subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+		_throwarg("Invalid argument in decodeYUV()");
+	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF
+		|| org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
+		_throwarg("Mismatch between Java and C API");
+
+	if((*env)->GetArrayLength(env, srcobjs)<nc)
+		_throwarg("Planes array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jSrcOffsets)<nc)
+		_throwarg("Offsets array is too small for the subsampling type");
+	if((*env)->GetArrayLength(env, jSrcStrides)<nc)
+		_throwarg("Strides array is too small for the subsampling type");
+
+	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
+	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
+	if((*env)->GetArrayLength(env, dst)*dstElementSize<arraySize)  /* elements -> bytes */
+		_throwarg("Destination buffer is not large enough");
+
+	bailif0(srcOffsets=(*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+	bailif0(srcStrides=(*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+	for(i=0; i<nc; i++)
+	{
+		int planeSize=tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
+		int pw=tjPlaneWidth(i, width, subsamp);
+
+		if(planeSize<0 || pw<0)
+			_throwarg(tjGetErrorStr());
+
+		if(srcOffsets[i]<0)
+			_throwarg("Invalid argument in decodeYUV()");
+		if(srcStrides[i]<0 && srcOffsets[i]-planeSize+pw<0)
+			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+		/* NOTE(review): these JNI calls run while the offset/stride arrays are held critical -- check against the JNI critical-region rules. */
+		bailif0(jSrcPlanes[i]=(*env)->GetObjectArrayElement(env, srcobjs, i));
+		if((*env)->GetArrayLength(env, jSrcPlanes[i])<srcOffsets[i]+planeSize)
+			_throwarg("Source plane is not large enough");
+		/* srcBase[i] stays NULL until plane i is really acquired and keeps the pointer the JVM returned; bailout releases exactly that. */
+		bailif0(srcBase[i]=(*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i],
+			0));
+		srcPlanes[i]=&srcBase[i][srcOffsets[i]];
+	}
+	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+	if(tjDecodeYUVPlanes(handle, srcPlanes, srcStrides, subsamp,
+		&dstBuf[y*actualPitch + x*tjPixelSize[pf]], width, pitch, height, pf,
+		flags)==-1)
+		_throwtj();
+
+	bailout:
+	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
+	for(i=0; i<nc; i++)
+	{
+		if(srcBase[i] && jSrcPlanes[i])
+			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
+				srcBase[i], 0);
+	}
+	if(srcStrides)
+		(*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
+	if(srcOffsets)
+		(*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
+}
+
+/* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() byte destination (forwards to the shared worker with dstElementSize 1) */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3BIIIIIII
+	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+		jintArray jSrcStrides, jint subsamp, jbyteArray dst, jint x, jint y,
+		jint width, jint pitch, jint height, jint pf, jint flags)
+{
+	TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
+		subsamp, dst, 1, x, y, width, pitch, height, pf, flags);
+}
+
+/* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() int destination (requires a pixel format whose size equals sizeof(jint)) */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3IIIIIIII
+	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+		jintArray jSrcStrides, jint subsamp, jintArray dst, jint x, jint y,
+		jint width, jint stride, jint height, jint pf, jint flags)
+{
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)  /* pf indexes tjPixelSize[] below */
+		_throwarg("Invalid argument in decodeYUV()");
+	if(tjPixelSize[pf]!=sizeof(jint))
+		_throwarg("Pixel format must be 32-bit when decoding to an integer buffer.");
+	/* stride counts ints, but the shared worker takes its pitch in bytes, hence the sizeof(jint) scaling. */
+	TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
+		subsamp, dst, sizeof(jint), x, y, width, stride*sizeof(jint), height, pf,
+		flags);
+
+	bailout:  /* presumably where _throwarg() jumps (macro defined outside this view) */
+	return;
+}
+
+/* TurboJPEG 1.2.x: TJTransformer::init() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
 	(JNIEnv *env, jobject obj)
 {
@@ -531,11 +958,11 @@
 	jfieldID fid;
 	tjhandle handle;
 
-	if((handle=tjInitTransform())==NULL) _throw(tjGetErrorStr());
+	if((handle=tjInitTransform())==NULL) _throwtj();
 
 	bailif0(cls=(*env)->GetObjectClass(env, obj));
 	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
-	(*env)->SetLongField(env, obj, fid, (jlong)handle);
+	(*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 	bailout:
 	return;
@@ -555,20 +982,20 @@
 	JNICustomFilterParams *params=(JNICustomFilterParams *)transform->data;
 	JNIEnv *env=params->env;
 	jobject tobj=params->tobj, cfobj=params->cfobj;
-  jobject arrayRegionObj, planeRegionObj, bufobj, borobj;
+	jobject arrayRegionObj, planeRegionObj, bufobj, borobj;
 	jclass cls;  jmethodID mid;  jfieldID fid;
 
 	bailif0(bufobj=(*env)->NewDirectByteBuffer(env, coeffs,
 		sizeof(short)*arrayRegion.w*arrayRegion.h));
 	bailif0(cls=(*env)->FindClass(env, "java/nio/ByteOrder"));
-  bailif0(mid=(*env)->GetStaticMethodID(env, cls, "nativeOrder",
+	bailif0(mid=(*env)->GetStaticMethodID(env, cls, "nativeOrder",
 		"()Ljava/nio/ByteOrder;"));
 	bailif0(borobj=(*env)->CallStaticObjectMethod(env, cls, mid));
 	bailif0(cls=(*env)->GetObjectClass(env, bufobj));
 	bailif0(mid=(*env)->GetMethodID(env, cls, "order",
 		"(Ljava/nio/ByteOrder;)Ljava/nio/ByteBuffer;"));
 	(*env)->CallObjectMethod(env, bufobj, mid, borobj);
-  bailif0(mid=(*env)->GetMethodID(env, cls, "asShortBuffer",
+	bailif0(mid=(*env)->GetMethodID(env, cls, "asShortBuffer",
 		"()Ljava/nio/ShortBuffer;"));
 	bailif0(bufobj=(*env)->CallObjectMethod(env, bufobj, mid));
 
@@ -605,6 +1032,7 @@
 	return -1;
 }
 
+/* TurboJPEG 1.2.x: TJTransformer::transform() */
 JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transform
 	(JNIEnv *env, jobject obj, jbyteArray jsrcBuf, jint jpegSize,
 		jobjectArray dstobjs, jobjectArray tobjs, jint flags)
@@ -620,7 +1048,7 @@
 	gethandle();
 
 	if((*env)->GetArrayLength(env, jsrcBuf)<jpegSize)
-		_throw("Source buffer is not large enough");
+		_throwarg("Source buffer is not large enough");
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
 	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
 	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
@@ -630,19 +1058,19 @@
 
 	n=(*env)->GetArrayLength(env, dstobjs);
 	if(n!=(*env)->GetArrayLength(env, tobjs))
-		_throw("Mismatch between size of transforms array and destination buffers array");
+		_throwarg("Mismatch between size of transforms array and destination buffers array");
 
 	if((dstBufs=(unsigned char **)malloc(sizeof(unsigned char *)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((jdstBufs=(jbyteArray *)malloc(sizeof(jbyteArray)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((dstSizes=(unsigned long *)malloc(sizeof(unsigned long)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((t=(tjtransform *)malloc(sizeof(tjtransform)*n))==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	if((params=(JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams)*n))
 		==NULL)
-		_throw("Memory allocation failure");
+		_throwmem();
 	for(i=0; i<n; i++)
 	{
 		dstBufs[i]=NULL;  jdstBufs[i]=NULL;  dstSizes[i]=0;
@@ -682,7 +1110,6 @@
 		}
 	}
 
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
 	for(i=0; i<n; i++)
 	{
 		int w=jpegWidth, h=jpegHeight;
@@ -691,29 +1118,30 @@
 		bailif0(jdstBufs[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
 		if((unsigned long)(*env)->GetArrayLength(env, jdstBufs[i])
 			<tjBufSize(w, h, jpegSubsamp))
-			_throw("Destination buffer is not large enough");
-		bailif0(dstBufs[i]=(*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
+			_throwarg("Destination buffer is not large enough");
 	}
+	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
+	for(i=0; i<n; i++)
+		bailif0(dstBufs[i]=(*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
 
 	if(tjTransform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t,
 		flags|TJFLAG_NOREALLOC)==-1)
+		_throwtj();
+
+	for(i=0; i<n; i++)
 	{
-		(*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
-		jpegBuf=NULL;
-		for(i=0; i<n; i++)
-		{
-			(*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
-			dstBufs[i]=NULL;
-		}
-		_throw(tjGetErrorStr());
+		(*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
+		dstBufs[i]=NULL;
 	}
+	(*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
+	jpegBuf=NULL;
 
 	jdstSizes=(*env)->NewIntArray(env, n);
 	bailif0(dstSizesi=(*env)->GetIntArrayElements(env, jdstSizes, 0));
 	for(i=0; i<n; i++) dstSizesi[i]=(int)dstSizes[i];
 
 	bailout:
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
+	if(dstSizesi) (*env)->ReleaseIntArrayElements(env, jdstSizes, dstSizesi, 0);
 	if(dstBufs)
 	{
 		for(i=0; i<n; i++)
@@ -723,13 +1151,14 @@
 		}
 		free(dstBufs);
 	}
+	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
 	if(jdstBufs) free(jdstBufs);
 	if(dstSizes) free(dstSizes);
-	if(dstSizesi) (*env)->ReleaseIntArrayElements(env, jdstSizes, dstSizesi, 0);
 	if(t) free(t);
 	return jdstSizes;
 }
 
+/* TurboJPEG 1.2.x: TJDecompressor::destroy() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy
 	(JNIEnv *env, jobject obj)
 {
diff --git a/turbojpeg-mapfile b/turbojpeg-mapfile
new file mode 100644
index 0000000..35d55ae
--- /dev/null
+++ b/turbojpeg-mapfile
@@ -0,0 +1,56 @@
+TURBOJPEG_1.0
+{
+	global:
+		tjInitCompress;
+		tjCompress;
+		TJBUFSIZE;
+		tjInitDecompress;
+		tjDecompressHeader;
+		tjDecompress;
+		tjDestroy;
+		tjGetErrorStr;
+	local:
+		*;
+};
+
+TURBOJPEG_1.1
+{
+	global:
+		TJBUFSIZEYUV;
+		tjDecompressHeader2;
+		tjDecompressToYUV;
+		tjEncodeYUV;
+} TURBOJPEG_1.0;
+
+TURBOJPEG_1.2
+{
+	global:
+		tjAlloc;
+		tjBufSize;
+		tjBufSizeYUV;
+		tjCompress2;
+		tjDecompress2;
+		tjEncodeYUV2;
+		tjFree;
+		tjGetScalingFactors;
+		tjInitTransform;
+		tjTransform;
+} TURBOJPEG_1.1;
+
+TURBOJPEG_1.4
+{
+	global:
+		tjBufSizeYUV2;
+		tjCompressFromYUV;
+		tjCompressFromYUVPlanes;
+		tjDecodeYUV;
+		tjDecodeYUVPlanes;
+		tjDecompressHeader3;
+		tjDecompressToYUV2;
+		tjDecompressToYUVPlanes;
+		tjEncodeYUV3;
+		tjEncodeYUVPlanes;
+		tjPlaneHeight;
+		tjPlaneSizeYUV;
+		tjPlaneWidth;
+} TURBOJPEG_1.2;
diff --git a/turbojpeg-mapfile.jni b/turbojpeg-mapfile.jni
new file mode 100644
index 0000000..9c1d25b
--- /dev/null
+++ b/turbojpeg-mapfile.jni
@@ -0,0 +1,92 @@
+TURBOJPEG_1.0
+{
+	global:
+		tjInitCompress;
+		tjCompress;
+		TJBUFSIZE;
+		tjInitDecompress;
+		tjDecompressHeader;
+		tjDecompress;
+		tjDestroy;
+		tjGetErrorStr;
+	local:
+		*;
+};
+
+TURBOJPEG_1.1
+{
+	global:
+		TJBUFSIZEYUV;
+		tjDecompressHeader2;
+		tjDecompressToYUV;
+		tjEncodeYUV;
+} TURBOJPEG_1.0;
+
+TURBOJPEG_1.2
+{
+	global:
+		tjAlloc;
+		tjBufSize;
+		tjBufSizeYUV;
+		tjCompress2;
+		tjDecompress2;
+		tjEncodeYUV2;
+		tjFree;
+		tjGetScalingFactors;
+		tjInitTransform;
+		tjTransform;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSize;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III;
+		Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_init;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_init;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy;
+		Java_org_libjpegturbo_turbojpeg_TJTransformer_init;
+		Java_org_libjpegturbo_turbojpeg_TJTransformer_transform;
+} TURBOJPEG_1.1;
+
+TURBOJPEG_1.3
+{
+	global:
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIIIII_3BIII;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII;
+} TURBOJPEG_1.2;
+
+TURBOJPEG_1.4
+{
+	global:
+		tjBufSizeYUV2;
+		tjCompressFromYUV;
+		tjCompressFromYUVPlanes;
+		tjDecodeYUV;
+		tjDecodeYUVPlanes;
+		tjDecompressHeader3;
+		tjDecompressToYUV2;
+		tjDecompressToYUVPlanes;
+		tjEncodeYUV3;
+		tjEncodeYUVPlanes;
+		tjPlaneHeight;
+		tjPlaneSizeYUV;
+		tjPlaneWidth;
+		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III;
+		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIIIII_3_3B_3I_3III;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3_3B_3II_3III;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3BIIIIIII;
+		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3IIIIIIII;
+		Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III;
+		Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII;
+		Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III;
+} TURBOJPEG_1.3;
diff --git a/turbojpeg.c b/turbojpeg.c
index 9117273..f51df78 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2012, 2014 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,6 +31,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <ctype.h>
 #include <jinclude.h>
 #define JPEG_INTERNALS
 #include <jpeglib.h>
@@ -39,12 +40,15 @@
 #include "./turbojpeg.h"
 #include "./tjutil.h"
 #include "transupp.h"
+#include "./jpegcomp.h"
 
 extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **,
 	unsigned long *, boolean);
-extern void jpeg_mem_src_tj(j_decompress_ptr, unsigned char *, unsigned long);
+extern void jpeg_mem_src_tj(j_decompress_ptr, const unsigned char *,
+	unsigned long);
 
 #define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
+#define isPow2(x) (((x)&((x)-1))==0)  /* NOTE: also true for x==0 */
 
 
 /* Error handling (based on example in example.c) */
@@ -55,6 +59,8 @@
 {
 	struct jpeg_error_mgr pub;
 	jmp_buf setjmp_buffer;
+	void (*emit_message)(j_common_ptr, int);
+	boolean warning;
 };
 typedef struct my_error_mgr *my_error_ptr;
 
@@ -72,6 +78,13 @@
 	(*cinfo->err->format_message)(cinfo, errStr);
 }
 
+static void my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+	my_error_ptr myerr=(my_error_ptr)cinfo->err;
+	myerr->emit_message(cinfo, msg_level);  /* chain to the saved handler */
+	if(msg_level<0) myerr->warning=TRUE;  /* record that a level <0 was seen */
+}
+
 
 /* Global structures, macros, etc. */
 
@@ -82,10 +95,10 @@
 	struct jpeg_compress_struct cinfo;
 	struct jpeg_decompress_struct dinfo;
 	struct my_error_mgr jerr;
-	int init;
+	int init, headerRead;
 } tjinstance;
 
-static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3};
+static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3, 3};
 
 static const JXFORM_CODE xformtypes[TJ_NUMXOP]=
 {
@@ -119,7 +132,20 @@
 	j_compress_ptr cinfo=NULL;  j_decompress_ptr dinfo=NULL;  \
 	if(!this) {snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle");  \
 		return -1;}  \
-	cinfo=&this->cinfo;  dinfo=&this->dinfo;
+	cinfo=&this->cinfo;  dinfo=&this->dinfo;  \
+	this->jerr.warning=FALSE;
+#define getcinstance(handle) tjinstance *this=(tjinstance *)handle;  \
+	j_compress_ptr cinfo=NULL;  \
+	if(!this) {snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle");  \
+		return -1;}  \
+	cinfo=&this->cinfo;  \
+	this->jerr.warning=FALSE;  /* clear any previously recorded warning */
+#define getdinstance(handle) tjinstance *this=(tjinstance *)handle;  \
+	j_decompress_ptr dinfo=NULL;  \
+	if(!this) {snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle");  \
+		return -1;}  \
+	dinfo=&this->dinfo;  \
+	this->jerr.warning=FALSE;  /* clear any previously recorded warning */
 
 static int getPixelFormat(int pixelSize, int flags)
 {
@@ -149,6 +175,7 @@
 	int pixelFormat, int subsamp, int jpegQual, int flags)
 {
 	int retval=0;
+	char *env=NULL;
 
 	switch(pixelFormat)
 	{
@@ -185,10 +212,34 @@
 			cinfo->in_color_space=JCS_RGB;  pixelFormat=TJPF_RGB;
 			break;
 		#endif
+		case TJPF_CMYK:
+			cinfo->in_color_space=JCS_CMYK;  break;
 	}
 
 	cinfo->input_components=tjPixelSize[pixelFormat];
 	jpeg_set_defaults(cinfo);
+
+#ifndef NO_GETENV
+	if((env=getenv("TJ_OPTIMIZE"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))
+		cinfo->optimize_coding=TRUE;
+	if((env=getenv("TJ_ARITHMETIC"))!=NULL && strlen(env)>0	&& !strcmp(env, "1"))
+		cinfo->arith_code=TRUE;
+	if((env=getenv("TJ_RESTART"))!=NULL && strlen(env)>0)
+	{
+		int temp=-1;  char tempc=0;
+		if(sscanf(env, "%d%c", &temp, &tempc)>=1 && temp>=0 && temp<=65535)
+		{
+			if(toupper(tempc)=='B')
+			{
+				cinfo->restart_interval=temp;
+				cinfo->restart_in_rows=0;
+			}
+			else
+				cinfo->restart_in_rows=temp;
+		}
+	}
+#endif
+
 	if(jpegQual>=0)
 	{
 		jpeg_set_quality(cinfo, jpegQual, TRUE);
@@ -197,15 +248,26 @@
 	}
 	if(subsamp==TJSAMP_GRAY)
 		jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
-	else
-		jpeg_set_colorspace(cinfo, JCS_YCbCr);
+	else if(pixelFormat==TJPF_CMYK)
+		jpeg_set_colorspace(cinfo, JCS_YCCK);
+	else jpeg_set_colorspace(cinfo, JCS_YCbCr);
+
+#ifndef NO_GETENV
+	if((env=getenv("TJ_PROGRESSIVE"))!=NULL && strlen(env)>0
+		&& !strcmp(env, "1"))
+		jpeg_simple_progression(cinfo);
+#endif
 
 	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;
 	cinfo->comp_info[1].h_samp_factor=1;
 	cinfo->comp_info[2].h_samp_factor=1;
+	if(cinfo->num_components>3)
+		cinfo->comp_info[3].h_samp_factor=tjMCUWidth[subsamp]/8;
 	cinfo->comp_info[0].v_samp_factor=tjMCUHeight[subsamp]/8;
 	cinfo->comp_info[1].v_samp_factor=1;
 	cinfo->comp_info[2].v_samp_factor=1;
+	if(cinfo->num_components>3)
+		cinfo->comp_info[3].v_samp_factor=tjMCUHeight[subsamp]/8;
 
 	return retval;
 }
@@ -255,6 +317,8 @@
 		case TJPF_ABGR:
 			dinfo->out_color_space=JCS_RGB;  break;
 		#endif
+		case TJPF_CMYK:
+			dinfo->out_color_space=JCS_CMYK;  break;
 		default:
 			_throw("Unsupported pixel format");
 	}
@@ -269,9 +333,20 @@
 static int getSubsamp(j_decompress_ptr dinfo)
 {
 	int retval=-1, i, k;
+
+	/* The sampling factors actually have no meaning with grayscale JPEG files,
+	   and in fact it's possible to generate grayscale JPEGs with sampling
+	   factors > 1 (even though those sampling factors are ignored by the
+	   decompressor.)  Thus, we need to treat grayscale as a special case. */
+	if(dinfo->num_components==1 && dinfo->jpeg_color_space==JCS_GRAYSCALE)
+		return TJSAMP_GRAY;
+
 	for(i=0; i<NUMSUBOPT; i++)
 	{
-		if(dinfo->num_components==pixelsize[i])
+		if(dinfo->num_components==pixelsize[i]
+			|| ((dinfo->jpeg_color_space==JCS_YCCK
+				|| dinfo->jpeg_color_space==JCS_CMYK)
+					&& pixelsize[i]==3 && dinfo->num_components==4))
 		{
 			if(dinfo->comp_info[0].h_samp_factor==tjMCUWidth[i]/8
 				&& dinfo->comp_info[0].v_samp_factor==tjMCUHeight[i]/8)
@@ -279,8 +354,13 @@
 				int match=0;
 				for(k=1; k<dinfo->num_components; k++)
 				{
-					if(dinfo->comp_info[k].h_samp_factor==1
-						&& dinfo->comp_info[k].v_samp_factor==1)
+					int href=1, vref=1;
+					if(dinfo->jpeg_color_space==JCS_YCCK && k==3)
+					{
+						href=tjMCUWidth[i]/8;  vref=tjMCUHeight[i]/8;
+					}
+					if(dinfo->comp_info[k].h_samp_factor==href
+						&& dinfo->comp_info[k].v_samp_factor==vref)
 						match++;
 				}
 				if(match==dinfo->num_components-1)
@@ -477,12 +557,15 @@
 
 static tjhandle _tjInitCompress(tjinstance *this)
 {
-	unsigned char buffer[1], *buf=buffer;  unsigned long size=1;
+	static unsigned char buffer[1];
+	unsigned char *buf=buffer;  unsigned long size=1;
 
 	/* This is also straight out of example.c */
 	this->cinfo.err=jpeg_std_error(&this->jerr.pub);
 	this->jerr.pub.error_exit=my_error_exit;
 	this->jerr.pub.output_message=my_output_message;
+	this->jerr.emit_message=this->jerr.pub.emit_message;
+	this->jerr.pub.emit_message=my_emit_message;
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -547,22 +630,33 @@
 }
 
 
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
 	int subsamp)
 {
-	unsigned long retval=0;
-	int pw, ph, cw, ch;
-	if(width<1 || height<1 || subsamp<0 || subsamp>=NUMSUBOPT)
-		_throw("tjBufSizeYUV(): Invalid argument");
-	pw=PAD(width, tjMCUWidth[subsamp]/8);
-	ph=PAD(height, tjMCUHeight[subsamp]/8);
-	cw=pw*8/tjMCUWidth[subsamp];  ch=ph*8/tjMCUHeight[subsamp];
-	retval=PAD(pw, 4)*ph + (subsamp==TJSAMP_GRAY? 0:PAD(cw, 4)*ch*2);
+	unsigned long retval=0;  int nc, i;
+	/* Sum stride*height over every plane, in the unsigned long return type */
+	if(subsamp<0 || subsamp>=NUMSUBOPT)
+		_throw("tjBufSizeYUV2(): Invalid argument");
+
+	nc=(subsamp==TJSAMP_GRAY? 1:3);  /* 1 plane for gray, otherwise 3 */
+	for(i=0; i<nc; i++)
+	{
+		int pw=tjPlaneWidth(i, width, subsamp);
+		int stride=PAD(pw, pad);  /* row length after padding to pad */
+		int ph=tjPlaneHeight(i, height, subsamp);
+		if(pw<0 || ph<0) return -1;  /* helper rejected width/height */
+		else retval+=(unsigned long)stride*ph;  /* widen before multiplying */
+	}
 
 	bailout:
 	return retval;
 }
 
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+	int subsamp)
+{
+	return tjBufSizeYUV2(width, 4, height, subsamp);  /* 2nd argument is pad */
+}
 
 DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
 	int subsamp)
@@ -571,7 +665,72 @@
 }
 
 
-DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp)
+{
+	int pw, nc, retval=0;  /* retval: width computed for plane componentID */
+
+	if(width<1 || subsamp<0 || subsamp>=TJ_NUMSAMP)
+		_throw("tjPlaneWidth(): Invalid argument");
+	nc=(subsamp==TJSAMP_GRAY? 1:3);  /* 1 component for gray, otherwise 3 */
+	if(componentID<0 || componentID>=nc)
+		_throw("tjPlaneWidth(): Invalid argument");
+
+	pw=PAD(width, tjMCUWidth[subsamp]/8);  /* pad width to tjMCUWidth/8 */
+	if(componentID==0)
+		retval=pw;  /* component 0 uses the padded width as is */
+	else
+		retval=pw*8/tjMCUWidth[subsamp];  /* others scale by 8/tjMCUWidth */
+
+	bailout:
+	return retval;
+}
+
+
+DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp)
+{
+	int ph, nc, retval=0;  /* retval: height computed for plane componentID */
+
+	if(height<1 || subsamp<0 || subsamp>=TJ_NUMSAMP)
+		_throw("tjPlaneHeight(): Invalid argument");
+	nc=(subsamp==TJSAMP_GRAY? 1:3);  /* 1 component for gray, otherwise 3 */
+	if(componentID<0 || componentID>=nc)
+		_throw("tjPlaneHeight(): Invalid argument");
+
+	ph=PAD(height, tjMCUHeight[subsamp]/8);  /* pad height to tjMCUHeight/8 */
+	if(componentID==0)
+		retval=ph;  /* component 0 uses the padded height as is */
+	else
+		retval=ph*8/tjMCUHeight[subsamp];  /* others scale by 8/tjMCUHeight */
+
+	bailout:
+	return retval;
+}
+
+
+DLLEXPORT unsigned long DLLCALL tjPlaneSizeYUV(int componentID, int width,
+	int stride, int height, int subsamp)
+{
+	unsigned long retval=0;
+	int pw, ph;
+
+	if(width<1 || height<1 || subsamp<0 || subsamp>=NUMSUBOPT)
+		_throw("tjPlaneSizeYUV(): Invalid argument");
+
+	pw=tjPlaneWidth(componentID, width, subsamp);
+	ph=tjPlaneHeight(componentID, height, subsamp);
+	if(pw<0 || ph<0) return -1;  /* helper rejected componentID/size */
+
+	if(stride==0) stride=pw;  /* 0 selects the plane width as the stride */
+	else stride=abs(stride);  /* only the magnitude is used for the size */
+	/* Multiply in unsigned long (the return type) rather than in int */
+	retval=(unsigned long)stride*(ph-1)+pw;  /* last row needs only pw bytes */
+
+	bailout:
+	return retval;
+}
+
+
+DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, const unsigned char *srcBuf,
 	int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf,
 	unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)
 {
@@ -580,7 +739,7 @@
 	unsigned char *rgbBuf=NULL;
 	#endif
 
-	getinstance(handle)
+	getcinstance(handle)
 	if((this->init&COMPRESS)==0)
 		_throw("tjCompress2(): Instance has not been initialized for compression");
 
@@ -599,7 +758,7 @@
 	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
 
 	#ifndef JCS_EXTENSIONS
-	if(pixelFormat!=TJPF_GRAY)
+	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK)
 	{
 		rgbBuf=(unsigned char *)malloc(width*height*RGB_PIXELSIZE);
 		if(!rgbBuf) _throw("tjCompress2(): Memory allocation failure");
@@ -628,8 +787,9 @@
 		_throw("tjCompress2(): Memory allocation failure");
 	for(i=0; i<height; i++)
 	{
-		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&srcBuf[(height-i-1)*pitch];
-		else row_pointer[i]=&srcBuf[i*pitch];
+		if(flags&TJFLAG_BOTTOMUP)
+			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
+		else row_pointer[i]=(JSAMPROW)&srcBuf[i*pitch];
 	}
 	while(cinfo->next_scanline<cinfo->image_height)
 	{
@@ -644,6 +804,7 @@
 	if(rgbBuf) free(rgbBuf);
 	#endif
 	if(row_pointer) free(row_pointer);
+	if(this->jerr.warning) retval=-1;
 	return retval;
 }
 
@@ -669,23 +830,23 @@
 }
 
 
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
-	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
-	int subsamp, int flags)
+DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle,
+	const unsigned char *srcBuf, int width, int pitch, int height,
+	int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp,
+	int flags)
 {
 	int i, retval=0;  JSAMPROW *row_pointer=NULL;
 	JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
 	JSAMPROW *tmpbuf[MAX_COMPONENTS], *tmpbuf2[MAX_COMPONENTS];
 	JSAMPROW *outbuf[MAX_COMPONENTS];
-	int row, pw, ph, cw[MAX_COMPONENTS], ch[MAX_COMPONENTS];
-	JSAMPLE *ptr=dstBuf;
-	unsigned long yuvsize=0;
+	int row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
+	JSAMPLE *ptr;
 	jpeg_component_info *compptr;
 	#ifndef JCS_EXTENSIONS
 	unsigned char *rgbBuf=NULL;
 	#endif
 
-	getinstance(handle);
+	getcinstance(handle);
 
 	for(i=0; i<MAX_COMPONENTS; i++)
 	{
@@ -694,12 +855,14 @@
 	}
 
 	if((this->init&COMPRESS)==0)
-		_throw("tjEncodeYUV2(): Instance has not been initialized for compression");
+		_throw("tjEncodeYUVPlanes(): Instance has not been initialized for compression");
 
 	if(srcBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
-		|| pixelFormat>=TJ_NUMPF || dstBuf==NULL || subsamp<0
+		|| pixelFormat>=TJ_NUMPF || !dstPlanes || !dstPlanes[0] || subsamp<0
 		|| subsamp>=NUMSUBOPT)
-		_throw("tjEncodeYUV2(): Invalid argument");
+		_throw("tjEncodeYUVPlanes(): Invalid argument");
+	if(subsamp!=TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+		_throw("tjEncodeYUVPlanes(): Invalid argument");
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -708,13 +871,16 @@
 		goto bailout;
 	}
 
+	if(pixelFormat==TJPF_CMYK)
+		_throw("tjEncodeYUVPlanes(): Cannot generate YUV images from CMYK pixels");
+
 	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
 
 	#ifndef JCS_EXTENSIONS
-	if(pixelFormat!=TJPF_GRAY)
+	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK)
 	{
 		rgbBuf=(unsigned char *)malloc(width*height*RGB_PIXELSIZE);
-		if(!rgbBuf) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!rgbBuf) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
 		srcBuf=toRGB(srcBuf, width, pitch, height, pixelFormat, rgbBuf);
 		pitch=width*RGB_PIXELSIZE;
 	}
@@ -727,7 +893,6 @@
 	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
 	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	yuvsize=tjBufSizeYUV(width, height, subsamp);
 	if(setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags)==-1) return -1;
 
 	/* Execute only the parts of jpeg_start_compress() that we need.  If we
@@ -735,25 +900,26 @@
 	   to write the file headers, which could overflow the output buffer if the
 	   YUV image were very small. */
 	if(cinfo->global_state!=CSTATE_START)
-		_throw("tjEncodeYUV3(): libjpeg API is in the wrong state");
+		_throw("tjEncodeYUVPlanes(): libjpeg API is in the wrong state");
 	(*cinfo->err->reset_error_mgr)((j_common_ptr)cinfo);
 	jinit_c_master_control(cinfo, FALSE);
 	jinit_color_converter(cinfo);
 	jinit_downsampler(cinfo);
 	(*cinfo->cconvert->start_pass)(cinfo);
 
-	pw=PAD(width, cinfo->max_h_samp_factor);
-	ph=PAD(height, cinfo->max_v_samp_factor);
+	pw0=PAD(width, cinfo->max_h_samp_factor);
+	ph0=PAD(height, cinfo->max_v_samp_factor);
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph))==NULL)
-		_throw("tjEncodeYUV2(): Memory allocation failure");
+	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph0))==NULL)
+		_throw("tjEncodeYUVPlanes(): Memory allocation failure");
 	for(i=0; i<height; i++)
 	{
-		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&srcBuf[(height-i-1)*pitch];
-		else row_pointer[i]=&srcBuf[i*pitch];
+		if(flags&TJFLAG_BOTTOMUP)
+			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
+		else row_pointer[i]=(JSAMPROW)&srcBuf[i*pitch];
 	}
-	if(height<ph)
-		for(i=height; i<ph; i++) row_pointer[i]=row_pointer[height-1];
+	if(height<ph0)
+		for(i=height; i<ph0; i++) row_pointer[i]=row_pointer[height-1];
 
 	for(i=0; i<cinfo->num_components; i++)
 	{
@@ -761,9 +927,9 @@
 		_tmpbuf[i]=(JSAMPLE *)malloc(
 			PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
 				/compptr->h_samp_factor, 16) * cinfo->max_v_samp_factor + 16);
-		if(!_tmpbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!_tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
 		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*cinfo->max_v_samp_factor);
-		if(!tmpbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
 		for(row=0; row<cinfo->max_v_samp_factor; row++)
 		{
 			unsigned char *_tmpbuf_aligned=
@@ -774,9 +940,9 @@
 		}
 		_tmpbuf2[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
 			* compptr->v_samp_factor + 16);
-		if(!_tmpbuf2[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!_tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
 		tmpbuf2[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
-		if(!tmpbuf2[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
+		if(!tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
 		for(row=0; row<compptr->v_samp_factor; row++)
 		{
 			unsigned char *_tmpbuf2_aligned=
@@ -784,20 +950,19 @@
 			tmpbuf2[i][row]=&_tmpbuf2_aligned[
 				PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
 		}
-		cw[i]=pw*compptr->h_samp_factor/cinfo->max_h_samp_factor;
-		ch[i]=ph*compptr->v_samp_factor/cinfo->max_v_samp_factor;
-		outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]);
-		if(!outbuf[i]) _throw("tjEncodeYUV2(): Memory allocation failure");
-		for(row=0; row<ch[i]; row++)
+		pw[i]=pw0*compptr->h_samp_factor/cinfo->max_h_samp_factor;
+		ph[i]=ph0*compptr->v_samp_factor/cinfo->max_v_samp_factor;
+		outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]);
+		if(!outbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+		ptr=dstPlanes[i];
+		for(row=0; row<ph[i]; row++)
 		{
 			outbuf[i][row]=ptr;
-			ptr+=PAD(cw[i], 4);
+			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
 		}
 	}
-	if(yuvsize!=(unsigned long)(ptr-dstBuf))
-		_throw("tjEncodeYUV2(): Generated image is not the correct size");
 
-	for(row=0; row<ph; row+=cinfo->max_v_samp_factor)
+	for(row=0; row<ph0; row+=cinfo->max_v_samp_factor)
 	{
 		(*cinfo->cconvert->color_convert)(cinfo, &row_pointer[row], tmpbuf, 0,
 			cinfo->max_v_samp_factor);
@@ -805,7 +970,7 @@
 		for(i=0, compptr=cinfo->comp_info; i<cinfo->num_components; i++, compptr++)
 			jcopy_sample_rows(tmpbuf2[i], 0, outbuf[i],
 				row*compptr->v_samp_factor/cinfo->max_v_samp_factor,
-				compptr->v_samp_factor, cw[i]);
+				compptr->v_samp_factor, pw[i]);
 	}
 	cinfo->next_scanline+=height;
 	jpeg_abort_compress(cinfo);
@@ -824,9 +989,54 @@
 		if(_tmpbuf2[i]!=NULL) free(_tmpbuf2[i]);
 		if(outbuf[i]!=NULL) free(outbuf[i]);
 	}
+	if(this->jerr.warning) retval=-1;
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
+	const unsigned char *srcBuf, int width, int pitch, int height,
+	int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags)
+{
+	unsigned char *dstPlanes[3];
+	int pw0, ph0, strides[3], retval=-1;
+	/* pad must be a power of 2 >= 1: PAD(x, 0) is 0, i.e. planes would alias */
+	if(width<=0 || height<=0 || dstBuf==NULL || pad<1 || !isPow2(pad)
+		|| subsamp<0 || subsamp>=NUMSUBOPT)
+		_throw("tjEncodeYUV3(): Invalid argument");
+
+	pw0=tjPlaneWidth(0, width, subsamp);
+	ph0=tjPlaneHeight(0, height, subsamp);
+	dstPlanes[0]=dstBuf;  /* plane 0 starts at the beginning of dstBuf */
+	strides[0]=PAD(pw0, pad);
+	if(subsamp==TJSAMP_GRAY)
+	{
+		strides[1]=strides[2]=0;  /* grayscale: planes 1 and 2 are absent */
+		dstPlanes[1]=dstPlanes[2]=NULL;
+	}
+	else
+	{
+		int pw1=tjPlaneWidth(1, width, subsamp);
+		int ph1=tjPlaneHeight(1, height, subsamp);
+		strides[1]=strides[2]=PAD(pw1, pad);
+		dstPlanes[1]=dstPlanes[0]+strides[0]*ph0;  /* right after plane 0 */
+		dstPlanes[2]=dstPlanes[1]+strides[1]*ph1;  /* right after plane 1 */
+	}
+
+	return tjEncodeYUVPlanes(handle, srcBuf, width, pitch, height, pixelFormat,
+		dstPlanes, strides, subsamp, flags);
+
+	bailout:
+	return retval;
+}
+
+DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
+	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
+	int subsamp, int flags)
+{
+	return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
+		dstBuf, 4, subsamp, flags);  /* the literal 4 is the pad argument */
+}
+
 DLLEXPORT int DLLCALL tjEncodeYUV(tjhandle handle, unsigned char *srcBuf,
 	int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
 	int subsamp, int flags)
@@ -836,16 +1046,187 @@
 }
 
 
+DLLEXPORT int DLLCALL tjCompressFromYUVPlanes(tjhandle handle,
+	const unsigned char **srcPlanes, int width, const int *strides, int height,
+	int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual,
+	int flags)
+{
+	int i, row, retval=0, alloc=1;  JSAMPROW *inbuf[MAX_COMPONENTS];
+	int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
+	JSAMPLE *_tmpbuf=NULL, *ptr;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+	/* Feeds caller-supplied planes to libjpeg as raw data (raw_data_in) */
+	getcinstance(handle)
+
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		tmpbuf[i]=NULL;  inbuf[i]=NULL;  /* lets bailout free unconditionally */
+	}
+
+	if((this->init&COMPRESS)==0)
+		_throw("tjCompressFromYUVPlanes(): Instance has not been initialized for compression");
+
+	if(!srcPlanes || !srcPlanes[0] || width<=0 || height<=0 || subsamp<0
+		|| subsamp>=NUMSUBOPT || jpegBuf==NULL || jpegSize==NULL || jpegQual<0
+		|| jpegQual>100)
+		_throw("tjCompressFromYUVPlanes(): Invalid argument");
+	if(subsamp!=TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+		_throw("tjCompressFromYUVPlanes(): Invalid argument");
+
+	if(setjmp(this->jerr.setjmp_buffer))
+	{
+		/* If we get here, the JPEG code has signaled an error. */
+		retval=-1;
+		goto bailout;
+	}
+
+	cinfo->image_width=width;
+	cinfo->image_height=height;
+
+	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+
+	if(flags&TJFLAG_NOREALLOC)
+	{
+		alloc=0;  *jpegSize=tjBufSize(width, height, subsamp);
+	}
+	jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+	if(setCompDefaults(cinfo, TJPF_RGB, subsamp, jpegQual, flags)==-1)
+		return -1;  /* NOTE(review): returns without going through bailout */
+	cinfo->raw_data_in=TRUE;
+
+	jpeg_start_compress(cinfo, TRUE);
+	for(i=0; i<cinfo->num_components; i++)
+	{
+		jpeg_component_info *compptr=&cinfo->comp_info[i];
+		int ih;
+		iw[i]=compptr->width_in_blocks*DCTSIZE;  /* block-aligned width */
+		ih=compptr->height_in_blocks*DCTSIZE;  /* block-aligned height */
+		pw[i]=PAD(cinfo->image_width, cinfo->max_h_samp_factor)
+			*compptr->h_samp_factor/cinfo->max_h_samp_factor;
+		ph[i]=PAD(cinfo->image_height, cinfo->max_v_samp_factor)
+			*compptr->v_samp_factor/cinfo->max_v_samp_factor;
+		if(iw[i]!=pw[i] || ih!=ph[i]) usetmpbuf=1;  /* must stage in tmpbuf */
+		th[i]=compptr->v_samp_factor*DCTSIZE;  /* rows staged per pass */
+		tmpbufsize+=iw[i]*th[i];
+		if((inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]))==NULL)
+			_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+		ptr=(JSAMPLE *)srcPlanes[i];
+		for(row=0; row<ph[i]; row++)
+		{
+			inbuf[i][row]=ptr;
+			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];  /* 0/NULL: pw */
+		}
+	}
+	if(usetmpbuf)
+	{
+		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
+			_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+		ptr=_tmpbuf;  /* one allocation, carved into per-component rows below */
+		for(i=0; i<cinfo->num_components; i++)
+		{
+			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
+				_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+			for(row=0; row<th[i]; row++)
+			{
+				tmpbuf[i][row]=ptr;
+				ptr+=iw[i];
+			}
+		}
+	}
+
+	for(row=0; row<(int)cinfo->image_height;
+		row+=cinfo->max_v_samp_factor*DCTSIZE)
+	{
+		JSAMPARRAY yuvptr[MAX_COMPONENTS];
+		int crow[MAX_COMPONENTS];
+		for(i=0; i<cinfo->num_components; i++)
+		{
+			jpeg_component_info *compptr=&cinfo->comp_info[i];
+			crow[i]=row*compptr->v_samp_factor/cinfo->max_v_samp_factor;
+			if(usetmpbuf)
+			{
+				int j, k;
+				for(j=0; j<min(th[i], ph[i]-crow[i]); j++)
+				{
+					memcpy(tmpbuf[i][j], inbuf[i][crow[i]+j], pw[i]);
+					/* Duplicate last sample in row to fill out MCU */
+					for(k=pw[i]; k<iw[i]; k++) tmpbuf[i][j][k]=tmpbuf[i][j][pw[i]-1];
+				}
+				/* Duplicate last row to fill out MCU */
+				for(j=ph[i]-crow[i]; j<th[i]; j++)
+					memcpy(tmpbuf[i][j], tmpbuf[i][ph[i]-crow[i]-1], iw[i]);
+				yuvptr[i]=tmpbuf[i];
+			}
+			else
+				yuvptr[i]=&inbuf[i][crow[i]];  /* pass source rows directly */
+		}
+		jpeg_write_raw_data(cinfo, yuvptr, cinfo->max_v_samp_factor*DCTSIZE);
+	}
+	jpeg_finish_compress(cinfo);
+
+	bailout:
+	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		if(tmpbuf[i]) free(tmpbuf[i]);
+		if(inbuf[i]) free(inbuf[i]);
+	}
+	if(_tmpbuf) free(_tmpbuf);
+	if(this->jerr.warning) retval=-1;  /* a recorded warning fails the call */
+	return retval;
+}
+
+DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle,
+	const unsigned char *srcBuf, int width, int pad, int height, int subsamp,
+	unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)
+{
+	const unsigned char *srcPlanes[3];
+	int pw0, ph0, strides[3], retval=-1;
+	/* PAD() is mask-based, so pad must be a power of 2 (and at least 1) */
+	if(srcBuf==NULL || width<=0 || pad<1 || !isPow2(pad) || height<=0
+		|| subsamp<0 || subsamp>=NUMSUBOPT)
+		_throw("tjCompressFromYUV(): Invalid argument");
+
+	pw0=tjPlaneWidth(0, width, subsamp);
+	ph0=tjPlaneHeight(0, height, subsamp);
+	srcPlanes[0]=srcBuf;  /* plane 0 starts at the beginning of srcBuf */
+	strides[0]=PAD(pw0, pad);
+	if(subsamp==TJSAMP_GRAY)
+	{
+		strides[1]=strides[2]=0;  /* grayscale: planes 1 and 2 are absent */
+		srcPlanes[1]=srcPlanes[2]=NULL;
+	}
+	else
+	{
+		int pw1=tjPlaneWidth(1, width, subsamp);
+		int ph1=tjPlaneHeight(1, height, subsamp);
+		strides[1]=strides[2]=PAD(pw1, pad);
+		srcPlanes[1]=srcPlanes[0]+strides[0]*ph0;  /* right after plane 0 */
+		srcPlanes[2]=srcPlanes[1]+strides[1]*ph1;  /* right after plane 1 */
+	}
+
+	return tjCompressFromYUVPlanes(handle, srcPlanes, width, strides, height,
+		subsamp, jpegBuf, jpegSize, jpegQual, flags);
+
+	bailout:
+	return retval;
+}
+
+
 /* Decompressor */
 
 static tjhandle _tjInitDecompress(tjinstance *this)
 {
-	unsigned char buffer[1];
+	static unsigned char buffer[1];
 
 	/* This is also straight out of example.c */
 	this->dinfo.err=jpeg_std_error(&this->jerr.pub);
 	this->jerr.pub.error_exit=my_error_exit;
 	this->jerr.pub.output_message=my_output_message;
+	this->jerr.emit_message=this->jerr.pub.emit_message;
+	this->jerr.pub.emit_message=my_emit_message;
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -875,19 +1256,19 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-	int *jpegSubsamp)
+DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
+	const unsigned char *jpegBuf, unsigned long jpegSize, int *width,
+	int *height, int *jpegSubsamp, int *jpegColorspace)
 {
 	int retval=0;
 
-	getinstance(handle);
+	getdinstance(handle);
 	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressHeader2(): Instance has not been initialized for decompression");
+		_throw("tjDecompressHeader3(): Instance has not been initialized for decompression");
 
 	if(jpegBuf==NULL || jpegSize<=0 || width==NULL || height==NULL
-		|| jpegSubsamp==NULL)
-		_throw("tjDecompressHeader2(): Invalid argument");
+		|| jpegSubsamp==NULL || jpegColorspace==NULL)
+		_throw("tjDecompressHeader3(): Invalid argument");
 
 	if(setjmp(this->jerr.setjmp_buffer))
 	{
@@ -901,18 +1282,39 @@
 	*width=dinfo->image_width;
 	*height=dinfo->image_height;
 	*jpegSubsamp=getSubsamp(dinfo);
+	switch(dinfo->jpeg_color_space)
+	{
+		case JCS_GRAYSCALE:  *jpegColorspace=TJCS_GRAY;  break;
+		case JCS_RGB:        *jpegColorspace=TJCS_RGB;  break;
+		case JCS_YCbCr:      *jpegColorspace=TJCS_YCbCr;  break;
+		case JCS_CMYK:       *jpegColorspace=TJCS_CMYK;  break;
+		case JCS_YCCK:       *jpegColorspace=TJCS_YCCK;  break;
+		default:             *jpegColorspace=-1;  break;
+	}
 
 	jpeg_abort_decompress(dinfo);
 
 	if(*jpegSubsamp<0)
-		_throw("tjDecompressHeader2(): Could not determine subsampling type for JPEG image");
+		_throw("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
+	if(*jpegColorspace<0)
+		_throw("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
 	if(*width<1 || *height<1)
-		_throw("tjDecompressHeader2(): Invalid data returned in header");
+		_throw("tjDecompressHeader3(): Invalid data returned in header");
 
 	bailout:
+	if(this->jerr.warning) retval=-1;
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
+	int *jpegSubsamp)
+{
+	int jpegColorspace;  /* Needed by tjDecompressHeader3(); value is discarded */
+	return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
+		jpegSubsamp, &jpegColorspace);
+}
+
 DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
 	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height)
 {
@@ -936,9 +1338,9 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle, unsigned char *jpegBuf,
-	unsigned long jpegSize, unsigned char *dstBuf, int width, int pitch,
-	int height, int pixelFormat, int flags)
+DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle,
+	const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	int width, int pitch, int height, int pixelFormat, int flags)
 {
 	int i, retval=0;  JSAMPROW *row_pointer=NULL;
 	int jpegwidth, jpegheight, scaledw, scaledh;
@@ -947,7 +1349,7 @@
 	unsigned char *_dstBuf=NULL;  int _pitch=0;
 	#endif
 
-	getinstance(handle);
+	getdinstance(handle);
 	if((this->init&DECOMPRESS)==0)
 		_throw("tjDecompress2(): Instance has not been initialized for decompression");
 
@@ -985,7 +1387,7 @@
 		if(scaledw<=width && scaledh<=height)
 			break;
 	}
-	if(scaledw>width || scaledh>height)
+	if(i>=NUMSF)
 		_throw("tjDecompress2(): Could not scale down to desired image dimensions");
 	width=scaledw;  height=scaledh;
 	dinfo->scale_num=sf[i].num;
@@ -995,7 +1397,7 @@
 	if(pitch==0) pitch=dinfo->output_width*tjPixelSize[pixelFormat];
 
 	#ifndef JCS_EXTENSIONS
-	if(pixelFormat!=TJPF_GRAY &&
+	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK &&
 		(RGB_RED!=tjRedOffset[pixelFormat] ||
 			RGB_GREEN!=tjGreenOffset[pixelFormat] ||
 			RGB_BLUE!=tjBlueOffset[pixelFormat] ||
@@ -1034,6 +1436,7 @@
 	if(rgbBuf) free(rgbBuf);
 	#endif
 	if(row_pointer) free(row_pointer);
+	if(this->jerr.warning) retval=-1;
 	return retval;
 }
 
@@ -1049,16 +1452,270 @@
 }
 
 
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+static int setDecodeDefaults(struct jpeg_decompress_struct *dinfo,
+	int pixelFormat, int subsamp, int flags)
+{
+	int i;
+	/* Describe an 8-bit gray or YCbCr frame in dinfo, as implied by subsamp */
+	dinfo->scale_num=dinfo->scale_denom=1;
+	/* Grayscale has one component; every other subsampling level has three */
+	if(subsamp==TJSAMP_GRAY)
+	{
+		dinfo->num_components=dinfo->comps_in_scan=1;
+		dinfo->jpeg_color_space=JCS_GRAYSCALE;
+	}
+	else
+	{
+		dinfo->num_components=dinfo->comps_in_scan=3;
+		dinfo->jpeg_color_space=JCS_YCbCr;
+	}
+	/* Component descriptors are taken from the JPOOL_IMAGE pool */
+	dinfo->comp_info=(jpeg_component_info *)
+		(*dinfo->mem->alloc_small)((j_common_ptr)dinfo, JPOOL_IMAGE,
+			dinfo->num_components*sizeof(jpeg_component_info));
+	/* Component 0: subsampling factors, table slot 0; others: 1x1, slot 1 */
+	for(i=0; i<dinfo->num_components; i++)
+	{
+		jpeg_component_info *compptr=&dinfo->comp_info[i];
+		compptr->h_samp_factor=(i==0)? tjMCUWidth[subsamp]/8:1;
+		compptr->v_samp_factor=(i==0)? tjMCUHeight[subsamp]/8:1;
+		compptr->component_index=i;
+		compptr->component_id=i+1;
+		compptr->quant_tbl_no=compptr->dc_tbl_no=compptr->ac_tbl_no=
+			(i==0)? 0:1;
+		dinfo->cur_comp_info[i]=compptr;
+	}
+	dinfo->data_precision=8;
+	for(i=0; i<2; i++)  /* make sure quant table slots 0 and 1 exist */
+	{
+		if(dinfo->quant_tbl_ptrs[i]==NULL)
+			dinfo->quant_tbl_ptrs[i]=jpeg_alloc_quant_table((j_common_ptr)dinfo);
+	}
+
+	return 0;  /* NOTE(review): always 0; pixelFormat and flags are unused */
+}
+
+/* File-local stub that tjDecodeYUVPlanes() installs around jpeg_read_header() */
+static int my_read_markers(j_decompress_ptr dinfo)
+{
+	return JPEG_REACHED_SOS;
+}
+/* File-local no-op stub for the marker reader's reset hook (same caller) */
+static void my_reset_marker_reader(j_decompress_ptr dinfo)
+{
+}
+/* Convert a planar YUV image to packed pixels in dstBuf (0=success, -1=error) */
+DLLEXPORT int DLLCALL tjDecodeYUVPlanes(tjhandle handle,
+	const unsigned char **srcPlanes, const int *strides, int subsamp,
+	unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat,
 	int flags)
 {
-	int i, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
-	int cw[MAX_COMPONENTS], ch[MAX_COMPONENTS], iw[MAX_COMPONENTS],
-		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
-	JSAMPLE *_tmpbuf=NULL, *ptr=dstBuf;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+	int i, retval=0;  JSAMPROW *row_pointer=NULL;
+	JSAMPLE *_tmpbuf[MAX_COMPONENTS];
+	JSAMPROW *tmpbuf[MAX_COMPONENTS], *inbuf[MAX_COMPONENTS];
+	int row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
+	JSAMPLE *ptr;
+	jpeg_component_info *compptr;
+	#ifndef JCS_EXTENSIONS
+	unsigned char *rgbBuf=NULL;
+	unsigned char *_dstBuf=NULL;  int _pitch=0;
+	#endif
+	int (*old_read_markers)(j_decompress_ptr);
+	void (*old_reset_marker_reader)(j_decompress_ptr);
 
-	getinstance(handle);
+	getdinstance(handle);
+
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		tmpbuf[i]=NULL;  _tmpbuf[i]=NULL;  inbuf[i]=NULL;
+	}
+
+	if((this->init&DECOMPRESS)==0)
+		_throw("tjDecodeYUVPlanes(): Instance has not been initialized for decompression");
+
+	if(!srcPlanes || !srcPlanes[0] || subsamp<0 || subsamp>=NUMSUBOPT
+		|| dstBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
+		|| pixelFormat>=TJ_NUMPF)
+		_throw("tjDecodeYUVPlanes(): Invalid argument");
+	if(subsamp!=TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+		_throw("tjDecodeYUVPlanes(): Invalid argument");
+
+	if(setjmp(this->jerr.setjmp_buffer))
+	{
+		/* If we get here, the JPEG code has signaled an error. */
+		retval=-1;
+		goto bailout;
+	}
+
+	if(pixelFormat==TJPF_CMYK)
+		_throw("tjDecodeYUVPlanes(): Cannot decode YUV images into CMYK pixels.");
+
+	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
+	dinfo->image_width=width;
+	dinfo->image_height=height;
+
+	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+
+	if(setDecodeDefaults(dinfo, pixelFormat, subsamp, flags)==-1)
+	{
+		retval=-1;  goto bailout;
+	}
+	old_read_markers=dinfo->marker->read_markers;
+	dinfo->marker->read_markers=my_read_markers;
+	old_reset_marker_reader=dinfo->marker->reset_marker_reader;
+	dinfo->marker->reset_marker_reader=my_reset_marker_reader;
+	jpeg_read_header(dinfo, TRUE);  /* stub read_markers reports SOS at once */
+	dinfo->marker->read_markers=old_read_markers;
+	dinfo->marker->reset_marker_reader=old_reset_marker_reader;
+
+	if(setDecompDefaults(dinfo, pixelFormat, flags)==-1)
+	{
+		retval=-1;  goto bailout;
+	}
+	dinfo->do_fancy_upsampling=FALSE;
+	dinfo->Se=DCTSIZE2-1;
+	jinit_master_decompress(dinfo);
+	(*dinfo->upsample->start_pass)(dinfo);
+	/* Luma plane size, PAD()ed to the largest sampling factors */
+	pw0=PAD(width, dinfo->max_h_samp_factor);
+	ph0=PAD(height, dinfo->max_v_samp_factor);
+	/* NOTE(review): pitch==0 was already replaced above, so this looks dead */
+	if(pitch==0) pitch=dinfo->output_width*tjPixelSize[pixelFormat];
+	/* No JCS_EXTENSIONS and a non-native layout: decode to scratch RGB first */
+	#ifndef JCS_EXTENSIONS
+	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK &&
+		(RGB_RED!=tjRedOffset[pixelFormat] ||
+			RGB_GREEN!=tjGreenOffset[pixelFormat] ||
+			RGB_BLUE!=tjBlueOffset[pixelFormat] ||
+			RGB_PIXELSIZE!=tjPixelSize[pixelFormat]))
+	{
+		rgbBuf=(unsigned char *)malloc(width*height*3);
+		if(!rgbBuf) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+		_pitch=pitch;  pitch=width*3;
+		_dstBuf=dstBuf;  dstBuf=rgbBuf;
+	}
+	#endif
+	/* Row pointers into dstBuf; padding rows reuse the last real row */
+	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph0))==NULL)
+		_throw("tjDecodeYUVPlanes(): Memory allocation failure");
+	for(i=0; i<height; i++)
+	{
+		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&dstBuf[(height-i-1)*pitch];
+		else row_pointer[i]=&dstBuf[i*pitch];
+	}
+	if(height<ph0)
+		for(i=height; i<ph0; i++) row_pointer[i]=row_pointer[height-1];
+	/* Per component: 16-byte-aligned scratch rows and source row pointers */
+	for(i=0; i<dinfo->num_components; i++)
+	{
+		compptr=&dinfo->comp_info[i];
+		_tmpbuf[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
+			* compptr->v_samp_factor + 16);
+		if(!_tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
+		if(!tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+		for(row=0; row<compptr->v_samp_factor; row++)
+		{
+			unsigned char *_tmpbuf_aligned=
+				(unsigned char *)PAD((size_t)_tmpbuf[i], 16);
+			tmpbuf[i][row]=&_tmpbuf_aligned[
+				PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
+		}
+		pw[i]=pw0*compptr->h_samp_factor/dinfo->max_h_samp_factor;
+		ph[i]=ph0*compptr->v_samp_factor/dinfo->max_v_samp_factor;
+		inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]);
+		if(!inbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+		ptr=(JSAMPLE *)srcPlanes[i];
+		for(row=0; row<ph[i]; row++)
+		{
+			inbuf[i][row]=ptr;
+			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
+		}
+	}
+	/* Push one row group at a time through the upsampler into dstBuf */
+	for(row=0; row<ph0; row+=dinfo->max_v_samp_factor)
+	{
+		JDIMENSION inrow=0, outrow=0;
+		for(i=0, compptr=dinfo->comp_info; i<dinfo->num_components; i++, compptr++)
+			jcopy_sample_rows(inbuf[i],
+				row*compptr->v_samp_factor/dinfo->max_v_samp_factor, tmpbuf[i], 0,
+				compptr->v_samp_factor, pw[i]);
+		(dinfo->upsample->upsample)(dinfo, tmpbuf, &inrow,
+			dinfo->max_v_samp_factor, &row_pointer[row], &outrow,
+			dinfo->max_v_samp_factor);
+	}
+	jpeg_abort_decompress(dinfo);
+
+	#ifndef JCS_EXTENSIONS
+	fromRGB(rgbBuf, _dstBuf, width, _pitch, height, pixelFormat);
+	#endif
+
+	bailout:
+	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
+	#ifndef JCS_EXTENSIONS
+	if(rgbBuf) free(rgbBuf);
+	#endif
+	if(row_pointer) free(row_pointer);
+	for(i=0; i<MAX_COMPONENTS; i++)
+	{
+		if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
+		if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
+		if(inbuf[i]!=NULL) free(inbuf[i]);
+	}
+	if(this->jerr.warning) retval=-1;
+	return retval;
+}
+/* Decode a packed YUV image by handing its planes to tjDecodeYUVPlanes() */
+DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
+	int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
+	int height, int pixelFormat, int flags)
+{
+	const unsigned char *srcPlanes[3];
+	int pw0, ph0, strides[3], retval=-1;
+	/* pad must be a positive power of 2, as tjDecompressToYUV2() also requires */
+	if(srcBuf==NULL || pad<1 || !isPow2(pad) || subsamp<0 || subsamp>=NUMSUBOPT
+		|| width<=0 || height<=0)
+		_throw("tjDecodeYUV(): Invalid argument");
+	/* Planes sit back to back in srcBuf: Y first, then (unless gray) U and V */
+	pw0=tjPlaneWidth(0, width, subsamp);
+	ph0=tjPlaneHeight(0, height, subsamp);
+	srcPlanes[0]=srcBuf;
+	strides[0]=PAD(pw0, pad);
+	if(subsamp==TJSAMP_GRAY)
+	{
+		strides[1]=strides[2]=0;
+		srcPlanes[1]=srcPlanes[2]=NULL;
+	}
+	else
+	{
+		int pw1=tjPlaneWidth(1, width, subsamp);
+		int ph1=tjPlaneHeight(1, height, subsamp);
+		strides[1]=strides[2]=PAD(pw1, pad);
+		srcPlanes[1]=srcPlanes[0]+strides[0]*ph0;
+		srcPlanes[2]=srcPlanes[1]+strides[1]*ph1;
+	}
+
+	return tjDecodeYUVPlanes(handle, srcPlanes, strides, subsamp, dstBuf, width,
+		pitch, height, pixelFormat, flags);
+
+	bailout:
+	return retval;
+}
+
+DLLEXPORT int DLLCALL tjDecompressToYUVPlanes(tjhandle handle,
+	const unsigned char *jpegBuf, unsigned long jpegSize,
+	unsigned char **dstPlanes, int width, int *strides, int height, int flags)
+{
+	int i, sfi, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
+	int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
+	int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
+	JSAMPLE *_tmpbuf=NULL, *ptr;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+	int dctsize;
+
+	getdinstance(handle);
 
 	for(i=0; i<MAX_COMPONENTS; i++)
 	{
@@ -1066,10 +1723,11 @@
 	}
 
 	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressToYUV(): Instance has not been initialized for decompression");
+		_throw("tjDecompressToYUVPlanes(): Instance has not been initialized for decompression");
 
-	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL)
-		_throw("tjDecompressToYUV(): Invalid argument");
+	if(jpegBuf==NULL || jpegSize<=0 || !dstPlanes || !dstPlanes[0] || width<0
+		|| height<0)
+		_throw("tjDecompressToYUVPlanes(): Invalid argument");
 
 	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
 	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
@@ -1082,39 +1740,73 @@
 		goto bailout;
 	}
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-	jpeg_read_header(dinfo, TRUE);
+	if(!this->headerRead)
+	{
+		jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+		jpeg_read_header(dinfo, TRUE);
+	}
+	this->headerRead=0;
+	jpegSubsamp=getSubsamp(dinfo);
+	if(jpegSubsamp<0)
+		_throw("tjDecompressToYUVPlanes(): Could not determine subsampling type for JPEG image");
+
+	if(jpegSubsamp!=TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+		_throw("tjDecompressToYUVPlanes(): Invalid argument");
+
+	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
+	if(width==0) width=jpegwidth;
+	if(height==0) height=jpegheight;
+	for(i=0; i<NUMSF; i++)
+	{
+		scaledw=TJSCALED(jpegwidth, sf[i]);
+		scaledh=TJSCALED(jpegheight, sf[i]);
+		if(scaledw<=width && scaledh<=height)
+			break;
+	}
+	if(i>=NUMSF)
+		_throw("tjDecompressToYUVPlanes(): Could not scale down to desired image dimensions");
+	if(dinfo->num_components>3)
+		_throw("tjDecompressToYUVPlanes(): JPEG image must have 3 or fewer components");
+
+	width=scaledw;  height=scaledh;
+	dinfo->scale_num=sf[i].num;
+	dinfo->scale_denom=sf[i].denom;
+	sfi=i;
+	jpeg_calc_output_dimensions(dinfo);
+
+	dctsize=DCTSIZE*sf[sfi].num/sf[sfi].denom;
 
 	for(i=0; i<dinfo->num_components; i++)
 	{
 		jpeg_component_info *compptr=&dinfo->comp_info[i];
 		int ih;
-		iw[i]=compptr->width_in_blocks*DCTSIZE;
-		ih=compptr->height_in_blocks*DCTSIZE;
-		cw[i]=PAD(dinfo->image_width, dinfo->max_h_samp_factor)
+		iw[i]=compptr->width_in_blocks*dctsize;
+		ih=compptr->height_in_blocks*dctsize;
+		pw[i]=PAD(dinfo->output_width, dinfo->max_h_samp_factor)
 			*compptr->h_samp_factor/dinfo->max_h_samp_factor;
-		ch[i]=PAD(dinfo->image_height, dinfo->max_v_samp_factor)
+		ph[i]=PAD(dinfo->output_height, dinfo->max_v_samp_factor)
 			*compptr->v_samp_factor/dinfo->max_v_samp_factor;
-		if(iw[i]!=cw[i] || ih!=ch[i]) usetmpbuf=1;
-		th[i]=compptr->v_samp_factor*DCTSIZE;
+		if(iw[i]!=pw[i] || ih!=ph[i]) usetmpbuf=1;
+		th[i]=compptr->v_samp_factor*dctsize;
 		tmpbufsize+=iw[i]*th[i];
-		if((outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ch[i]))==NULL)
-			_throw("tjDecompressToYUV(): Memory allocation failure");
-		for(row=0; row<ch[i]; row++)
+		if((outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]))==NULL)
+			_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+		ptr=dstPlanes[i];
+		for(row=0; row<ph[i]; row++)
 		{
 			outbuf[i][row]=ptr;
-			ptr+=PAD(cw[i], 4);
+			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
 		}
 	}
 	if(usetmpbuf)
 	{
 		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
-			_throw("tjDecompressToYUV(): Memory allocation failure");
+			_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
 		ptr=_tmpbuf;
 		for(i=0; i<dinfo->num_components; i++)
 		{
 			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
-				_throw("tjDecompressToYUV(): Memory allocation failure");
+				_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
 			for(row=0; row<th[i]; row++)
 			{
 				tmpbuf[i][row]=ptr;
@@ -1129,26 +1821,45 @@
 
 	jpeg_start_decompress(dinfo);
 	for(row=0; row<(int)dinfo->output_height;
-		row+=dinfo->max_v_samp_factor*DCTSIZE)
+		row+=dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size)
 	{
 		JSAMPARRAY yuvptr[MAX_COMPONENTS];
 		int crow[MAX_COMPONENTS];
 		for(i=0; i<dinfo->num_components; i++)
 		{
 			jpeg_component_info *compptr=&dinfo->comp_info[i];
+			if(jpegSubsamp==TJ_420)
+			{
+				/* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
+				   to be clever and use the IDCT to perform upsampling on the U and V
+				   planes.  For instance, if the output image is to be scaled by 1/2
+				   relative to the JPEG image, then the scaling factor and upsampling
+				   effectively cancel each other, so a normal 8x8 IDCT can be used.
+				   However, this is not desirable when using the decompress-to-YUV
+				   functionality in TurboJPEG, since we want to output the U and V
+				   planes in their subsampled form.  Thus, we have to override some
+				   internal libjpeg parameters to force it to use the "scaled" IDCT
+				   functions on the U and V planes. */
+				compptr->_DCT_scaled_size=dctsize;
+				compptr->MCU_sample_width=tjMCUWidth[jpegSubsamp]*
+					sf[sfi].num/sf[sfi].denom*
+					compptr->v_samp_factor/dinfo->max_v_samp_factor;
+				dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
+			}
 			crow[i]=row*compptr->v_samp_factor/dinfo->max_v_samp_factor;
 			if(usetmpbuf) yuvptr[i]=tmpbuf[i];
 			else yuvptr[i]=&outbuf[i][crow[i]];
 		}
-		jpeg_read_raw_data(dinfo, yuvptr, dinfo->max_v_samp_factor*DCTSIZE);
+		jpeg_read_raw_data(dinfo, yuvptr,
+			dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size);
 		if(usetmpbuf)
 		{
 			int j;
 			for(i=0; i<dinfo->num_components; i++)
 			{
-				for(j=0; j<min(th[i], ch[i]-crow[i]); j++)
+				for(j=0; j<min(th[i], ph[i]-crow[i]); j++)
 				{
-					memcpy(outbuf[i][crow[i]+j], tmpbuf[i][j], cw[i]);
+					memcpy(outbuf[i][crow[i]+j], tmpbuf[i][j], pw[i]);
 				}
 			}
 		}
@@ -1163,9 +1874,78 @@
 		if(outbuf[i]) free(outbuf[i]);
 	}
 	if(_tmpbuf) free(_tmpbuf);
+	if(this->jerr.warning) retval=-1;
 	return retval;
 }
 
+DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
+	const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	int width, int pad, int height, int flags)
+{
+	unsigned char *dstPlanes[3];
+	int pw0, ph0, strides[3], retval=-1, jpegSubsamp=-1;
+	int i, jpegwidth, jpegheight, scaledw, scaledh;
+	/* Decompress into one packed YUV buffer via tjDecompressToYUVPlanes() */
+	getdinstance(handle);
+
+	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL || width<0 || pad<1
+		|| !isPow2(pad) || height<0)
+		_throw("tjDecompressToYUV2(): Invalid argument");
+
+	/* Header parsing can fail: give the library's error jump a live target */
+	if(setjmp(this->jerr.setjmp_buffer)) return -1;
+	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+	jpeg_read_header(dinfo, TRUE);
+	jpegSubsamp=getSubsamp(dinfo);
+	if(jpegSubsamp<0)
+		_throw("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
+
+	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
+	if(width==0) width=jpegwidth;
+	if(height==0) height=jpegheight;
+
+	for(i=0; i<NUMSF; i++)
+	{
+		scaledw=TJSCALED(jpegwidth, sf[i]);
+		scaledh=TJSCALED(jpegheight, sf[i]);
+		if(scaledw<=width && scaledh<=height) break;
+	}
+	if(i>=NUMSF)
+		_throw("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
+	/* Planes sit back to back in dstBuf: Y first, then (unless gray) U and V */
+	pw0=tjPlaneWidth(0, width, jpegSubsamp);
+	ph0=tjPlaneHeight(0, height, jpegSubsamp);
+	dstPlanes[0]=dstBuf;
+	strides[0]=PAD(pw0, pad);
+	if(jpegSubsamp==TJSAMP_GRAY)
+	{
+		strides[1]=strides[2]=0;
+		dstPlanes[1]=dstPlanes[2]=NULL;
+	}
+	else
+	{
+		int pw1=tjPlaneWidth(1, width, jpegSubsamp);
+		int ph1=tjPlaneHeight(1, height, jpegSubsamp);
+		strides[1]=strides[2]=PAD(pw1, pad);
+		dstPlanes[1]=dstPlanes[0]+strides[0]*ph0;
+		dstPlanes[2]=dstPlanes[1]+strides[1]*ph1;
+	}
+
+	this->headerRead=1;  /* header already parsed; the callee skips it */
+	return tjDecompressToYUVPlanes(handle, jpegBuf, jpegSize, dstPlanes, width,
+		strides, height, flags);
+
+	bailout:
+	return retval;
+}
+/* Legacy API: the JPEG's own size (width=height=0), lines padded to 4 bytes */
+DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+	int flags)
+{
+	return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
+}
+
 
 /* Transformer */
 
@@ -1186,9 +1966,9 @@
 }
 
 
-DLLEXPORT int DLLCALL tjTransform(tjhandle handle, unsigned char *jpegBuf,
-	unsigned long jpegSize, int n, unsigned char **dstBufs,
-	unsigned long *dstSizes, tjtransform *t, int flags)
+DLLEXPORT int DLLCALL tjTransform(tjhandle handle,
+	const unsigned char *jpegBuf, unsigned long jpegSize, int n,
+	unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *t, int flags)
 {
 	jpeg_transform_info *xinfo=NULL;
 	jvirt_barray_ptr *srccoefs, *dstcoefs;
@@ -1336,5 +2116,6 @@
 	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
 	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
 	if(xinfo) free(xinfo);
+	if(this->jerr.warning) retval=-1;
 	return retval;
 }
diff --git a/turbojpeg.h b/turbojpeg.h
index a563c81..583029f 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2013 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2015 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -42,6 +42,35 @@
  * TurboJPEG API.  This API provides an interface for generating, decoding, and
  * transforming planar YUV and JPEG images in memory.
  *
+ * @anchor YUVnotes
+ * YUV Image Format Notes
+ * ----------------------
+ * Technically, the JPEG format uses the YCbCr colorspace (which, strictly
+ * speaking, is a color transform rather than a colorspace), but per the
+ * convention of the digital video community, the TurboJPEG API uses "YUV" to
+ * refer to an image format consisting of Y, Cb, and Cr image planes.
+ *
+ * Each plane is simply a 2D array of bytes, each byte representing the value
+ * of one of the components (Y, Cb, or Cr) at a particular location in the
+ * image.  The width and height of each plane are determined by the image
+ * width, height, and level of chrominance subsampling.  The luminance plane
+ * width is the image width padded to the nearest multiple of the horizontal
+ * subsampling factor (2 in the case of 4:2:0 and 4:2:2, 4 in the case of
+ * 4:1:1, 1 in the case of 4:4:4, 4:4:0, or grayscale.)  Similarly, the
+ * luminance plane height is the image height padded to the nearest multiple
+ * of the vertical subsampling factor (2 in the case of 4:2:0 or 4:4:0, 1
+ * otherwise.)  This is irrespective of any additional padding that may be
+ * specified as an argument to the various YUV functions.  The chrominance
+ * plane width is equal to the luminance plane width divided by the horizontal
+ * subsampling factor, and the chrominance plane height is equal to the
+ * luminance plane height divided by the vertical subsampling factor.
+ *
+ * For example, if the source image is 35 x 35 pixels and 4:2:2 subsampling is
+ * used, then the luminance plane would be 36 x 35 bytes, and each of the
+ * chrominance planes would be 18 x 35 bytes.  If you specify a line padding of
+ * 4 bytes on top of this, then the luminance plane would be 36 x 35 bytes, and
+ * each of the chrominance planes would be 20 x 35 bytes.
+ *
  * @{
  */
 
@@ -49,20 +78,16 @@
 /**
  * The number of chrominance subsampling options
  */
-#define TJ_NUMSAMP 5
+#define TJ_NUMSAMP 6
 
 /**
  * Chrominance subsampling options.
- * When an image is converted from the RGB to the YCbCr colorspace as part of
- * the JPEG compression process, some of the Cb and Cr (chrominance) components
- * can be discarded or averaged together to produce a smaller image with little
- * perceptible loss of image clarity (the human eye is more sensitive to small
- * changes in brightness than small changes in color.)  This is called
- * "chrominance subsampling".
- * <p>
- * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
- * convention of the digital video community, the TurboJPEG API uses "YUV" to
- * refer to an image format consisting of Y, Cb, and Cr image planes.
+ * When pixels are converted from RGB to YCbCr (see #TJCS_YCbCr) or from CMYK
+ * to YCCK (see #TJCS_YCCK) as part of the JPEG compression process, some of
+ * the Cb and Cr (chrominance) components can be discarded or averaged together
+ * to produce a smaller image with little perceptible loss of image clarity
+ * (the human eye is more sensitive to small changes in brightness than to
+ * small changes in color.)  This is called "chrominance subsampling".
  */
 enum TJSAMP
 {
@@ -89,9 +114,22 @@
   /**
    * 4:4:0 chrominance subsampling.  The JPEG or YUV image will contain one
    * chrominance component for every 1x2 block of pixels in the source image.
-   * Note that 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
+   *
+   * @note 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
    */
-  TJSAMP_440
+  TJSAMP_440,
+  /**
+   * 4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 4x1 block of pixels in the source image.
+   * JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+   * horizontal features.
+   *
+   * @note 4:1:1 subsampling is not fully accelerated in libjpeg-turbo.
+   */
+  TJSAMP_411
 };
 
 /**
@@ -100,9 +138,10 @@
  * - 8x8 for no subsampling or grayscale
  * - 16x8 for 4:2:2
  * - 8x16 for 4:4:0
- * - 16x16 for 4:2:0 
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
  */
-static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8};
+static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8, 32};
 
 /**
  * MCU block height (in pixels) for a given level of chrominance subsampling.
@@ -110,15 +149,16 @@
  * - 8x8 for no subsampling or grayscale
  * - 16x8 for 4:2:2
  * - 8x16 for 4:4:0
- * - 16x16 for 4:2:0 
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
  */
-static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16};
+static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16, 8};
 
 
 /**
  * The number of pixel formats
  */
-#define TJ_NUMPF 11
+#define TJ_NUMPF 12
 
 /**
  * Pixel formats
@@ -193,16 +233,33 @@
    * decompressing, the X component is guaranteed to be 0xFF, which can be
    * interpreted as an opaque alpha channel.
    */
-  TJPF_ARGB
+  TJPF_ARGB,
+  /**
+   * CMYK pixel format.  Unlike RGB, which is an additive color model used
+   * primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+   * color model used primarily for printing.  In the CMYK color model, the
+   * value of each color component typically corresponds to an amount of cyan,
+   * magenta, yellow, or black ink that is applied to a white background.  In
+   * order to convert between CMYK and RGB, it is necessary to use a color
+   * management system (CMS.)  A CMS will attempt to map colors within the
+   * printer's gamut to perceptually similar colors in the display's gamut and
+   * vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+   * be defined with a simple formula.  Thus, such a conversion is out of scope
+   * for a codec library.  However, the TurboJPEG API allows for compressing
+   * CMYK pixels into a YCCK JPEG image (see #TJCS_YCCK) and decompressing YCCK
+   * JPEG images into CMYK pixels.
+   */
+  TJPF_CMYK
 };
 
+
 /**
  * Red offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the red component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
  * then the red component will be <tt>pixel[tjRedOffset[TJ_BGRX]]</tt>.
  */
-static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1};
+static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1, -1};
 /**
  * Green offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the green component is offset from the start of the pixel.
@@ -210,19 +267,81 @@
  * <tt>char pixel[]</tt>, then the green component will be
  * <tt>pixel[tjGreenOffset[TJ_BGRX]]</tt>.
  */
-static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2};
+static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, -1};
 /**
  * Blue offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the Blue component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
  * then the blue component will be <tt>pixel[tjBlueOffset[TJ_BGRX]]</tt>.
  */
-static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3};
+static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3, -1};
 
 /**
  * Pixel size (in bytes) for a given pixel format.
  */
-static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4};
+static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4};
+
+
+/**
+ * The number of JPEG colorspaces
+ */
+#define TJ_NUMCS 5
+
+/**
+ * JPEG colorspaces
+ */
+enum TJCS
+{
+  /**
+   * RGB colorspace.  When compressing the JPEG image, the R, G, and B
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  RGB JPEG images can be
+   * decompressed to any of the extended RGB pixel formats or grayscale, but
+   * they cannot be decompressed to YUV images.
+   */
+  TJCS_RGB=0,
+  /**
+   * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+   * mathematical transformation of RGB designed solely for storage and
+   * transmission.  YCbCr images must be converted to RGB before they can
+   * actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+   * component represents the black & white portion of the original image, and
+   * the Cb and Cr (chrominance) components represent the color portion of the
+   * original image.  Originally, the analog equivalent of this transformation
+   * allowed the same signal to drive both black & white and color televisions,
+   * but JPEG images use YCbCr primarily because it allows the color data to be
+   * optionally subsampled for the purposes of reducing bandwidth or disk
+   * space.  YCbCr is the most common JPEG colorspace, and YCbCr JPEG images
+   * can be compressed from and decompressed to any of the extended RGB pixel
+   * formats or grayscale, or they can be decompressed to YUV planar images.
+   */
+  TJCS_YCbCr,
+  /**
+   * Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+   * component), and any color data from the source image is discarded.
+   * Grayscale JPEG images can be compressed from and decompressed to any of
+   * the extended RGB pixel formats or grayscale, or they can be decompressed
+   * to YUV planar images.
+   */
+  TJCS_GRAY,
+  /**
+   * CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  CMYK JPEG images can
+   * only be decompressed to CMYK pixels.
+   */
+  TJCS_CMYK,
+  /**
+   * YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+   * rather a mathematical transformation of CMYK designed solely for storage
+   * and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+   * reversibly transformed into YCCK, and as with YCbCr, the chrominance
+   * components in the YCCK pixels can be subsampled without incurring major
+   * perceptual loss.  YCCK JPEG images can only be compressed from and
+   * decompressed to CMYK pixels.
+   */
+  TJCS_YCCK
+};
 
 
 /**
@@ -231,26 +350,6 @@
  */
 #define TJFLAG_BOTTOMUP        2
 /**
- * Turn off CPU auto-detection and force TurboJPEG to use MMX code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCEMMX        8
-/**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCESSE       16
-/**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE2 code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCESSE2      32
-/**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE3 code (if the
- * underlying codec supports it.)
- */
-#define TJFLAG_FORCESSE3     128
-/**
  * When decompressing an image that was compressed using chrominance
  * subsampling, use the fastest chrominance upsampling algorithm available in
  * the underlying codec.  The default is to use smooth upsampling, which
@@ -449,26 +548,29 @@
    * to be applied in the frequency domain.
    *
    * @param coeffs pointer to an array of transformed DCT coefficients.  (NOTE:
-   *        this pointer is not guaranteed to be valid once the callback
-   *        returns, so applications wishing to hand off the DCT coefficients
-   *        to another function or library should make a copy of them within
-   *        the body of the callback.)
+   * this pointer is not guaranteed to be valid once the callback returns, so
+   * applications wishing to hand off the DCT coefficients to another function
+   * or library should make a copy of them within the body of the callback.)
+   *
    * @param arrayRegion #tjregion structure containing the width and height of
-   *        the array pointed to by <tt>coeffs</tt> as well as its offset
-   *        relative to the component plane.  TurboJPEG implementations may
-   *        choose to split each component plane into multiple DCT coefficient
-   *        arrays and call the callback function once for each array.
+   * the array pointed to by <tt>coeffs</tt> as well as its offset relative to
+   * the component plane.  TurboJPEG implementations may choose to split each
+   * component plane into multiple DCT coefficient arrays and call the callback
+   * function once for each array.
+   *
    * @param planeRegion #tjregion structure containing the width and height of
-   *        the component plane to which <tt>coeffs</tt> belongs
+   * the component plane to which <tt>coeffs</tt> belongs
+   *
    * @param componentID ID number of the component plane to which
-   *        <tt>coeffs</tt> belongs (Y, Cb, and Cr have, respectively, ID's of
-   *        0, 1, and 2 in typical JPEG images.)
+   * <tt>coeffs</tt> belongs (Y, Cb, and Cr have, respectively, ID's of 0, 1,
+   * and 2 in typical JPEG images.)
+   *
    * @param transformID ID number of the transformed image to which
-   *        <tt>coeffs</tt> belongs.  This is the same as the index of the
-   *        transform in the <tt>transforms</tt> array that was passed to
-   *        #tjTransform().
+   * <tt>coeffs</tt> belongs.  This is the same as the index of the transform
+   * in the <tt>transforms</tt> array that was passed to #tjTransform().
+   *
    * @param transform a pointer to a #tjtransform structure that specifies the
-   *        parameters and/or cropping region for this transform
+   * parameters and/or cropping region for this transform
    *
    * @return 0 if the callback was successful, or -1 if an error occurred.
    */
@@ -491,7 +593,7 @@
 /**
  * Compute the scaled value of <tt>dimension</tt> using the given scaling
  * factor.  This macro performs the integer equivalent of <tt>ceil(dimension *
- * scalingFactor)</tt>. 
+ * scalingFactor)</tt>.
  */
 #define TJSCALED(dimension, scalingFactor) ((dimension * scalingFactor.num \
   + scalingFactor.denom - 1) / scalingFactor.denom)
@@ -512,58 +614,205 @@
 
 
 /**
- * Compress an RGB or grayscale image into a JPEG image.
+ * Compress an RGB, grayscale, or CMYK image into a JPEG image.
  *
  * @param handle a handle to a TurboJPEG compressor or transformer instance
- * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
- *        to be compressed
+ *
+ * @param srcBuf pointer to an image buffer containing RGB, grayscale, or
+ * CMYK pixels to be compressed
+ *
  * @param width width (in pixels) of the source image
- * @param pitch bytes per line of the source image.  Normally, this should be
- *        <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded,
- *        or <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line of
- *        the image is padded to the nearest 32-bit boundary, as is the case
- *        for Windows bitmaps.  You can also be clever and use this parameter
- *        to skip lines, etc.  Setting this parameter to 0 is the equivalent of
- *        setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param pitch bytes per line in the source image.  Normally, this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded, or
+ * <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line of the image
+ * is padded to the nearest 32-bit boundary, as is the case for Windows
+ * bitmaps.  You can also be clever and use this parameter to skip lines, etc.
+ * Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
  * @param height height (in pixels) of the source image
+ *
  * @param pixelFormat pixel format of the source image (see @ref TJPF
- *        "Pixel formats".)
+ * "Pixel formats".)
+ *
  * @param jpegBuf address of a pointer to an image buffer that will receive the
- *        JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer
- *        to accommodate the size of the JPEG image.  Thus, you can choose to:
- *        -# pre-allocate the JPEG buffer with an arbitrary size using
- *        #tjAlloc() and let TurboJPEG grow the buffer as needed,
- *        -# set <tt>*jpegBuf</tt> to NULL to tell TurboJPEG to allocate the
- *        buffer for you, or
- *        -# pre-allocate the buffer to a "worst case" size determined by
- *        calling #tjBufSize().  This should ensure that the buffer never has
- *        to be re-allocated (setting #TJFLAG_NOREALLOC guarantees this.)
- *        .
- *        If you choose option 1, <tt>*jpegSize</tt> should be set to the
- *        size of your pre-allocated buffer.  In any case, unless you have
- *        set #TJFLAG_NOREALLOC, you should always check <tt>*jpegBuf</tt> upon
- *        return from this function, as it may have changed.
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer
+ * to accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tjAlloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set <tt>*jpegBuf</tt> to NULL to tell TurboJPEG to allocate the buffer
+ * for you, or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tjBufSize().  This should ensure that the buffer never has to be
+ * re-allocated (setting #TJFLAG_NOREALLOC guarantees this.)
+ * .
+ * If you choose option 1, <tt>*jpegSize</tt> should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJFLAG_NOREALLOC,
+ * you should always check <tt>*jpegBuf</tt> upon return from this function, as
+ * it may have changed.
+ *
  * @param jpegSize pointer to an unsigned long variable that holds the size of
- *        the JPEG image buffer.  If <tt>*jpegBuf</tt> points to a
- *        pre-allocated buffer, then <tt>*jpegSize</tt> should be set to the
- *        size of the buffer.  Upon return, <tt>*jpegSize</tt> will contain the
- *        size of the JPEG image (in bytes.)
+ * the JPEG image buffer.  If <tt>*jpegBuf</tt> points to a pre-allocated
+ * buffer, then <tt>*jpegSize</tt> should be set to the size of the buffer.
+ * Upon return, <tt>*jpegSize</tt> will contain the size of the JPEG image (in
+ * bytes.)  If <tt>*jpegBuf</tt> points to a JPEG image buffer that is being
+ * reused from a previous call to one of the JPEG compression functions, then
+ * <tt>*jpegSize</tt> is ignored.
+ *
  * @param jpegSubsamp the level of chrominance subsampling to be used when
- *        generating the JPEG image (see @ref TJSAMP
- *        "Chrominance subsampling options".)
+ * generating the JPEG image (see @ref TJSAMP
+ * "Chrominance subsampling options".)
+ *
  * @param jpegQual the image quality of the generated JPEG image (1 = worst,
-          100 = best)
+ * 100 = best)
+ *
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
- *        "flags".
+ * "flags"
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, unsigned char *srcBuf,
+DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, const unsigned char *srcBuf,
   int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf,
   unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags);
 
 
 /**
+ * Compress a YUV planar image into a JPEG image.
+ *
+ * @param handle a handle to a TurboJPEG compressor or transformer instance
+ *
+ * @param srcBuf pointer to an image buffer containing a YUV planar image to be
+ * compressed.  The size of this buffer should match the value returned by
+ * #tjBufSizeYUV2() for the given image width, height, padding, and level of
+ * chrominance subsampling.  The Y, U (Cb), and V (Cr) image planes should be
+ * stored sequentially in the source buffer (refer to @ref YUVnotes
+ * "YUV Image Format Notes".)
+ *
+ * @param width width (in pixels) of the source image.  If the width is not a
+ * multiple of the MCU block width (see #tjMCUWidth), then an intermediate
+ * buffer copy will be performed within TurboJPEG.
+ *
+ * @param pad the line padding used in the source image.  For instance, if each
+ * line in each plane of the YUV image is padded to the nearest multiple of 4
+ * bytes, then <tt>pad</tt> should be set to 4.
+ *
+ * @param height height (in pixels) of the source image.  If the height is not
+ * a multiple of the MCU block height (see #tjMCUHeight), then an
+ * intermediate buffer copy will be performed within TurboJPEG.
+ *
+ * @param subsamp the level of chrominance subsampling used in the source
+ * image (see @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @param jpegBuf address of a pointer to an image buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tjAlloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set <tt>*jpegBuf</tt> to NULL to tell TurboJPEG to allocate the buffer
+ * for you, or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tjBufSize().  This should ensure that the buffer never has to be
+ * re-allocated (setting #TJFLAG_NOREALLOC guarantees this.)
+ * .
+ * If you choose option 1, <tt>*jpegSize</tt> should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJFLAG_NOREALLOC,
+ * you should always check <tt>*jpegBuf</tt> upon return from this function, as
+ * it may have changed.
+ *
+ * @param jpegSize pointer to an unsigned long variable that holds the size of
+ * the JPEG image buffer.  If <tt>*jpegBuf</tt> points to a pre-allocated
+ * buffer, then <tt>*jpegSize</tt> should be set to the size of the buffer.
+ * Upon return, <tt>*jpegSize</tt> will contain the size of the JPEG image (in
+ * bytes.)  If <tt>*jpegBuf</tt> points to a JPEG image buffer that is being
+ * reused from a previous call to one of the JPEG compression functions, then
+ * <tt>*jpegSize</tt> is ignored.
+ *
+ * @param jpegQual the image quality of the generated JPEG image (1 = worst,
+ * 100 = best)
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags"
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle,
+  const unsigned char *srcBuf, int width, int pad, int height, int subsamp,
+  unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags);
+
+
+/**
+ * Compress a set of Y, U (Cb), and V (Cr) image planes into a JPEG image.
+ *
+ * @param handle a handle to a TurboJPEG compressor or transformer instance
+ *
+ * @param srcPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if compressing a grayscale image) that contain a YUV
+ * image to be compressed.  These planes can be contiguous or non-contiguous in
+ * memory.  The size of each plane should match the value returned by
+ * #tjPlaneSizeYUV() for the given image width, height, strides, and level of
+ * chrominance subsampling.  Refer to @ref YUVnotes "YUV Image Format Notes"
+ * for more details.
+ *
+ * @param width width (in pixels) of the source image.  If the width is not a
+ * multiple of the MCU block width (see #tjMCUWidth), then an intermediate
+ * buffer copy will be performed within TurboJPEG.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * line in the corresponding plane of the YUV source image.  Setting the stride
+ * for any plane to 0 is the same as setting it to the plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If <tt>strides</tt> is NULL, then
+ * the strides for all planes will be set to their respective plane widths.
+ * You can adjust the strides in order to specify an arbitrary amount of line
+ * padding in each plane or to create a JPEG image from a subregion of a larger
+ * YUV planar image.
+ *
+ * @param height height (in pixels) of the source image.  If the height is not
+ * a multiple of the MCU block height (see #tjMCUHeight), then an
+ * intermediate buffer copy will be performed within TurboJPEG.
+ *
+ * @param subsamp the level of chrominance subsampling used in the source
+ * image (see @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @param jpegBuf address of a pointer to an image buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tjAlloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set <tt>*jpegBuf</tt> to NULL to tell TurboJPEG to allocate the buffer
+ * for you, or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tjBufSize().  This should ensure that the buffer never has to be
+ * re-allocated (setting #TJFLAG_NOREALLOC guarantees this.)
+ * .
+ * If you choose option 1, <tt>*jpegSize</tt> should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJFLAG_NOREALLOC,
+ * you should always check <tt>*jpegBuf</tt> upon return from this function, as
+ * it may have changed.
+ *
+ * @param jpegSize pointer to an unsigned long variable that holds the size of
+ * the JPEG image buffer.  If <tt>*jpegBuf</tt> points to a pre-allocated
+ * buffer, then <tt>*jpegSize</tt> should be set to the size of the buffer.
+ * Upon return, <tt>*jpegSize</tt> will contain the size of the JPEG image (in
+ * bytes.)  If <tt>*jpegBuf</tt> points to a JPEG image buffer that is being
+ * reused from a previous call to one of the JPEG compression functions, then
+ * <tt>*jpegSize</tt> is ignored.
+ *
+ * @param jpegQual the image quality of the generated JPEG image (1 = worst,
+ * 100 = best)
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags"
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT int DLLCALL tjCompressFromYUVPlanes(tjhandle handle,
+  const unsigned char **srcPlanes, int width, const int *strides, int height,
+  int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual,
+  int flags);
+
+
+/**
  * The maximum size of the buffer (in bytes) required to hold a JPEG image with
  * the given parameters.  The number of bytes returned by this function is
  * larger than the size of the uncompressed source image.  The reason for this
@@ -574,11 +823,13 @@
  * size of a JPEG image prior to compression, the corner case has to be
  * handled.
  *
- * @param width width of the image (in pixels)
- * @param height height of the image (in pixels)
+ * @param width width (in pixels) of the image
+ *
+ * @param height height (in pixels) of the image
+ *
  * @param jpegSubsamp the level of chrominance subsampling to be used when
- *        generating the JPEG image (see @ref TJSAMP
- *        "Chrominance subsampling options".)
+ * generating the JPEG image (see @ref TJSAMP
+ * "Chrominance subsampling options".)
  *
  * @return the maximum size of the buffer (in bytes) required to hold the
  * image, or -1 if the arguments are out of bounds.
@@ -591,64 +842,192 @@
  * The size of the buffer (in bytes) required to hold a YUV planar image with
  * the given parameters.
  *
- * @param width width of the image (in pixels)
- * @param height height of the image (in pixels)
+ * @param width width (in pixels) of the image
+ *
+ * @param pad the width of each line in each plane of the image is padded to
+ * the nearest multiple of this number of bytes (must be a power of 2.)
+ *
+ * @param height height (in pixels) of the image
+ *
  * @param subsamp level of chrominance subsampling in the image (see
- *        @ref TJSAMP "Chrominance subsampling options".)
+ * @ref TJSAMP "Chrominance subsampling options".)
  *
  * @return the size of the buffer (in bytes) required to hold the image, or
  * -1 if the arguments are out of bounds.
  */
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
   int subsamp);
 
 
 /**
+ * The size of the buffer (in bytes) required to hold a YUV image plane with
+ * the given parameters.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param width width (in pixels) of the YUV image.  NOTE: this is the width of
+ * the whole image, not the plane width.
+ *
+ * @param stride bytes per line in the image plane.  Setting this to 0 is the
+ * equivalent of setting it to the plane width.
+ *
+ * @param height height (in pixels) of the YUV image.  NOTE: this is the height
+ * of the whole image, not the plane height.
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the size of the buffer (in bytes) required to hold the YUV image
+ * plane, or (unsigned long)-1 if the arguments are out of bounds.
+ */
+DLLEXPORT unsigned long DLLCALL tjPlaneSizeYUV(int componentID, int width,
+  int stride, int height, int subsamp);
+
+
+/**
+ * The plane width of a YUV image plane with the given parameters.  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for a description of plane width.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param width width (in pixels) of the YUV image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the plane width of a YUV image plane with the given parameters, or
+ * -1 if the arguments are out of bounds.
+ */
+DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp);
+
+
+/**
+ * The plane height of a YUV image plane with the given parameters.  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for a description of plane height.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param height height (in pixels) of the YUV image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the plane height of a YUV image plane with the given parameters, or
+ * -1 if the arguments are out of bounds.
+ */
+DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp);
+
+
+/**
  * Encode an RGB or grayscale image into a YUV planar image.  This function
- * uses the accelerated color conversion routines in TurboJPEG's underlying
- * codec to produce a planar YUV image that is suitable for X Video.
- * Specifically, if the chrominance components are subsampled along the
- * horizontal dimension, then the width of the luminance plane is padded to the
- * nearest multiple of 2 in the output image (same goes for the height of the
- * luminance plane, if the chrominance components are subsampled along the
- * vertical dimension.)  Also, each line of each plane in the output image is
- * padded to 4 bytes.  Although this will work with any subsampling option, it
- * is really only useful in combination with TJ_420, which produces an image
- * compatible with the I420 (AKA "YUV420P") format.
- * <p>
- * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
- * convention of the digital video community, the TurboJPEG API uses "YUV" to
- * refer to an image format consisting of Y, Cb, and Cr image planes.
+ * uses the accelerated color conversion routines in the underlying
+ * codec but does not execute any of the other steps in the JPEG compression
+ * process.
  *
  * @param handle a handle to a TurboJPEG compressor or transformer instance
+ *
  * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
- *        to be encoded
+ * to be encoded
+ *
  * @param width width (in pixels) of the source image
- * @param pitch bytes per line of the source image.  Normally, this should be
- *        <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded,
- *        or <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line of
- *        the image is padded to the nearest 32-bit boundary, as is the case
- *        for Windows bitmaps.  You can also be clever and use this parameter
- *        to skip lines, etc.  Setting this parameter to 0 is the equivalent of
- *        setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param pitch bytes per line in the source image.  Normally, this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded, or
+ * <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line of the image
+ * is padded to the nearest 32-bit boundary, as is the case for Windows
+ * bitmaps.  You can also be clever and use this parameter to skip lines, etc.
+ * Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
  * @param height height (in pixels) of the source image
+ *
  * @param pixelFormat pixel format of the source image (see @ref TJPF
- *        "Pixel formats".)
+ * "Pixel formats".)
+ *
  * @param dstBuf pointer to an image buffer that will receive the YUV image.
- *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
- *        based on the image width, height, and level of chrominance
- *        subsampling.
+ * Use #tjBufSizeYUV2() to determine the appropriate size for this buffer based
+ * on the image width, height, padding, and level of chrominance subsampling.
+ * The Y, U (Cb), and V (Cr) image planes will be stored sequentially in the
+ * buffer (refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param pad the width of each line in each plane of the YUV image will be
+ * padded to the nearest multiple of this number of bytes (must be a power of
+ * 2.)  To generate images suitable for X Video, <tt>pad</tt> should be set to
+ * 4.
+ *
  * @param subsamp the level of chrominance subsampling to be used when
- *        generating the YUV image (see @ref TJSAMP
- *        "Chrominance subsampling options".)
+ * generating the YUV image (see @ref TJSAMP
+ * "Chrominance subsampling options".)  To generate images suitable for X
+ * Video, <tt>subsamp</tt> should be set to @ref TJSAMP_420.  This produces an
+ * image compatible with the I420 (AKA "YUV420P") format.
+ *
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
- *        "flags".
+ * "flags"
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
-  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
-  unsigned char *dstBuf, int subsamp, int flags);
+DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
+  const unsigned char *srcBuf, int width, int pitch, int height,
+  int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags);
+
+
+/**
+ * Encode an RGB or grayscale image into separate Y, U (Cb), and V (Cr) image
+ * planes.  This function uses the accelerated color conversion routines in the
+ * underlying codec but does not execute any of the other steps in the JPEG
+ * compression process.
+ *
+ * @param handle a handle to a TurboJPEG compressor or transformer instance
+ *
+ * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
+ * to be encoded
+ *
+ * @param width width (in pixels) of the source image
+ *
+ * @param pitch bytes per line in the source image.  Normally, this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded, or
+ * <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line of the image
+ * is padded to the nearest 32-bit boundary, as is the case for Windows
+ * bitmaps.  You can also be clever and use this parameter to skip lines, etc.
+ * Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param height height (in pixels) of the source image
+ *
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param dstPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if generating a grayscale image) that will receive the
+ * encoded image.  These planes can be contiguous or non-contiguous in memory.
+ * Use #tjPlaneSizeYUV() to determine the appropriate size for each plane based
+ * on the image width, height, strides, and level of chrominance subsampling.
+ * Refer to @ref YUVnotes "YUV Image Format Notes" for more details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * line in the corresponding plane of the output image.  Setting the stride for
+ * any plane to 0 is the same as setting it to the plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If <tt>strides</tt> is NULL, then
+ * the strides for all planes will be set to their respective plane widths.
+ * You can adjust the strides in order to add an arbitrary amount of line
+ * padding to each plane or to encode an RGB or grayscale image into a
+ * subregion of a larger YUV planar image.
+ *
+ * @param subsamp the level of chrominance subsampling to be used when
+ * generating the YUV image (see @ref TJSAMP
+ * "Chrominance subsampling options".)  To generate images suitable for X
+ * Video, <tt>subsamp</tt> should be set to @ref TJSAMP_420.  This produces an
+ * image compatible with the I420 (AKA "YUV420P") format.
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags"
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle,
+  const unsigned char *srcBuf, int width, int pitch, int height,
+  int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp,
+  int flags);
 
 
 /**
@@ -664,21 +1043,30 @@
  * Retrieve information about a JPEG image without decompressing it.
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ *
  * @param jpegBuf pointer to a buffer containing a JPEG image
+ *
  * @param jpegSize size of the JPEG image (in bytes)
+ *
  * @param width pointer to an integer variable that will receive the width (in
- *        pixels) of the JPEG image
+ * pixels) of the JPEG image
+ *
  * @param height pointer to an integer variable that will receive the height
- *        (in pixels) of the JPEG image
+ * (in pixels) of the JPEG image
+ *
  * @param jpegSubsamp pointer to an integer variable that will receive the
- *        level of chrominance subsampling used when compressing the JPEG image
- *        (see @ref TJSAMP "Chrominance subsampling options".)
+ * level of chrominance subsampling used when the JPEG image was compressed
+ * (see @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @param jpegColorspace pointer to an integer variable that will receive one
+ * of the JPEG colorspace constants, indicating the colorspace of the JPEG
+ * image (see @ref TJCS "JPEG colorspaces".)
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
 */
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-  int *jpegSubsamp);
+DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
+  const unsigned char *jpegBuf, unsigned long jpegSize, int *width,
+  int *height, int *jpegSubsamp, int *jpegColorspace);
 
 
 /**
@@ -686,7 +1074,7 @@
  * this implementation of TurboJPEG supports.
  *
  * @param numscalingfactors pointer to an integer variable that will receive
- *        the number of elements in the list
+ * the number of elements in the list
  *
  * @return a pointer to a list of fractional scaling factors, or NULL if an
  * error is encountered (see #tjGetErrorStr().)
@@ -695,79 +1083,272 @@
 
 
 /**
- * Decompress a JPEG image to an RGB or grayscale image.
+ * Decompress a JPEG image to an RGB, grayscale, or CMYK image.
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ *
  * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
+ *
  * @param jpegSize size of the JPEG image (in bytes)
+ *
  * @param dstBuf pointer to an image buffer that will receive the decompressed
- *        image.  This buffer should normally be <tt>pitch * scaledHeight</tt>
- *        bytes in size, where <tt>scaledHeight</tt> can be determined by
- *        calling #TJSCALED() with the JPEG image height and one of the scaling
- *        factors returned by #tjGetScalingFactors().  The <tt>dstBuf</tt>
- *        pointer may also be used to decompress into a specific region of a
- *        larger buffer.
+ * image.  This buffer should normally be <tt>pitch * scaledHeight</tt> bytes
+ * in size, where <tt>scaledHeight</tt> can be determined by calling
+ * #TJSCALED() with the JPEG image height and one of the scaling factors
+ * returned by #tjGetScalingFactors().  The <tt>dstBuf</tt> pointer may also be
+ * used to decompress into a specific region of a larger buffer.
+ *
  * @param width desired width (in pixels) of the destination image.  If this is
- *        different than the width of the JPEG image being decompressed, then
- *        TurboJPEG will use scaling in the JPEG decompressor to generate the
- *        largest possible image that will fit within the desired width.  If
- *        <tt>width</tt> is set to 0, then only the height will be considered
- *        when determining the scaled image size.
- * @param pitch bytes per line of the destination image.  Normally, this is
- *        <tt>scaledWidth * #tjPixelSize[pixelFormat]</tt> if the decompressed
- *        image is unpadded, else <tt>#TJPAD(scaledWidth *
- *        #tjPixelSize[pixelFormat])</tt> if each line of the decompressed
- *        image is padded to the nearest 32-bit boundary, as is the case for
- *        Windows bitmaps.  (NOTE: <tt>scaledWidth</tt> can be determined by
- *        calling #TJSCALED() with the JPEG image width and one of the scaling
- *        factors returned by #tjGetScalingFactors().)  You can also be clever
- *        and use the pitch parameter to skip lines, etc.  Setting this
- *        parameter to 0 is the equivalent of setting it to <tt>scaledWidth
- *        * #tjPixelSize[pixelFormat]</tt>.
+ * different than the width of the JPEG image being decompressed, then
+ * TurboJPEG will use scaling in the JPEG decompressor to generate the largest
+ * possible image that will fit within the desired width.  If <tt>width</tt> is
+ * set to 0, then only the height will be considered when determining the
+ * scaled image size.
+ *
+ * @param pitch bytes per line in the destination image.  Normally, this is
+ * <tt>scaledWidth * #tjPixelSize[pixelFormat]</tt> if the decompressed image
+ * is unpadded, else <tt>#TJPAD(scaledWidth * #tjPixelSize[pixelFormat])</tt>
+ * if each line of the decompressed image is padded to the nearest 32-bit
+ * boundary, as is the case for Windows bitmaps.  (NOTE: <tt>scaledWidth</tt>
+ * can be determined by calling #TJSCALED() with the JPEG image width and one
+ * of the scaling factors returned by #tjGetScalingFactors().)  You can also be
+ * clever and use the pitch parameter to skip lines, etc.  Setting this
+ * parameter to 0 is the equivalent of setting it to
+ * <tt>scaledWidth * #tjPixelSize[pixelFormat]</tt>.
+ *
  * @param height desired height (in pixels) of the destination image.  If this
- *        is different than the height of the JPEG image being decompressed,
- *        then TurboJPEG will use scaling in the JPEG decompressor to generate
- *        the largest possible image that will fit within the desired height.
- *        If <tt>height</tt> is set to 0, then only the width will be
- *        considered when determining the scaled image size.
+ * is different than the height of the JPEG image being decompressed, then
+ * TurboJPEG will use scaling in the JPEG decompressor to generate the largest
+ * possible image that will fit within the desired height.  If <tt>height</tt>
+ * is set to 0, then only the width will be considered when determining the
+ * scaled image size.
+ *
  * @param pixelFormat pixel format of the destination image (see @ref
- *        TJPF "Pixel formats".)
+ * TJPF "Pixel formats".)
+ *
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
- *        "flags".
+ * "flags"
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
 DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
   int width, int pitch, int height, int pixelFormat, int flags);
 
 
 /**
  * Decompress a JPEG image to a YUV planar image.  This function performs JPEG
  * decompression but leaves out the color conversion step, so a planar YUV
- * image is generated instead of an RGB image.  The padding of the planes in
- * this image is the same as in the images generated by #tjEncodeYUV2().  Note
- * that, if the width or height of the image is not an even multiple of the MCU
- * block size (see #tjMCUWidth and #tjMCUHeight), then an intermediate buffer
- * copy will be performed within TurboJPEG.
- * <p>
- * NOTE: Technically, the JPEG format uses the YCbCr colorspace, but per the
- * convention of the digital video community, the TurboJPEG API uses "YUV" to
- * refer to an image format consisting of Y, Cb, and Cr image planes.
+ * image is generated instead of an RGB image.
  *
  * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ *
  * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
+ *
  * @param jpegSize size of the JPEG image (in bytes)
+ *
  * @param dstBuf pointer to an image buffer that will receive the YUV image.
- *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
- *        based on the image width, height, and level of subsampling.
+ * Use #tjBufSizeYUV2() to determine the appropriate size for this buffer based
+ * on the image width, height, padding, and level of subsampling.  The Y,
+ * U (Cb), and V (Cr) image planes will be stored sequentially in the buffer
+ * (refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param width desired width (in pixels) of the YUV image.  If this is
+ * different than the width of the JPEG image being decompressed, then
+ * TurboJPEG will use scaling in the JPEG decompressor to generate the largest
+ * possible image that will fit within the desired width.  If <tt>width</tt> is
+ * set to 0, then only the height will be considered when determining the
+ * scaled image size.  If the scaled width is not an exact multiple of the MCU
+ * block width (see #tjMCUWidth), then an intermediate buffer copy will be
+ * performed within TurboJPEG.
+ *
+ * @param pad the width of each line in each plane of the YUV image will be
+ * padded to the nearest multiple of this number of bytes (must be a power of
+ * 2.)  To generate images suitable for X Video, <tt>pad</tt> should be set to
+ * 4.
+ *
+ * @param height desired height (in pixels) of the YUV image.  If this is
+ * different than the height of the JPEG image being decompressed, then
+ * TurboJPEG will use scaling in the JPEG decompressor to generate the largest
+ * possible image that will fit within the desired height.  If <tt>height</tt>
+ * is set to 0, then only the width will be considered when determining the
+ * scaled image size.  If the scaled height is not an exact multiple of the MCU
+ * block height (see #tjMCUHeight), then an intermediate buffer copy will be
+ * performed within TurboJPEG.
+ *
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
- *        "flags".
+ * "flags"
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
+  const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  int width, int pad, int height, int flags);
+
+
+/**
+ * Decompress a JPEG image into separate Y, U (Cb), and V (Cr) image
+ * planes.  This function performs JPEG decompression but leaves out the color
+ * conversion step, so a planar YUV image is generated instead of an RGB image.
+ *
+ * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ *
+ * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
+ *
+ * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param dstPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if decompressing a grayscale image) that will receive
+ * the YUV image.  These planes can be contiguous or non-contiguous in memory.
+ * Use #tjPlaneSizeYUV() to determine the appropriate size for each plane based
+ * on the scaled image width, scaled image height, strides, and level of
+ * chrominance subsampling.  Refer to @ref YUVnotes "YUV Image Format Notes"
+ * for more details.
+ *
+ * @param width desired width (in pixels) of the YUV image.  If this is
+ * different than the width of the JPEG image being decompressed, then
+ * TurboJPEG will use scaling in the JPEG decompressor to generate the largest
+ * possible image that will fit within the desired width.  If <tt>width</tt> is
+ * set to 0, then only the height will be considered when determining the
+ * scaled image size.  If the scaled width is not an exact multiple of the MCU
+ * block width (see #tjMCUWidth), then an intermediate buffer copy will be
+ * performed within TurboJPEG.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * line in the corresponding plane of the output image.  Setting the stride for
+ * any plane to 0 is the same as setting it to the scaled plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If <tt>strides</tt> is NULL, then
+ * the strides for all planes will be set to their respective scaled plane
+ * widths.  You can adjust the strides in order to add an arbitrary amount of
+ * line padding to each plane or to decompress the JPEG image into a subregion
+ * of a larger YUV planar image.
+ *
+ * @param height desired height (in pixels) of the YUV image.  If this is
+ * different than the height of the JPEG image being decompressed, then
+ * TurboJPEG will use scaling in the JPEG decompressor to generate the largest
+ * possible image that will fit within the desired height.  If <tt>height</tt>
+ * is set to 0, then only the width will be considered when determining the
+ * scaled image size.  If the scaled height is not an exact multiple of the MCU
+ * block height (see #tjMCUHeight), then an intermediate buffer copy will be
+ * performed within TurboJPEG.
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags"
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjDecompressToYUVPlanes(tjhandle handle,
+  const unsigned char *jpegBuf, unsigned long jpegSize,
+  unsigned char **dstPlanes, int width, int *strides, int height, int flags);
+
+
+/**
+ * Decode a YUV planar image into an RGB or grayscale image.  This function
+ * uses the accelerated color conversion routines in the underlying
+ * codec but does not execute any of the other steps in the JPEG decompression
+ * process.
+ *
+ * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ *
+ * @param srcBuf pointer to an image buffer containing a YUV planar image to be
+ * decoded.  The size of this buffer should match the value returned by
+ * #tjBufSizeYUV2() for the given image width, height, padding, and level of
+ * chrominance subsampling.  The Y, U (Cb), and V (Cr) image planes should be
+ * stored sequentially in the source buffer (refer to @ref YUVnotes
+ * "YUV Image Format Notes".)
+ *
+ * @param pad Use this parameter to specify that the width of each line in each
+ * plane of the YUV source image is padded to the nearest multiple of this
+ * number of bytes (must be a power of 2.)
+ *
+ * @param subsamp the level of chrominance subsampling used in the YUV source
+ * image (see @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @param dstBuf pointer to an image buffer that will receive the decoded
+ * image.  This buffer should normally be <tt>pitch * height</tt> bytes in
+ * size, but the <tt>dstBuf</tt> pointer can also be used to decode into a
+ * specific region of a larger buffer.
+ *
+ * @param width width (in pixels) of the source and destination images
+ *
+ * @param pitch bytes per line in the destination image.  Normally, this should
+ * be <tt>width * #tjPixelSize[pixelFormat]</tt> if the destination image is
+ * unpadded, or <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line
+ * of the destination image should be padded to the nearest 32-bit boundary, as
+ * is the case for Windows bitmaps.  You can also be clever and use the pitch
+ * parameter to skip lines, etc.  Setting this parameter to 0 is the equivalent
+ * of setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param height height (in pixels) of the source and destination images
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags"
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
+  int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
+  int height, int pixelFormat, int flags);
+
+
+/**
+ * Decode a set of Y, U (Cb), and V (Cr) image planes into an RGB or grayscale
+ * image.  This function uses the accelerated color conversion routines in the
+ * underlying codec but does not execute any of the other steps in the JPEG
+ * decompression process.
+ *
+ * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ *
+ * @param srcPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if decoding a grayscale image) that contain a YUV image
+ * to be decoded.  These planes can be contiguous or non-contiguous in memory.
+ * The size of each plane should match the value returned by #tjPlaneSizeYUV()
+ * for the given image width, height, strides, and level of chrominance
+ * subsampling.  Refer to @ref YUVnotes "YUV Image Format Notes" for more
+ * details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * line in the corresponding plane of the YUV source image.  Setting the stride
+ * for any plane to 0 is the same as setting it to the plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If <tt>strides</tt> is NULL, then
+ * the strides for all planes will be set to their respective plane widths.
+ * You can adjust the strides in order to specify an arbitrary amount of line
+ * padding in each plane or to decode a subregion of a larger YUV planar image.
+ *
+ * @param subsamp the level of chrominance subsampling used in the YUV source
+ * image (see @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @param dstBuf pointer to an image buffer that will receive the decoded
+ * image.  This buffer should normally be <tt>pitch * height</tt> bytes in
+ * size, but the <tt>dstBuf</tt> pointer can also be used to decode into a
+ * specific region of a larger buffer.
+ *
+ * @param width width (in pixels) of the source and destination images
+ *
+ * @param pitch bytes per line in the destination image.  Normally, this should
+ * be <tt>width * #tjPixelSize[pixelFormat]</tt> if the destination image is
+ * unpadded, or <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line
+ * of the destination image should be padded to the nearest 32-bit boundary, as
+ * is the case for Windows bitmaps.  You can also be clever and use the pitch
+ * parameter to skip lines, etc.  Setting this parameter to 0 is the equivalent
+ * of setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param height height (in pixels) of the source and destination images
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags"
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjDecodeYUVPlanes(tjhandle handle,
+  const unsigned char **srcPlanes, const int *strides, int subsamp,
+  unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat,
   int flags);
 
 
@@ -782,9 +1363,9 @@
 
 /**
  * Losslessly transform a JPEG image into another JPEG image.  Lossless
- * transforms work by moving the raw coefficients from one JPEG image structure
- * to another without altering the values of the coefficients.  While this is
- * typically faster than decompressing the image, transforming it, and
+ * transforms work by moving the raw DCT coefficients from one JPEG image
+ * structure to another without altering the values of the coefficients.  While
+ * this is typically faster than decompressing the image, transforming it, and
  * re-compressing it, lossless transforms are not free.  Each lossless
  * transform requires reading and performing Huffman decoding on all of the
  * coefficients in the source image, regardless of the size of the destination
@@ -794,51 +1375,58 @@
  * source coefficients multiple times.
  *
  * @param handle a handle to a TurboJPEG transformer instance
- * @param jpegBuf pointer to a buffer containing the JPEG image to transform
- * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param jpegBuf pointer to a buffer containing the JPEG source image to
+ * transform
+ *
+ * @param jpegSize size of the JPEG source image (in bytes)
+ *
  * @param n the number of transformed JPEG images to generate
+ *
  * @param dstBufs pointer to an array of n image buffers.  <tt>dstBufs[i]</tt>
- *        will receive a JPEG image that has been transformed using the
- *        parameters in <tt>transforms[i]</tt>.  TurboJPEG has the ability to
- *        reallocate the JPEG buffer to accommodate the size of the JPEG image.
- *        Thus, you can choose to:
- *        -# pre-allocate the JPEG buffer with an arbitrary size using
- *        #tjAlloc() and let TurboJPEG grow the buffer as needed,
- *        -# set <tt>dstBufs[i]</tt> to NULL to tell TurboJPEG to allocate the
- *        buffer for you, or
- *        -# pre-allocate the buffer to a "worst case" size determined by
- *        calling #tjBufSize() with the transformed or cropped width and
- *        height.  This should ensure that the buffer never has to be
- *        re-allocated (setting #TJFLAG_NOREALLOC guarantees this.)
- *        .
- *        If you choose option 1, <tt>dstSizes[i]</tt> should be set to
- *        the size of your pre-allocated buffer.  In any case, unless you have
- *        set #TJFLAG_NOREALLOC, you should always check <tt>dstBufs[i]</tt>
- *        upon return from this function, as it may have changed.
+ * will receive a JPEG image that has been transformed using the parameters in
+ * <tt>transforms[i]</tt>.  TurboJPEG has the ability to reallocate the JPEG
+ * buffer to accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tjAlloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set <tt>dstBufs[i]</tt> to NULL to tell TurboJPEG to allocate the buffer
+ * for you, or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tjBufSize() with the transformed or cropped width and height.  This should
+ * ensure that the buffer never has to be re-allocated (setting
+ * #TJFLAG_NOREALLOC guarantees this.)
+ * .
+ * If you choose option 1, <tt>dstSizes[i]</tt> should be set to the size of
+ * your pre-allocated buffer.  In any case, unless you have set
+ * #TJFLAG_NOREALLOC, you should always check <tt>dstBufs[i]</tt> upon return
+ * from this function, as it may have changed.
+ *
  * @param dstSizes pointer to an array of n unsigned long variables that will
- *        receive the actual sizes (in bytes) of each transformed JPEG image.
- *        If <tt>dstBufs[i]</tt> points to a pre-allocated buffer, then
- *        <tt>dstSizes[i]</tt> should be set to the size of the buffer.  Upon
- *        return, <tt>dstSizes[i]</tt> will contain the size of the JPEG image
- *        (in bytes.)
+ * receive the actual sizes (in bytes) of each transformed JPEG image.  If
+ * <tt>dstBufs[i]</tt> points to a pre-allocated buffer, then
+ * <tt>dstSizes[i]</tt> should be set to the size of the buffer.  Upon return,
+ * <tt>dstSizes[i]</tt> will contain the size of the JPEG image (in bytes.)
+ *
  * @param transforms pointer to an array of n #tjtransform structures, each of
- *        which specifies the transform parameters and/or cropping region for
- *        the corresponding transformed output image.
+ * which specifies the transform parameters and/or cropping region for the
+ * corresponding transformed output image.
+ *
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
- *        "flags".
+ * "flags"
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
-DLLEXPORT int DLLCALL tjTransform(tjhandle handle, unsigned char *jpegBuf,
-  unsigned long jpegSize, int n, unsigned char **dstBufs,
-  unsigned long *dstSizes, tjtransform *transforms, int flags);
+DLLEXPORT int DLLCALL tjTransform(tjhandle handle,
+  const unsigned char *jpegBuf, unsigned long jpegSize, int n,
+  unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms,
+  int flags);
 
 
 /**
  * Destroy a TurboJPEG compressor, decompressor, or transformer instance.
  *
  * @param handle a handle to a TurboJPEG compressor, decompressor or
- *        transformer instance
+ * transformer instance
  *
  * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
  */
@@ -852,9 +1440,9 @@
  * (re)allocation (by setting #TJFLAG_NOREALLOC.)
  *
  * @param bytes the number of bytes to allocate
- * 
+ *
  * @return a pointer to a newly-allocated buffer with the specified number of
- *         bytes
+ * bytes.
  *
  * @sa tjFree()
  */
@@ -882,6 +1470,13 @@
 DLLEXPORT char* DLLCALL tjGetErrorStr(void);
 
 
+/* Deprecated functions and macros */
+#define TJFLAG_FORCEMMX        8
+#define TJFLAG_FORCESSE       16
+#define TJFLAG_FORCESSE2      32
+#define TJFLAG_FORCESSE3     128
+
+
 /* Backward compatibility functions and macros (nothing to see here) */
 #define NUMSUBOPT TJ_NUMSAMP
 #define TJ_444 TJSAMP_444
@@ -905,6 +1500,9 @@
 DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
   int jpegSubsamp);
 
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+  int subsamp);
+
 DLLEXPORT int DLLCALL tjCompress(tjhandle handle, unsigned char *srcBuf,
   int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
   unsigned long *compressedSize, int jpegSubsamp, int jpegQual, int flags);
@@ -913,13 +1511,25 @@
   unsigned char *srcBuf, int width, int pitch, int height, int pixelSize,
   unsigned char *dstBuf, int subsamp, int flags);
 
+DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
+  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
+  unsigned char *dstBuf, int subsamp, int flags);
+
 DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height);
 
+DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
+  int *jpegSubsamp);
+
 DLLEXPORT int DLLCALL tjDecompress(tjhandle handle,
   unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
   int width, int pitch, int height, int pixelSize, int flags);
 
+DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  int flags);
+
 
 /**
  * @}
diff --git a/turbojpegl.c b/turbojpegl.c
deleted file mode 100644
index 2150a2d..0000000
--- a/turbojpegl.c
+++ /dev/null
@@ -1,363 +0,0 @@
-/* Copyright (C)2004 Landmark Graphics Corporation
- * Copyright (C)2005 Sun Microsystems, Inc.
- * Copyright (C)2009 D. R. Commander
- *
- * This library is free software and may be redistributed and/or modified under
- * the terms of the wxWindows Library License, Version 3.1 or (at your option)
- * any later version.  The full license is in the LICENSE.txt file included
- * with this distribution.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * wxWindows Library License for more details.
- */
-
-// This implements a JPEG compressor/decompressor using the libjpeg API
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <jpeglib.h>
-#include <jerror.h>
-#include <setjmp.h>
-#include "./turbojpeg.h"
-
-
-// Error handling
-
-static char lasterror[JMSG_LENGTH_MAX]="No error";
-
-typedef struct _error_mgr
-{
-	struct jpeg_error_mgr pub;
-	jmp_buf jb;
-} error_mgr;
-
-static void my_error_exit(j_common_ptr cinfo)
-{
-	error_mgr *myerr = (error_mgr *)cinfo->err;
-	(*cinfo->err->output_message)(cinfo);
-	longjmp(myerr->jb, 1);
-}
-
-static void my_output_message(j_common_ptr cinfo)
-{
-	(*cinfo->err->format_message)(cinfo, lasterror);
-}
-
-
-// Global structures, macros, etc.
-
-typedef struct _jpgstruct
-{
-	struct jpeg_compress_struct cinfo;
-	struct jpeg_decompress_struct dinfo;
-	struct jpeg_destination_mgr jdms;
-	struct jpeg_source_mgr jsms;
-	error_mgr jerr;
-	int initc, initd;
-} jpgstruct;
-
-static const int hsampfactor[NUMSUBOPT]={1, 2, 2, 1};
-static const int vsampfactor[NUMSUBOPT]={1, 1, 2, 1};
-
-#define _throw(c) {sprintf(lasterror, "%s", c);  return -1;}
-#define _catch(f) {if((f)==-1) return -1;}
-#define checkhandle(h) jpgstruct *j=(jpgstruct *)h; \
-	if(!j) _throw("Invalid handle");
-
-
-// CO
-
-static boolean empty_output_buffer(struct jpeg_compress_struct *cinfo)
-{
-	ERREXIT(cinfo, JERR_BUFFER_SIZE);
-	return TRUE;
-}
-
-static void destination_noop(struct jpeg_compress_struct *cinfo)
-{
-}
-
-DLLEXPORT tjhandle DLLCALL tjInitCompress(void)
-{
-	jpgstruct *j=NULL;
-	if((j=(jpgstruct *)malloc(sizeof(jpgstruct)))==NULL)
-		{sprintf(lasterror, "Memory allocation failure");  return NULL;}
-	memset(j, 0, sizeof(jpgstruct));
-	j->cinfo.err=jpeg_std_error(&j->jerr.pub);
-	j->jerr.pub.error_exit=my_error_exit;
-	j->jerr.pub.output_message=my_output_message;
-
-	if(setjmp(j->jerr.jb))
-	{ // this will execute if LIBJPEG has an error
-		if(j) free(j);  return NULL;
-  }
-
-	jpeg_create_compress(&j->cinfo);
-	j->cinfo.dest=&j->jdms;
-	j->jdms.init_destination=destination_noop;
-	j->jdms.empty_output_buffer=empty_output_buffer;
-	j->jdms.term_destination=destination_noop;
-
-	j->initc=1;
-	return (tjhandle)j;
-}
-
-DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height)
-{
-	// This allows enough room in case the image doesn't compress
-	return ((width+15)&(~15)) * ((height+15)&(~15)) * 6 + 2048;
-}
-
-DLLEXPORT int DLLCALL tjCompress(tjhandle h,
-	unsigned char *srcbuf, int width, int pitch, int height, int ps,
-	unsigned char *dstbuf, unsigned long *size,
-	int jpegsub, int qual, int flags)
-{
-	int i;  JSAMPROW *row_pointer=NULL;
-
-	checkhandle(h);
-
-	if(srcbuf==NULL || width<=0 || pitch<0 || height<=0
-		|| dstbuf==NULL || size==NULL
-		|| jpegsub<0 || jpegsub>=NUMSUBOPT || qual<0 || qual>100)
-		_throw("Invalid argument in tjCompress()");
-	if(ps!=3 && ps!=4) _throw("This compressor can only take 24-bit or 32-bit RGB input");
-	if(!j->initc) _throw("Instance has not been initialized for compression");
-
-	if(pitch==0) pitch=width*ps;
-
-	j->cinfo.image_width = width;
-	j->cinfo.image_height = height;
-	j->cinfo.input_components = ps;
-
-	#if JCS_EXTENSIONS==1
-	j->cinfo.in_color_space = JCS_EXT_RGB;
-	if(ps==3 && (flags&TJ_BGR))
-		j->cinfo.in_color_space = JCS_EXT_BGR;
-	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_RGBX;
-	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_BGRX;
-	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_XBGR;
-	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->cinfo.in_color_space = JCS_EXT_XRGB;
-	#else
-	#error "TurboJPEG requires JPEG colorspace extensions"
-	#endif
-
-	if(flags&TJ_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJ_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJ_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
-
-	if(setjmp(j->jerr.jb))
-	{  // this will execute if LIBJPEG has an error
-		if(row_pointer) free(row_pointer);
-		return -1;
-  }
-
-	jpeg_set_defaults(&j->cinfo);
-
-	jpeg_set_quality(&j->cinfo, qual, TRUE);
-	if(jpegsub==TJ_GRAYSCALE)
-		jpeg_set_colorspace(&j->cinfo, JCS_GRAYSCALE);
-	else
-		jpeg_set_colorspace(&j->cinfo, JCS_YCbCr);
-	j->cinfo.dct_method = JDCT_FASTEST;
-
-	j->cinfo.comp_info[0].h_samp_factor=hsampfactor[jpegsub];
-	j->cinfo.comp_info[1].h_samp_factor=1;
-	j->cinfo.comp_info[2].h_samp_factor=1;
-	j->cinfo.comp_info[0].v_samp_factor=vsampfactor[jpegsub];
-	j->cinfo.comp_info[1].v_samp_factor=1;
-	j->cinfo.comp_info[2].v_samp_factor=1;
-
-	j->jdms.next_output_byte = dstbuf;
-	j->jdms.free_in_buffer = TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height);
-
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-		_throw("Memory allocation failed in tjInitCompress()");
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJ_BOTTOMUP) row_pointer[i]= &srcbuf[(height-i-1)*pitch];
-		else row_pointer[i]= &srcbuf[i*pitch];
-	}
-	jpeg_start_compress(&j->cinfo, TRUE);
-	while(j->cinfo.next_scanline<j->cinfo.image_height)
-	{
-		jpeg_write_scanlines(&j->cinfo, &row_pointer[j->cinfo.next_scanline],
-			j->cinfo.image_height-j->cinfo.next_scanline);
-	}
-	jpeg_finish_compress(&j->cinfo);
-	*size=TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height)
-		-(unsigned long)(j->jdms.free_in_buffer);
-
-	if(row_pointer) free(row_pointer);
-	return 0;
-}
-
-
-// DEC
-
-static boolean fill_input_buffer (struct jpeg_decompress_struct *dinfo)
-{
-	ERREXIT(dinfo, JERR_BUFFER_SIZE);
-	return TRUE;
-}
-
-static void skip_input_data (struct jpeg_decompress_struct *dinfo, long num_bytes)
-{
-	dinfo->src->next_input_byte += (size_t) num_bytes;
-	dinfo->src->bytes_in_buffer -= (size_t) num_bytes;
-}
-
-static void source_noop (struct jpeg_decompress_struct *dinfo)
-{
-}
-
-DLLEXPORT tjhandle DLLCALL tjInitDecompress(void)
-{
-	jpgstruct *j;
-	if((j=(jpgstruct *)malloc(sizeof(jpgstruct)))==NULL)
-		{sprintf(lasterror, "Memory allocation failure");  return NULL;}
-	memset(j, 0, sizeof(jpgstruct));
-	j->dinfo.err=jpeg_std_error(&j->jerr.pub);
-	j->jerr.pub.error_exit=my_error_exit;
-	j->jerr.pub.output_message=my_output_message;
-
-	if(setjmp(j->jerr.jb))
-	{ // this will execute if LIBJPEG has an error
-		free(j);  return NULL;
-  }
-
-	jpeg_create_decompress(&j->dinfo);
-	j->dinfo.src=&j->jsms;
-	j->jsms.init_source=source_noop;
-	j->jsms.fill_input_buffer = fill_input_buffer;
-	j->jsms.skip_input_data = skip_input_data;
-	j->jsms.resync_to_restart = jpeg_resync_to_restart;
-	j->jsms.term_source = source_noop;
-
-	j->initd=1;
-	return (tjhandle)j;
-}
-
-
-DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle h,
-	unsigned char *srcbuf, unsigned long size,
-	int *width, int *height)
-{
-	checkhandle(h);
-
-	if(srcbuf==NULL || size<=0 || width==NULL || height==NULL)
-		_throw("Invalid argument in tjDecompressHeader()");
-	if(!j->initd) _throw("Instance has not been initialized for decompression");
-
-	if(setjmp(j->jerr.jb))
-	{  // this will execute if LIBJPEG has an error
-		return -1;
-	}
-
-	j->jsms.bytes_in_buffer = size;
-	j->jsms.next_input_byte = srcbuf;
-
-	jpeg_read_header(&j->dinfo, TRUE);
-
-	*width=j->dinfo.image_width;  *height=j->dinfo.image_height;
-
-	jpeg_abort_decompress(&j->dinfo);
-
-	if(*width<1 || *height<1) _throw("Invalid data returned in header");
-	return 0;
-}
-
-
-DLLEXPORT int DLLCALL tjDecompress(tjhandle h,
-	unsigned char *srcbuf, unsigned long size,
-	unsigned char *dstbuf, int width, int pitch, int height, int ps,
-	int flags)
-{
-	int i;  JSAMPROW *row_pointer=NULL;
-
-	checkhandle(h);
-
-	if(srcbuf==NULL || size<=0
-		|| dstbuf==NULL || width<=0 || pitch<0 || height<=0)
-		_throw("Invalid argument in tjDecompress()");
-	if(ps!=3 && ps!=4) _throw("This compressor can only take 24-bit or 32-bit RGB input");
-	if(!j->initd) _throw("Instance has not been initialized for decompression");
-
-	if(pitch==0) pitch=width*ps;
-
-	if(flags&TJ_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJ_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJ_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
-
-	if(setjmp(j->jerr.jb))
-	{  // this will execute if LIBJPEG has an error
-		if(row_pointer) free(row_pointer);
-		return -1;
-  }
-
-	j->jsms.bytes_in_buffer = size;
-	j->jsms.next_input_byte = srcbuf;
-
-	jpeg_read_header(&j->dinfo, TRUE);
-
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-		_throw("Memory allocation failed in tjInitDecompress()");
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJ_BOTTOMUP) row_pointer[i]= &dstbuf[(height-i-1)*pitch];
-		else row_pointer[i]= &dstbuf[i*pitch];
-	}
-
-	#if JCS_EXTENSIONS==1
-	j->dinfo.out_color_space = JCS_EXT_RGB;
-	if(ps==3 && (flags&TJ_BGR))
-		j->dinfo.out_color_space = JCS_EXT_BGR;
-	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_RGBX;
-	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_BGRX;
-	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_XBGR;
-	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
-		j->dinfo.out_color_space = JCS_EXT_XRGB;
-	#else
-	#error "TurboJPEG requires JPEG colorspace extensions"
-	#endif
-	if(flags&TJ_FASTUPSAMPLE) j->dinfo.do_fancy_upsampling=FALSE;
-
-	jpeg_start_decompress(&j->dinfo);
-	while(j->dinfo.output_scanline<j->dinfo.output_height)
-	{
-		jpeg_read_scanlines(&j->dinfo, &row_pointer[j->dinfo.output_scanline],
-			j->dinfo.output_height-j->dinfo.output_scanline);
-	}
-	jpeg_finish_decompress(&j->dinfo);
-
-	if(row_pointer) free(row_pointer);
-	return 0;
-}
-
-
-// General
-
-DLLEXPORT char* DLLCALL tjGetErrorStr(void)
-{
-	return lasterror;
-}
-
-DLLEXPORT int DLLCALL tjDestroy(tjhandle h)
-{
-	checkhandle(h);
-	if(setjmp(j->jerr.jb)) return -1;
-	if(j->initc) jpeg_destroy_compress(&j->cinfo);
-	if(j->initd) jpeg_destroy_decompress(&j->dinfo);
-	free(j);
-	return 0;
-}
diff --git a/usage.txt b/usage.txt
new file mode 100644
index 0000000..5abda4e
--- /dev/null
+++ b/usage.txt
@@ -0,0 +1,660 @@
+NOTE:  This file was modified by The libjpeg-turbo Project to include only
+information relevant to libjpeg-turbo and to wordsmith certain sections.
+
+USAGE instructions for the Independent JPEG Group's JPEG software
+=================================================================
+
+This file describes usage of the JPEG conversion programs cjpeg and djpeg,
+as well as the utility programs jpegtran, rdjpgcom and wrjpgcom.  (See
+the other documentation files if you wish to use the JPEG library within
+your own programs.)
+
+If you are on a Unix machine you may prefer to read the Unix-style manual
+pages in files cjpeg.1, djpeg.1, jpegtran.1, rdjpgcom.1, wrjpgcom.1.
+
+
+INTRODUCTION
+
+These programs implement JPEG image encoding, decoding, and transcoding.
+JPEG (pronounced "jay-peg") is a standardized compression method for
+full-color and grayscale images.
+
+
+GENERAL USAGE
+
+We provide two programs, cjpeg to compress an image file into JPEG format,
+and djpeg to decompress a JPEG file back into a conventional image format.
+
+On Unix-like systems, you say:
+        cjpeg [switches] [imagefile] >jpegfile
+or
+        djpeg [switches] [jpegfile]  >imagefile
+The programs read the specified input file, or standard input if none is
+named.  They always write to standard output (with trace/error messages to
+standard error).  These conventions are handy for piping images between
+programs.
+
+On most non-Unix systems, you say:
+        cjpeg [switches] imagefile jpegfile
+or
+        djpeg [switches] jpegfile  imagefile
+i.e., both the input and output files are named on the command line.  This
+style is a little more foolproof, and it loses no functionality if you don't
+have pipes.  (You can get this style on Unix too, if you prefer, by defining
+TWO_FILE_COMMANDLINE when you compile the programs; see install.txt.)
+
+You can also say:
+        cjpeg [switches] -outfile jpegfile  imagefile
+or
+        djpeg [switches] -outfile imagefile  jpegfile
+This syntax works on all systems, so it is useful for scripts.
+
+The currently supported image file formats are: PPM (PBMPLUS color format),
+PGM (PBMPLUS grayscale format), BMP, Targa, and RLE (Utah Raster Toolkit
+format).  (RLE is supported only if the URT library is available, which it
+isn't on most non-Unix systems.)  cjpeg recognizes the input image format
+automatically, with the exception of some Targa files.  You have to tell djpeg
+which format to generate.
+
+JPEG files are in the de facto standard JFIF file format.  There are other,
+less widely used JPEG-based file formats, but we don't support them.
+
+All switch names may be abbreviated; for example, -grayscale may be written
+-gray or -gr.  Most of the "basic" switches can be abbreviated to as little as
+one letter.  Upper and lower case are equivalent (-BMP is the same as -bmp).
+British spellings are also accepted (e.g., -greyscale), though for brevity
+these are not mentioned below.
+
+
+CJPEG DETAILS
+
+The basic command line switches for cjpeg are:
+
+        -quality N[,...]  Scale quantization tables to adjust image quality.
+                          Quality is 0 (worst) to 100 (best); default is 75.
+                          (See below for more info.)
+
+        -grayscale      Create monochrome JPEG file from color input.
+                        Be sure to use this switch when compressing a grayscale
+                        BMP file, because cjpeg isn't bright enough to notice
+                        whether a BMP file uses only shades of gray.  By
+                        saying -grayscale, you'll get a smaller JPEG file that
+                        takes less time to process.
+
+        -rgb            Create RGB JPEG file.
+                        Using this switch suppresses the conversion from RGB
+                        colorspace input to the default YCbCr JPEG colorspace.
+
+        -optimize       Perform optimization of entropy encoding parameters.
+                        Without this, default encoding parameters are used.
+                        -optimize usually makes the JPEG file a little smaller,
+                        but cjpeg runs somewhat slower and needs much more
+                        memory.  Image quality and speed of decompression are
+                        unaffected by -optimize.
+
+        -progressive    Create progressive JPEG file (see below).
+
+        -targa          Input file is Targa format.  Targa files that contain
+                        an "identification" field will not be automatically
+                        recognized by cjpeg; for such files you must specify
+                        -targa to make cjpeg treat the input as Targa format.
+                        For most Targa files, you won't need this switch.
+
+The -quality switch lets you trade off compressed file size against quality of
+the reconstructed image: the higher the quality setting, the larger the JPEG
+file, and the closer the output image will be to the original input.  Normally
+you want to use the lowest quality setting (smallest file) that decompresses
+into something visually indistinguishable from the original image.  For this
+purpose the quality setting should generally be between 50 and 95 (the default
+is 75) for photographic images.  If you see defects at -quality 75, then go up
+5 or 10 counts at a time until you are happy with the output image.  (The
+optimal setting will vary from one image to another.)
+
+-quality 100 will generate a quantization table of all 1's, minimizing loss
+in the quantization step (but there is still information loss in subsampling,
+as well as roundoff error.)  For most images, specifying a quality value above
+about 95 will increase the size of the compressed file dramatically, and while
+the quality gain from these higher quality values is measurable (using metrics
+such as PSNR or SSIM), it is rarely perceivable by human vision.
+
+In the other direction, quality values below 50 will produce very small files
+of low image quality.  Settings around 5 to 10 might be useful in preparing an
+index of a large image library, for example.  Try -quality 2 (or so) for some
+amusing Cubist effects.  (Note: quality values below about 25 generate 2-byte
+quantization tables, which are considered optional in the JPEG standard.
+cjpeg emits a warning message when you give such a quality value, because some
+other JPEG programs may be unable to decode the resulting file.  Use -baseline
+if you need to ensure compatibility at low quality values.)
+
+The -quality option has been extended in this version of cjpeg to support
+separate quality settings for luminance and chrominance (or, in general,
+separate settings for every quantization table slot.)  The principle is the
+same as chrominance subsampling:  since the human eye is more sensitive to
+spatial changes in brightness than spatial changes in color, the chrominance
+components can be quantized more than the luminance components without
+incurring any visible image quality loss.  However, unlike subsampling, this
+feature reduces data in the frequency domain instead of the spatial domain,
+which allows for more fine-grained control.  This option is useful in
+quality-sensitive applications, for which the artifacts generated by
+subsampling may be unacceptable.
+
+The -quality option accepts a comma-separated list of parameters, which
+respectively refer to the quality levels that should be assigned to the
+quantization table slots.  If there are more q-table slots than parameters,
+then the last parameter is replicated.  Thus, if only one quality parameter is
+given, this is used for both luminance and chrominance (slots 0 and 1,
+respectively), preserving the legacy behavior of cjpeg v6b and prior.  More (or
+customized) quantization tables can be set with the -qtables option and
+assigned to components with the -qslots option (see the "wizard" switches
+below.)
+
+JPEG files generated with separate luminance and chrominance quality are
+fully compliant with standard JPEG decoders.
+
+CAUTION: For this setting to be useful, be sure to pass an argument of
+-sample 1x1 to cjpeg to disable chrominance subsampling.  Otherwise, the
+default subsampling level (2x2, AKA "4:2:0") will be used.
+
+The -progressive switch creates a "progressive JPEG" file.  In this type of
+JPEG file, the data is stored in multiple scans of increasing quality.  If the
+file is being transmitted over a slow communications link, the decoder can use
+the first scan to display a low-quality image very quickly, and can then
+improve the display with each subsequent scan.  The final image is exactly
+equivalent to a standard JPEG file of the same quality setting, and the total
+file size is about the same --- often a little smaller.
+
+Switches for advanced users:
+
+        -arithmetic     Use arithmetic coding.  CAUTION: arithmetic coded JPEG
+                        is not yet widely implemented, so many decoders will
+                        be unable to view an arithmetic coded JPEG file at
+                        all.
+
+        -dct int        Use integer DCT method (default).
+        -dct fast       Use fast integer DCT (less accurate).
+                        In libjpeg-turbo, the fast method is generally about
+                        5-15% faster than the int method when using the
+                        x86/x86-64 SIMD extensions (results may vary with other
+                        SIMD implementations, or when using libjpeg-turbo
+                        without SIMD extensions.)  For quality levels of 90 and
+                        below, there should be little or no perceptible
+                        difference between the two algorithms.  For quality
+                        levels above 90, however, the difference between
+                        the fast and the int methods becomes more pronounced.
+                        With quality=97, for instance, the fast method incurs
+                        generally about a 1-3 dB loss (in PSNR) relative to
+                        the int method, but this can be larger for some images.
+                        Do not use the fast method with quality levels above
+                        97.  The algorithm often degenerates at quality=98 and
+                        above and can actually produce a more lossy image than
+                        if lower quality levels had been used.  Also, in
+                        libjpeg-turbo, the fast method is not fully accelerated
+                        for quality levels above 97, so it will be slower than
+                        the int method.
+        -dct float      Use floating-point DCT method.
+                        The float method is mainly a legacy feature.  It does
+                        not produce significantly more accurate results than
+                        the int method, and it is much slower.  The float
+                        method may also give different results on different
+                        machines due to varying roundoff behavior, whereas the
+                        integer methods should give the same results on all
+                        machines.
+
+        -restart N      Emit a JPEG restart marker every N MCU rows, or every
+                        N MCU blocks if "B" is attached to the number.
+                        -restart 0 (the default) means no restart markers.
+
+        -smooth N       Smooth the input image to eliminate dithering noise.
+                        N, ranging from 1 to 100, indicates the strength of
+                        smoothing.  0 (the default) means no smoothing.
+
+        -maxmemory N    Set limit for amount of memory to use in processing
+                        large images.  Value is in thousands of bytes, or
+                        millions of bytes if "M" is attached to the number.
+                        For example, -max 4m selects 4000000 bytes.  If more
+                        space is needed, temporary files will be used.
+
+        -verbose        Enable debug printout.  More -v's give more printout.
+        or  -debug      Also, version information is printed at startup.
+
+The -restart option inserts extra markers that allow a JPEG decoder to
+resynchronize after a transmission error.  Without restart markers, any damage
+to a compressed file will usually ruin the image from the point of the error
+to the end of the image; with restart markers, the damage is usually confined
+to the portion of the image up to the next restart marker.  Of course, the
+restart markers occupy extra space.  We recommend -restart 1 for images that
+will be transmitted across unreliable networks such as Usenet.
+
+The -smooth option filters the input to eliminate fine-scale noise.  This is
+often useful when converting dithered images to JPEG: a moderate smoothing
+factor of 10 to 50 gets rid of dithering patterns in the input file, resulting
+in a smaller JPEG file and a better-looking image.  Too large a smoothing
+factor will visibly blur the image, however.
+
+Switches for wizards:
+
+        -baseline       Force baseline-compatible quantization tables to be
+                        generated.  This clamps quantization values to 8 bits
+                        even at low quality settings.  (This switch is poorly
+                        named, since it does not ensure that the output is
+                        actually baseline JPEG.  For example, you can use
+                        -baseline and -progressive together.)
+
+        -qtables file   Use the quantization tables given in the specified
+                        text file.
+
+        -qslots N[,...] Select which quantization table to use for each color
+                        component.
+
+        -sample HxV[,...]  Set JPEG sampling factors for each color component.
+
+        -scans file     Use the scan script given in the specified text file.
+
+The "wizard" switches are intended for experimentation with JPEG.  If you
+don't know what you are doing, DON'T USE THEM.  These switches are documented
+further in the file wizard.txt.
+
+
+DJPEG DETAILS
+
+The basic command line switches for djpeg are:
+
+        -colors N       Reduce image to at most N colors.  This reduces the
+        or -quantize N  number of colors used in the output image, so that it
+                        can be displayed on a colormapped display or stored in
+                        a colormapped file format.  For example, if you have
+                        an 8-bit display, you'd need to reduce to 256 or fewer
+                        colors.  (-colors is the recommended name, -quantize
+                        is provided only for backwards compatibility.)
+
+        -fast           Select recommended processing options for fast, low
+                        quality output.  (The default options are chosen for
+                        highest quality output.)  Currently, this is equivalent
+                        to "-dct fast -nosmooth -onepass -dither ordered".
+
+        -grayscale      Force grayscale output even if JPEG file is color.
+                        Useful for viewing on monochrome displays; also,
+                        djpeg runs noticeably faster in this mode.
+
+        -rgb            Force RGB output even if JPEG file is grayscale.
+
+        -scale M/N      Scale the output image by a factor M/N.  Currently
+                        the scale factor must be M/8, where M is an integer
+                        between 1 and 16 inclusive, or any reduced fraction
+                        thereof (such as 1/2, 3/4, etc.)  Scaling is handy if
+                        the image is larger than your screen; also, djpeg runs
+                        much faster when scaling down the output.
+
+        -bmp            Select BMP output format (Windows flavor).  8-bit
+                        colormapped format is emitted if -colors or -grayscale
+                        is specified, or if the JPEG file is grayscale;
+                        otherwise, 24-bit full-color format is emitted.
+
+        -gif            Select GIF output format.  Since GIF does not support
+                        more than 256 colors, -colors 256 is assumed (unless
+                        you specify a smaller number of colors).  If you
+                        specify -fast, the default number of colors is 216.
+
+        -os2            Select BMP output format (OS/2 1.x flavor).  8-bit
+                        colormapped format is emitted if -colors or -grayscale
+                        is specified, or if the JPEG file is grayscale;
+                        otherwise, 24-bit full-color format is emitted.
+
+        -pnm            Select PBMPLUS (PPM/PGM) output format (this is the
+                        default format).  PGM is emitted if the JPEG file is
+                        grayscale or if -grayscale is specified; otherwise
+                        PPM is emitted.
+
+        -rle            Select RLE output format.  (Requires URT library.)
+
+        -targa          Select Targa output format.  Grayscale format is
+                        emitted if the JPEG file is grayscale or if
+                        -grayscale is specified; otherwise, colormapped format
+                        is emitted if -colors is specified; otherwise, 24-bit
+                        full-color format is emitted.
+
+Switches for advanced users:
+
+        -dct int        Use integer DCT method (default).
+        -dct fast       Use fast integer DCT (less accurate).
+                        In libjpeg-turbo, the fast method is generally about
+                        5-15% faster than the int method when using the
+                        x86/x86-64 SIMD extensions (results may vary with other
+                        SIMD implementations, or when using libjpeg-turbo
+                        without SIMD extensions.)  If the JPEG image was
+                        compressed using a quality level of 85 or below, then
+                        there should be little or no perceptible difference
+                        between the two algorithms.  When decompressing images
+                        that were compressed using quality levels above 85,
+                        however, the difference between the fast and int
+                        methods becomes more pronounced.  With images
+                        compressed using quality=97, for instance, the fast
+                        method incurs generally about a 4-6 dB loss (in PSNR)
+                        relative to the int method, but this can be larger for
+                        some images.  If you can avoid it, do not use the fast
+                        method when decompressing images that were compressed
+                        using quality levels above 97.  The algorithm often
+                        degenerates for such images and can actually produce
+                        a more lossy output image than if the JPEG image had
+                        been compressed using lower quality levels.
+        -dct float      Use floating-point DCT method.
+                        The float method is mainly a legacy feature.  It does
+                        not produce significantly more accurate results than
+                        the int method, and it is much slower.  The float
+                        method may also give different results on different
+                        machines due to varying roundoff behavior, whereas the
+                        integer methods should give the same results on all
+                        machines.
+
+        -dither fs      Use Floyd-Steinberg dithering in color quantization.
+        -dither ordered Use ordered dithering in color quantization.
+        -dither none    Do not use dithering in color quantization.
+                        By default, Floyd-Steinberg dithering is applied when
+                        quantizing colors; this is slow but usually produces
+                        the best results.  Ordered dither is a compromise
+                        between speed and quality; no dithering is fast but
+                        usually looks awful.  Note that these switches have
+                        no effect unless color quantization is being done.
+                        Ordered dither is only available in -onepass mode.
+
+        -map FILE       Quantize to the colors used in the specified image
+                        file.  This is useful for producing multiple files
+                        with identical color maps, or for forcing a predefined
+                        set of colors to be used.  The FILE must be a GIF
+                        or PPM file.  This option overrides -colors and
+                        -onepass.
+
+        -nosmooth       Use a faster, lower-quality upsampling routine.
+
+        -onepass        Use one-pass instead of two-pass color quantization.
+                        The one-pass method is faster and needs less memory,
+                        but it produces a lower-quality image.  -onepass is
+                        ignored unless you also say -colors N.  Also,
+                        the one-pass method is always used for grayscale
+                        output (the two-pass method is no improvement then).
+
+        -maxmemory N    Set limit for amount of memory to use in processing
+                        large images.  Value is in thousands of bytes, or
+                        millions of bytes if "M" is attached to the number.
+                        For example, -max 4m selects 4000000 bytes.  If more
+                        space is needed, temporary files will be used.
+
+        -verbose        Enable debug printout.  More -v's give more printout.
+        or  -debug      Also, version information is printed at startup.
+
+
+HINTS FOR CJPEG
+
+Color GIF files are not the ideal input for JPEG; JPEG is really intended for
+compressing full-color (24-bit) images.  In particular, don't try to convert
+cartoons, line drawings, and other images that have only a few distinct
+colors.  GIF works great on these, JPEG does not.  If you want to convert a
+GIF to JPEG, you should experiment with cjpeg's -quality and -smooth options
+to get a satisfactory conversion.  -smooth 10 or so is often helpful.
+
+Avoid running an image through a series of JPEG compression/decompression
+cycles.  Image quality loss will accumulate; after ten or so cycles the image
+may be noticeably worse than it was after one cycle.  It's best to use a
+lossless format while manipulating an image, then convert to JPEG format when
+you are ready to file the image away.
+
+The -optimize option to cjpeg is worth using when you are making a "final"
+version for posting or archiving.  It's also a win when you are using low
+quality settings to make very small JPEG files; the percentage improvement
+is often a lot more than it is on larger files.  (At present, -optimize
+mode is always selected when generating progressive JPEG files.)
+
+Support for GIF input files was removed in cjpeg v6b due to concerns over
+the Unisys LZW patent.  Although this patent expired in 2006, cjpeg still
+lacks GIF support, for these historical reasons.  (Conversion of GIF files to
+JPEG is usually a bad idea anyway.)
+
+
+HINTS FOR DJPEG
+
+To get a quick preview of an image, use the -grayscale and/or -scale switches.
+"-grayscale -scale 1/8" is the fastest case.
+
+Several options are available that trade off image quality to gain speed.
+"-fast" turns on the recommended settings.
+
+"-dct fast" and/or "-nosmooth" gain speed at a small sacrifice in quality.
+When producing a color-quantized image, "-onepass -dither ordered" is fast but
+much lower quality than the default behavior.  "-dither none" may give
+acceptable results in two-pass mode, but is seldom tolerable in one-pass mode.
+
+Two-pass color quantization requires a good deal of memory; on MS-DOS machines
+it may run out of memory even with -maxmemory 0.  In that case you can still
+decompress, with some loss of image quality, by specifying -onepass for
+one-pass quantization.
+
+To avoid the Unisys LZW patent (now expired), djpeg produces uncompressed GIF
+files.  These are larger than they should be, but are readable by standard GIF
+decoders.
+
+
+HINTS FOR BOTH PROGRAMS
+
+If more space is needed than will fit in the available main memory (as
+determined by -maxmemory), temporary files will be used.  (MS-DOS versions
+will try to get extended or expanded memory first.)  The temporary files are
+often rather large: in typical cases they occupy three bytes per pixel, for
+example 3*800*600 = 1.44 MB for an 800x600 image.  If you don't have enough
+free disk space, leave out -progressive and -optimize (for cjpeg) or specify
+-onepass (for djpeg).
+
+On MS-DOS, the temporary files are created in the directory named by the TMP
+or TEMP environment variable, or in the current directory if neither of those
+exists.  Amiga implementations put the temp files in the directory named by
+JPEGTMP:, so be sure to assign JPEGTMP: to a disk partition with adequate free
+space.
+
+The default memory usage limit (-maxmemory) is set when the software is
+compiled.  If you get an "insufficient memory" error, try specifying a smaller
+-maxmemory value, even -maxmemory 0 to use the absolute minimum space.  You
+may want to recompile with a smaller default value if this happens often.
+
+On machines that have "environment" variables, you can define the environment
+variable JPEGMEM to set the default memory limit.  The value is specified as
+described for the -maxmemory switch.  JPEGMEM overrides the default value
+specified when the program was compiled, and itself is overridden by an
+explicit -maxmemory switch.
+
+On MS-DOS machines, -maxmemory is the amount of main (conventional) memory to
+use.  (Extended or expanded memory is also used if available.)  Most
+DOS-specific versions of this software do their own memory space estimation
+and do not need you to specify -maxmemory.
+
+
+JPEGTRAN
+
+jpegtran performs various useful transformations of JPEG files.
+It can translate the coded representation from one variant of JPEG to another,
+for example from baseline JPEG to progressive JPEG or vice versa.  It can also
+perform some rearrangements of the image data, for example turning an image
+from landscape to portrait format by rotation.  For EXIF files and JPEG files
+containing Exif data, you may prefer to use exiftran instead.
+
+jpegtran works by rearranging the compressed data (DCT coefficients), without
+ever fully decoding the image.  Therefore, its transformations are lossless:
+there is no image degradation at all, which would not be true if you used
+djpeg followed by cjpeg to accomplish the same conversion.  But by the same
+token, jpegtran cannot perform lossy operations such as changing the image
+quality.  However, while the image data is losslessly transformed, metadata
+can be removed.  See the -copy option for specifics.
+
+jpegtran uses a command line syntax similar to cjpeg or djpeg.
+On Unix-like systems, you say:
+        jpegtran [switches] [inputfile] >outputfile
+On most non-Unix systems, you say:
+        jpegtran [switches] inputfile outputfile
+where both the input and output files are JPEG files.
+
+To specify the coded JPEG representation used in the output file,
+jpegtran accepts a subset of the switches recognized by cjpeg:
+        -optimize       Perform optimization of entropy encoding parameters.
+        -progressive    Create progressive JPEG file.
+        -arithmetic     Use arithmetic coding.
+        -restart N      Emit a JPEG restart marker every N MCU rows, or every
+                        N MCU blocks if "B" is attached to the number.
+        -scans file     Use the scan script given in the specified text file.
+See the previous discussion of cjpeg for more details about these switches.
+If you specify none of these switches, you get a plain baseline-JPEG output
+file.  The quality setting and so forth are determined by the input file.
+
+The image can be losslessly transformed by giving one of these switches:
+        -flip horizontal        Mirror image horizontally (left-right).
+        -flip vertical          Mirror image vertically (top-bottom).
+        -rotate 90              Rotate image 90 degrees clockwise.
+        -rotate 180             Rotate image 180 degrees.
+        -rotate 270             Rotate image 270 degrees clockwise (or 90 ccw).
+        -transpose              Transpose image (across UL-to-LR axis).
+        -transverse             Transverse transpose (across UR-to-LL axis).
+
+The transpose transformation has no restrictions regarding image dimensions.
+The other transformations operate rather oddly if the image dimensions are not
+a multiple of the iMCU size (usually 8 or 16 pixels), because they can only
+transform complete blocks of DCT coefficient data in the desired way.
+
+jpegtran's default behavior when transforming an odd-size image is designed
+to preserve exact reversibility and mathematical consistency of the
+transformation set.  As stated, transpose is able to flip the entire image
+area.  Horizontal mirroring leaves any partial iMCU column at the right edge
+untouched, but is able to flip all rows of the image.  Similarly, vertical
+mirroring leaves any partial iMCU row at the bottom edge untouched, but is
+able to flip all columns.  The other transforms can be built up as sequences
+of transpose and flip operations; for consistency, their actions on edge
+pixels are defined to be the same as the end result of the corresponding
+transpose-and-flip sequence.
+
+For practical use, you may prefer to discard any untransformable edge pixels
+rather than having a strange-looking strip along the right and/or bottom edges
+of a transformed image.  To do this, add the -trim switch:
+        -trim           Drop non-transformable edge blocks.
+Obviously, a transformation with -trim is not reversible, so strictly speaking
+jpegtran with this switch is not lossless.  Also, the expected mathematical
+equivalences between the transformations no longer hold.  For example,
+"-rot 270 -trim" trims only the bottom edge, but "-rot 90 -trim" followed by
+"-rot 180 -trim" trims both edges.
+
+If you are only interested in perfect transformations, add the -perfect switch:
+        -perfect        Fail with an error if the transformation is not
+                        perfect.
+For example, you may want to do
+  jpegtran -rot 90 -perfect foo.jpg || djpeg foo.jpg | pnmflip -r90 | cjpeg
+to do a perfect rotation, if available, or an approximated one if not.
+
+This version of jpegtran also offers a lossless crop option, which discards
+data outside of a given image region but losslessly preserves what is inside.
+Like the rotate and flip transforms, lossless crop is restricted by the current
+JPEG format; the upper left corner of the selected region must fall on an iMCU
+boundary.  If it doesn't, then it is silently moved up and/or left to the
+nearest iMCU boundary (the lower right corner is unchanged.)  Thus, the output
+image covers at least the requested region, but it may cover more.  The
+adjustment of the region dimensions may be optionally disabled by attaching an
+'f' character ("force") to the width or height number.
+
+The image can be losslessly cropped by giving the switch:
+        -crop WxH+X+Y   Crop to a rectangular region of width W and height H,
+                        starting at point X,Y.
+
+Other not-strictly-lossless transformation switches are:
+
+        -grayscale      Force grayscale output.
+This option discards the chrominance channels if the input image is YCbCr
+(i.e., a standard color JPEG), resulting in a grayscale JPEG file.  The
+luminance channel is preserved exactly, so this is a better method of reducing
+to grayscale than decompression, conversion, and recompression.  This switch
+is particularly handy for fixing a monochrome picture that was mistakenly
+encoded as a color JPEG.  (In such a case, the space savings from getting rid
+of the near-empty chroma channels won't be large; but the decoding time for
+a grayscale JPEG is substantially less than that for a color JPEG.)
+
+jpegtran also recognizes these switches that control what to do with "extra"
+markers, such as comment blocks:
+        -copy none      Copy no extra markers from source file.  This setting
+                        suppresses all comments and other metadata in the
+                        source file.
+        -copy comments  Copy only comment markers.  This setting copies
+                        comments from the source file but discards any other
+                        metadata.
+        -copy all       Copy all extra markers.  This setting preserves
+                        miscellaneous markers found in the source file, such
+                        as JFIF thumbnails, Exif data, and Photoshop settings.
+                        In some files, these extra markers can be sizable.
+                        Note that this option will copy thumbnails as-is;
+                        they will not be transformed.
+The default behavior is -copy comments.  (Note: in IJG releases v6 and v6a,
+jpegtran always did the equivalent of -copy none.)
+
+Additional switches recognized by jpegtran are:
+        -outfile filename
+        -maxmemory N
+        -verbose
+        -debug
+These work the same as in cjpeg or djpeg.
+
+
+THE COMMENT UTILITIES
+
+The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
+Although the standard doesn't actually define what COM blocks are for, they
+are widely used to hold user-supplied text strings.  This lets you add
+annotations, titles, index terms, etc. to your JPEG files, and later retrieve
+them as text.  COM blocks do not interfere with the image stored in the JPEG
+file.  The maximum size of a COM block is 64K, but you can have as many of
+them as you like in one JPEG file.
+
+We provide two utility programs to display COM block contents and add COM
+blocks to a JPEG file.
+
+rdjpgcom searches a JPEG file and prints the contents of any COM blocks on
+standard output.  The command line syntax is
+        rdjpgcom [-raw] [-verbose] [inputfilename]
+The switch "-raw" (or just "-r") causes rdjpgcom to output non-printable
+characters in JPEG comments.  These characters are normally escaped for
+security reasons.
+The switch "-verbose" (or just "-v") causes rdjpgcom to also display the JPEG
+image dimensions.  If you omit the input file name from the command line,
+the JPEG file is read from standard input.  (This may not work on some
+operating systems, if binary data can't be read from stdin.)
+
+wrjpgcom adds a COM block, containing text you provide, to a JPEG file.
+Ordinarily, the COM block is added after any existing COM blocks, but you
+can delete the old COM blocks if you wish.  wrjpgcom produces a new JPEG
+file; it does not modify the input file.  DO NOT try to overwrite the input
+file by directing wrjpgcom's output back into it; on most systems this will
+just destroy your file.
+
+The command line syntax for wrjpgcom is similar to cjpeg's.  On Unix-like
+systems, it is
+        wrjpgcom [switches] [inputfilename]
+The output file is written to standard output.  The input file comes from
+the named file, or from standard input if no input file is named.
+
+On most non-Unix systems, the syntax is
+        wrjpgcom [switches] inputfilename outputfilename
+where both input and output file names must be given explicitly.
+
+wrjpgcom understands three switches:
+        -replace                 Delete any existing COM blocks from the file.
+        -comment "Comment text"  Supply new COM text on command line.
+        -cfile name              Read text for new COM block from named file.
+(Switch names can be abbreviated.)  If you have only one line of comment text
+to add, you can provide it on the command line with -comment.  The comment
+text must be surrounded with quotes so that it is treated as a single
+argument.  Longer comments can be read from a text file.
+
+If you give neither -comment nor -cfile, then wrjpgcom will read the comment
+text from standard input.  (In this case an input image file name MUST be
+supplied, so that the source JPEG file comes from somewhere else.)  You can
+enter multiple lines, up to 64KB worth.  Type an end-of-file indicator
+(usually control-D or control-Z) to terminate the comment text entry.
+
+wrjpgcom will not add a COM block if the provided comment string is empty.
+Therefore -replace -comment "" can be used to delete all COM blocks from a
+file.
+
+These utility programs do not depend on the IJG JPEG library.  In
+particular, the source code for rdjpgcom is intended as an illustration of
+the minimum amount of code required to parse a JPEG file header correctly.
diff --git a/win/jsimdcfg.inc b/win/jsimdcfg.inc
deleted file mode 100644
index 9d4aede..0000000
--- a/win/jsimdcfg.inc
+++ /dev/null
@@ -1,94 +0,0 @@
-;
-; Automatically generated include file from jsimdcfg.inc.h
-;
-;
-; -- jpeglib.h
-;
-%define DCTSIZE 8
-%define DCTSIZE2 64
-;
-; -- jmorecfg.h
-;
-%define RGB_RED 0
-%define RGB_GREEN 1
-%define RGB_BLUE 2
-%define RGB_PIXELSIZE 3
-%define EXT_RGB_RED 0
-%define EXT_RGB_GREEN 1
-%define EXT_RGB_BLUE 2
-%define EXT_RGB_PIXELSIZE 3
-%define EXT_RGBX_RED 0
-%define EXT_RGBX_GREEN 1
-%define EXT_RGBX_BLUE 2
-%define EXT_RGBX_PIXELSIZE 4
-%define EXT_BGR_RED 2
-%define EXT_BGR_GREEN 1
-%define EXT_BGR_BLUE 0
-%define EXT_BGR_PIXELSIZE 3
-%define EXT_BGRX_RED 2
-%define EXT_BGRX_GREEN 1
-%define EXT_BGRX_BLUE 0
-%define EXT_BGRX_PIXELSIZE 4
-%define EXT_XBGR_RED 3
-%define EXT_XBGR_GREEN 2
-%define EXT_XBGR_BLUE 1
-%define EXT_XBGR_PIXELSIZE 4
-%define EXT_XRGB_RED 1
-%define EXT_XRGB_GREEN 2
-%define EXT_XRGB_BLUE 3
-%define EXT_XRGB_PIXELSIZE 4
-%define RGBX_FILLER_0XFF 1
-; Representation of a single sample (pixel element value).
-; On this SIMD implementation, this must be 'unsigned char'.
-;
-%define JSAMPLE byte ; unsigned char
-%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
-%define CENTERJSAMPLE 128
-; Representation of a DCT frequency coefficient.
-; On this SIMD implementation, this must be 'short'.
-;
-%define JCOEF word ; short
-%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
-; Datatype used for image dimensions.
-; On this SIMD implementation, this must be 'unsigned int'.
-;
-%define JDIMENSION dword ; unsigned int
-%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
-%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
-%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
-%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
-%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
-%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
-%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
-%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
-%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
-;
-; -- jdct.h
-;
-; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
-; the DCT is to be performed in-place in that buffer.
-; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
-;
-%define DCTELEM word ; short
-%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
-%define float FP32 ; float
-%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)
-; To maximize parallelism, Type short is changed to short.
-;
-%define ISLOW_MULT_TYPE word ; must be short
-%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
-%define IFAST_MULT_TYPE word ; must be short
-%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
-%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
-%define FLOAT_MULT_TYPE FP32 ; must be float
-%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
-;
-; -- jsimd.h
-;
-%define JSIMD_NONE 0x00
-%define JSIMD_MMX 0x01
-%define JSIMD_3DNOW 0x02
-%define JSIMD_SSE 0x04
-%define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
diff --git a/wizard.txt b/wizard.txt
new file mode 100644
index 0000000..ede721e
--- /dev/null
+++ b/wizard.txt
@@ -0,0 +1,211 @@
+Advanced usage instructions for the Independent JPEG Group's JPEG software
+==========================================================================
+
+This file describes cjpeg's "switches for wizards".
+
+The "wizard" switches are intended for experimentation with JPEG by persons
+who are reasonably knowledgeable about the JPEG standard.  If you don't know
+what you are doing, DON'T USE THESE SWITCHES.  You'll likely produce files
+with worse image quality and/or poorer compression than you'd get from the
+default settings.  Furthermore, these switches must be used with caution
+when making files intended for general use, because not all JPEG decoders
+will support unusual JPEG parameter settings.
+
+
+Quantization Table Adjustment
+-----------------------------
+
+Ordinarily, cjpeg starts with a default set of tables (the same ones given
+as examples in the JPEG standard) and scales them up or down according to
+the -quality setting.  The details of the scaling algorithm can be found in
+jcparam.c.  At very low quality settings, some quantization table entries
+can get scaled up to values exceeding 255.  Although 2-byte quantization
+values are supported by the IJG software, this feature is not in baseline
+JPEG and is not supported by all implementations.  If you need to ensure
+wide compatibility of low-quality files, you can constrain the scaled
+quantization values to no more than 255 by giving the -baseline switch.
+Note that use of -baseline will result in poorer quality for the same file
+size, since more bits than necessary are expended on higher AC coefficients.
+
+You can substitute a different set of quantization values by using the
+-qtables switch:
+
+        -qtables file   Use the quantization tables given in the named file.
+
+The specified file should be a text file containing decimal quantization
+values.  The file should contain one to four tables, each of 64 elements.
+The tables are implicitly numbered 0,1,etc. in order of appearance.  Table
+entries appear in normal array order (NOT in the zigzag order in which they
+will be stored in the JPEG file).
+
+Quantization table files are free format, in that arbitrary whitespace can
+appear between numbers.  Also, comments can be included: a comment starts
+with '#' and extends to the end of the line.  Here is an example file that
+duplicates the default quantization tables:
+
+        # Quantization tables given in JPEG spec, section K.1
+
+        # This is table 0 (the luminance table):
+          16  11  10  16  24  40  51  61
+          12  12  14  19  26  58  60  55
+          14  13  16  24  40  57  69  56
+          14  17  22  29  51  87  80  62
+          18  22  37  56  68 109 103  77
+          24  35  55  64  81 104 113  92
+          49  64  78  87 103 121 120 101
+          72  92  95  98 112 100 103  99
+
+        # This is table 1 (the chrominance table):
+          17  18  24  47  99  99  99  99
+          18  21  26  66  99  99  99  99
+          24  26  56  99  99  99  99  99
+          47  66  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+
+If the -qtables switch is used without -quality, then the specified tables
+are used exactly as-is.  If both -qtables and -quality are used, then the
+tables taken from the file are scaled in the same fashion that the default
+tables would be scaled for that quality setting.  If -baseline appears, then
+the quantization values are constrained to the range 1-255.
+
+By default, cjpeg will use quantization table 0 for luminance components and
+table 1 for chrominance components.  To override this choice, use the -qslots
+switch:
+
+        -qslots N[,...]         Select which quantization table to use for
+                                each color component.
+
+The -qslots switch specifies a quantization table number for each color
+component, in the order in which the components appear in the JPEG SOF marker.
+For example, to create a separate table for each of Y,Cb,Cr, you could
+provide a -qtables file that defines three quantization tables and say
+"-qslots 0,1,2".  If -qslots gives fewer table numbers than there are color
+components, then the last table number is repeated as necessary.
+
+
+Sampling Factor Adjustment
+--------------------------
+
+By default, cjpeg uses 2:1 horizontal and vertical downsampling when
+compressing YCbCr data, and no downsampling for all other color spaces.
+You can override this default with the -sample switch:
+
+        -sample HxV[,...]       Set JPEG sampling factors for each color
+                                component.
+
+The -sample switch specifies the JPEG sampling factors for each color
+component, in the order in which they appear in the JPEG SOF marker.
+If you specify fewer HxV pairs than there are components, the remaining
+components are set to 1x1 sampling.  For example, the default YCbCr setting
+is equivalent to "-sample 2x2,1x1,1x1", which can be abbreviated to
+"-sample 2x2".
+
+There are still some JPEG decoders in existence that support only 2x1
+sampling (also called 4:2:2 sampling).  Compatibility with such decoders can
+be achieved by specifying "-sample 2x1".  This is not recommended unless
+really necessary, since it increases file size and encoding/decoding time
+with very little quality gain.
+
+
+Multiple Scan / Progression Control
+-----------------------------------
+
+By default, cjpeg emits a single-scan sequential JPEG file.  The
+-progressive switch generates a progressive JPEG file using a default series
+of progression parameters.  You can create multiple-scan sequential JPEG
+files or progressive JPEG files with custom progression parameters by using
+the -scans switch:
+
+        -scans file     Use the scan sequence given in the named file.
+
+The specified file should be a text file containing a "scan script".
+The script specifies the contents and ordering of the scans to be emitted.
+Each entry in the script defines one scan.  A scan definition specifies
+the components to be included in the scan, and for progressive JPEG it also
+specifies the progression parameters Ss,Se,Ah,Al for the scan.  Scan
+definitions are separated by semicolons (';').  A semicolon after the last
+scan definition is optional.
+
+Each scan definition contains one to four component indexes, optionally
+followed by a colon (':') and the four progressive-JPEG parameters.  The
+component indexes denote which color component(s) are to be transmitted in
+the scan.  Components are numbered in the order in which they appear in the
+JPEG SOF marker, with the first component being numbered 0.  (Note that these
+indexes are not the "component ID" codes assigned to the components, just
+positional indexes.)
+
+The progression parameters for each scan are:
+        Ss      Zigzag index of first coefficient included in scan
+        Se      Zigzag index of last coefficient included in scan
+        Ah      Zero for first scan of a coefficient, else Al of prior scan
+        Al      Successive approximation low bit position for scan
+If the progression parameters are omitted, the values 0,63,0,0 are used,
+producing a sequential JPEG file.  cjpeg automatically determines whether
+the script represents a progressive or sequential file, by observing whether
+Ss and Se values other than 0 and 63 appear.  (The -progressive switch is
+not needed to specify this; in fact, it is ignored when -scans appears.)
+The scan script must meet the JPEG restrictions on progression sequences.
+(cjpeg checks that the spec's requirements are obeyed.)
+
+Scan script files are free format, in that arbitrary whitespace can appear
+between numbers and around punctuation.  Also, comments can be included: a
+comment starts with '#' and extends to the end of the line.  For additional
+legibility, commas or dashes can be placed between values.  (Actually, any
+single punctuation character other than ':' or ';' can be inserted.)  For
+example, the following two scan definitions are equivalent:
+        0 1 2: 0 63 0 0;
+        0,1,2 : 0-63, 0,0 ;
+
+Here is an example of a scan script that generates a partially interleaved
+sequential JPEG file:
+
+        0;                      # Y only in first scan
+        1 2;                    # Cb and Cr in second scan
+
+Here is an example of a progressive scan script using only spectral selection
+(no successive approximation):
+
+        # Interleaved DC scan for Y,Cb,Cr:
+        0,1,2: 0-0,   0, 0 ;
+        # AC scans:
+        0:     1-2,   0, 0 ;    # First two Y AC coefficients
+        0:     3-5,   0, 0 ;    # Three more
+        1:     1-63,  0, 0 ;    # All AC coefficients for Cb
+        2:     1-63,  0, 0 ;    # All AC coefficients for Cr
+        0:     6-9,   0, 0 ;    # More Y coefficients
+        0:     10-63, 0, 0 ;    # Remaining Y coefficients
+
+Here is an example of a successive-approximation script.  This is equivalent
+to the default script used by "cjpeg -progressive" for YCbCr images:
+
+        # Initial DC scan for Y,Cb,Cr (lowest bit not sent)
+        0,1,2: 0-0,   0, 1 ;
+        # First AC scan: send first 5 Y AC coefficients, minus 2 lowest bits:
+        0:     1-5,   0, 2 ;
+        # Send all Cr,Cb AC coefficients, minus lowest bit:
+        # (chroma data is usually too small to be worth subdividing further;
+        #  but note we send Cr first since eye is least sensitive to Cb)
+        2:     1-63,  0, 1 ;
+        1:     1-63,  0, 1 ;
+        # Send remaining Y AC coefficients, minus 2 lowest bits:
+        0:     6-63,  0, 2 ;
+        # Send next-to-lowest bit of all Y AC coefficients:
+        0:     1-63,  2, 1 ;
+        # At this point we've sent all but the lowest bit of all coefficients.
+        # Send lowest bit of DC coefficients
+        0,1,2: 0-0,   1, 0 ;
+        # Send lowest bit of AC coefficients
+        2:     1-63,  1, 0 ;
+        1:     1-63,  1, 0 ;
+        # Y AC lowest bit scan is last; it's usually the largest scan
+        0:     1-63,  1, 0 ;
+
+It may be worth pointing out that this script is tuned for quality settings
+of around 50 to 75.  For lower quality settings, you'd probably want to use
+a script with fewer stages of successive approximation (otherwise the
+initial scans will be really bad).  For higher quality settings, you might
+want to use more stages of successive approximation (so that the initial
+scans are not too large).
diff --git a/wrbmp.c b/wrbmp.c
index a42e1c1..50e469c 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -1,12 +1,13 @@
 /*
  * wrbmp.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Modifications:
+ * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README file.
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in Microsoft "BMP"
  * format (MS Windows 3.x and OS/2 1.x flavors).
@@ -20,7 +21,8 @@
  * This code contributed by James Arthur Boucher.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jconfigint.h"
 
 #ifdef BMP_SUPPORTED
 
@@ -45,27 +47,27 @@
 /* Private version of data destination object */
 
 typedef struct {
-  struct djpeg_dest_struct pub;	/* public fields */
+  struct djpeg_dest_struct pub; /* public fields */
 
-  boolean is_os2;		/* saves the OS2 format request flag */
+  boolean is_os2;               /* saves the OS2 format request flag */
 
-  jvirt_sarray_ptr whole_image;	/* needed to reverse row order */
-  JDIMENSION data_width;	/* JSAMPLEs per row */
-  JDIMENSION row_width;		/* physical width of one row in the BMP file */
-  int pad_bytes;		/* number of padding bytes needed per row */
-  JDIMENSION cur_output_row;	/* next row# to write to virtual array */
+  jvirt_sarray_ptr whole_image; /* needed to reverse row order */
+  JDIMENSION data_width;        /* JSAMPLEs per row */
+  JDIMENSION row_width;         /* physical width of one row in the BMP file */
+  int pad_bytes;                /* number of padding bytes needed per row */
+  JDIMENSION cur_output_row;    /* next row# to write to virtual array */
 } bmp_dest_struct;
 
-typedef bmp_dest_struct * bmp_dest_ptr;
+typedef bmp_dest_struct *bmp_dest_ptr;
 
 
 /* Forward declarations */
 LOCAL(void) write_colormap
-	JPP((j_decompress_ptr cinfo, bmp_dest_ptr dest,
-	     int map_colors, int map_entry_size));
+        (j_decompress_ptr cinfo, bmp_dest_ptr dest, int map_colors,
+         int map_entry_size);
 
 
-static inline boolean is_big_endian(void)
+static INLINE boolean is_big_endian(void)
 {
   int test_value = 1;
   if(*(char *)&test_value != 1)
@@ -81,7 +83,7 @@
 
 METHODDEF(void)
 put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		JDIMENSION rows_supplied)
+                JDIMENSION rows_supplied)
 /* This version is for writing 24-bit pixels */
 {
   bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
@@ -135,7 +137,7 @@
 
 METHODDEF(void)
 put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-	       JDIMENSION rows_supplied)
+               JDIMENSION rows_supplied)
 /* This version is for grayscale OR quantized color output */
 {
   bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
@@ -154,7 +156,7 @@
   inptr = dest->pub.buffer[0];
   outptr = image_ptr[0];
   for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = *inptr++;	/* can omit GETJSAMPLE() safely */
+    *outptr++ = *inptr++;       /* can omit GETJSAMPLE() safely */
   }
 
   /* Zero out the pad bytes. */
@@ -191,14 +193,14 @@
   char bmpfileheader[14];
   char bmpinfoheader[40];
 #define PUT_2B(array,offset,value)  \
-	(array[offset] = (char) ((value) & 0xFF), \
-	 array[offset+1] = (char) (((value) >> 8) & 0xFF))
+        (array[offset] = (char) ((value) & 0xFF), \
+         array[offset+1] = (char) (((value) >> 8) & 0xFF))
 #define PUT_4B(array,offset,value)  \
-	(array[offset] = (char) ((value) & 0xFF), \
-	 array[offset+1] = (char) (((value) >> 8) & 0xFF), \
-	 array[offset+2] = (char) (((value) >> 16) & 0xFF), \
-	 array[offset+3] = (char) (((value) >> 24) & 0xFF))
-  INT32 headersize, bfSize;
+        (array[offset] = (char) ((value) & 0xFF), \
+         array[offset+1] = (char) (((value) >> 8) & 0xFF), \
+         array[offset+2] = (char) (((value) >> 16) & 0xFF), \
+         array[offset+3] = (char) (((value) >> 24) & 0xFF))
+  long headersize, bfSize;
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
@@ -222,30 +224,30 @@
   }
   /* File size */
   headersize = 14 + 40 + cmap_entries * 4; /* Header and colormap */
-  bfSize = headersize + (INT32) dest->row_width * (INT32) cinfo->output_height;
-  
+  bfSize = headersize + (long) dest->row_width * (long) cinfo->output_height;
+
   /* Set unused fields of header to 0 */
-  MEMZERO(bmpfileheader, SIZEOF(bmpfileheader));
-  MEMZERO(bmpinfoheader, SIZEOF(bmpinfoheader));
+  MEMZERO(bmpfileheader, sizeof(bmpfileheader));
+  MEMZERO(bmpinfoheader, sizeof(bmpinfoheader));
 
   /* Fill the file header */
-  bmpfileheader[0] = 0x42;	/* first 2 bytes are ASCII 'B', 'M' */
+  bmpfileheader[0] = 0x42;      /* first 2 bytes are ASCII 'B', 'M' */
   bmpfileheader[1] = 0x4D;
   PUT_4B(bmpfileheader, 2, bfSize); /* bfSize */
   /* we leave bfReserved1 & bfReserved2 = 0 */
   PUT_4B(bmpfileheader, 10, headersize); /* bfOffBits */
 
   /* Fill the info header (Microsoft calls this a BITMAPINFOHEADER) */
-  PUT_2B(bmpinfoheader, 0, 40);	/* biSize */
+  PUT_2B(bmpinfoheader, 0, 40); /* biSize */
   PUT_4B(bmpinfoheader, 4, cinfo->output_width); /* biWidth */
   PUT_4B(bmpinfoheader, 8, cinfo->output_height); /* biHeight */
-  PUT_2B(bmpinfoheader, 12, 1);	/* biPlanes - must be 1 */
+  PUT_2B(bmpinfoheader, 12, 1); /* biPlanes - must be 1 */
   PUT_2B(bmpinfoheader, 14, bits_per_pixel); /* biBitCount */
   /* we leave biCompression = 0, for none */
   /* we leave biSizeImage = 0; this is correct for uncompressed data */
   if (cinfo->density_unit == 2) { /* if have density in dots/cm, then */
-    PUT_4B(bmpinfoheader, 24, (INT32) (cinfo->X_density*100)); /* XPels/M */
-    PUT_4B(bmpinfoheader, 28, (INT32) (cinfo->Y_density*100)); /* XPels/M */
+    PUT_4B(bmpinfoheader, 24, (long) (cinfo->X_density*100)); /* XPels/M */
+    PUT_4B(bmpinfoheader, 28, (long) (cinfo->Y_density*100)); /* YPels/M */
   }
   PUT_2B(bmpinfoheader, 32, cmap_entries); /* biClrUsed */
   /* we leave biClrImportant = 0 */
@@ -266,7 +268,7 @@
 {
   char bmpfileheader[14];
   char bmpcoreheader[12];
-  INT32 headersize, bfSize;
+  long headersize, bfSize;
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
@@ -290,24 +292,24 @@
   }
   /* File size */
   headersize = 14 + 12 + cmap_entries * 3; /* Header and colormap */
-  bfSize = headersize + (INT32) dest->row_width * (INT32) cinfo->output_height;
-  
+  bfSize = headersize + (long) dest->row_width * (long) cinfo->output_height;
+
   /* Set unused fields of header to 0 */
-  MEMZERO(bmpfileheader, SIZEOF(bmpfileheader));
-  MEMZERO(bmpcoreheader, SIZEOF(bmpcoreheader));
+  MEMZERO(bmpfileheader, sizeof(bmpfileheader));
+  MEMZERO(bmpcoreheader, sizeof(bmpcoreheader));
 
   /* Fill the file header */
-  bmpfileheader[0] = 0x42;	/* first 2 bytes are ASCII 'B', 'M' */
+  bmpfileheader[0] = 0x42;      /* first 2 bytes are ASCII 'B', 'M' */
   bmpfileheader[1] = 0x4D;
   PUT_4B(bmpfileheader, 2, bfSize); /* bfSize */
   /* we leave bfReserved1 & bfReserved2 = 0 */
   PUT_4B(bmpfileheader, 10, headersize); /* bfOffBits */
 
   /* Fill the info header (Microsoft calls this a BITMAPCOREHEADER) */
-  PUT_2B(bmpcoreheader, 0, 12);	/* bcSize */
+  PUT_2B(bmpcoreheader, 0, 12); /* bcSize */
   PUT_2B(bmpcoreheader, 4, cinfo->output_width); /* bcWidth */
   PUT_2B(bmpcoreheader, 6, cinfo->output_height); /* bcHeight */
-  PUT_2B(bmpcoreheader, 8, 1);	/* bcPlanes - must be 1 */
+  PUT_2B(bmpcoreheader, 8, 1);  /* bcPlanes - must be 1 */
   PUT_2B(bmpcoreheader, 10, bits_per_pixel); /* bcBitCount */
 
   if (JFWRITE(dest->pub.output_file, bmpfileheader, 14) != (size_t) 14)
@@ -327,31 +329,31 @@
 
 LOCAL(void)
 write_colormap (j_decompress_ptr cinfo, bmp_dest_ptr dest,
-		int map_colors, int map_entry_size)
+                int map_colors, int map_entry_size)
 {
   JSAMPARRAY colormap = cinfo->colormap;
   int num_colors = cinfo->actual_number_of_colors;
-  FILE * outfile = dest->pub.output_file;
+  FILE *outfile = dest->pub.output_file;
   int i;
 
   if (colormap != NULL) {
     if (cinfo->out_color_components == 3) {
       /* Normal case with RGB colormap */
       for (i = 0; i < num_colors; i++) {
-	putc(GETJSAMPLE(colormap[2][i]), outfile);
-	putc(GETJSAMPLE(colormap[1][i]), outfile);
-	putc(GETJSAMPLE(colormap[0][i]), outfile);
-	if (map_entry_size == 4)
-	  putc(0, outfile);
+        putc(GETJSAMPLE(colormap[2][i]), outfile);
+        putc(GETJSAMPLE(colormap[1][i]), outfile);
+        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        if (map_entry_size == 4)
+          putc(0, outfile);
       }
     } else {
       /* Grayscale colormap (only happens with grayscale quantization) */
       for (i = 0; i < num_colors; i++) {
-	putc(GETJSAMPLE(colormap[0][i]), outfile);
-	putc(GETJSAMPLE(colormap[0][i]), outfile);
-	putc(GETJSAMPLE(colormap[0][i]), outfile);
-	if (map_entry_size == 4)
-	  putc(0, outfile);
+        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        if (map_entry_size == 4)
+          putc(0, outfile);
       }
     }
   } else {
@@ -361,10 +363,10 @@
       putc(i, outfile);
       putc(i, outfile);
       if (map_entry_size == 4)
-	putc(0, outfile);
+        putc(0, outfile);
     }
   }
-  /* Pad colormap with zeros to ensure specified number of colormap entries */ 
+  /* Pad colormap with zeros to ensure specified number of colormap entries */
   if (i > map_colors)
     ERREXIT1(cinfo, JERR_TOO_MANY_COLORS, i);
   for (; i < map_colors; i++) {
@@ -381,7 +383,7 @@
 finish_output_bmp (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
   bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
-  register FILE * outfile = dest->pub.output_file;
+  register FILE *outfile = dest->pub.output_file;
   JSAMPARRAY image_ptr;
   register JSAMPROW data_ptr;
   JDIMENSION row;
@@ -432,7 +434,7 @@
   /* Create module interface object, fill in method pointers */
   dest = (bmp_dest_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(bmp_dest_struct));
+                                  sizeof(bmp_dest_struct));
   dest->pub.start_output = start_output_bmp;
   dest->pub.finish_output = finish_output_bmp;
   dest->is_os2 = is_os2;
diff --git a/wrgif.c b/wrgif.c
index 5fe8328..cc06f1d 100644
--- a/wrgif.c
+++ b/wrgif.c
@@ -1,9 +1,12 @@
 /*
  * wrgif.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in GIF format.
  *
@@ -37,7 +40,7 @@
  *    CompuServe Incorporated."
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef GIF_SUPPORTED
 
@@ -45,31 +48,31 @@
 /* Private version of data destination object */
 
 typedef struct {
-  struct djpeg_dest_struct pub;	/* public fields */
+  struct djpeg_dest_struct pub; /* public fields */
 
-  j_decompress_ptr cinfo;	/* back link saves passing separate parm */
+  j_decompress_ptr cinfo;       /* back link saves passing separate parm */
 
   /* State for packing variable-width codes into a bitstream */
-  int n_bits;			/* current number of bits/code */
-  int maxcode;			/* maximum code, given n_bits */
-  INT32 cur_accum;		/* holds bits not yet output */
-  int cur_bits;			/* # of bits in cur_accum */
+  int n_bits;                   /* current number of bits/code */
+  int maxcode;                  /* maximum code, given n_bits */
+  long cur_accum;               /* holds bits not yet output */
+  int cur_bits;                 /* # of bits in cur_accum */
 
   /* State for GIF code assignment */
-  int ClearCode;		/* clear code (doesn't change) */
-  int EOFCode;			/* EOF code (ditto) */
-  int code_counter;		/* counts output symbols */
+  int ClearCode;                /* clear code (doesn't change) */
+  int EOFCode;                  /* EOF code (ditto) */
+  int code_counter;             /* counts output symbols */
 
   /* GIF data packet construction buffer */
-  int bytesinpkt;		/* # of bytes in current packet */
-  char packetbuf[256];		/* workspace for accumulating packet */
+  int bytesinpkt;               /* # of bytes in current packet */
+  char packetbuf[256];          /* workspace for accumulating packet */
 
 } gif_dest_struct;
 
-typedef gif_dest_struct * gif_dest_ptr;
+typedef gif_dest_struct *gif_dest_ptr;
 
 /* Largest value that will fit in N bits */
-#define MAXCODE(n_bits)	((1 << (n_bits)) - 1)
+#define MAXCODE(n_bits) ((1 << (n_bits)) - 1)
 
 
 /*
@@ -81,10 +84,10 @@
 flush_packet (gif_dest_ptr dinfo)
 /* flush any accumulated data */
 {
-  if (dinfo->bytesinpkt > 0) {	/* never write zero-length packet */
+  if (dinfo->bytesinpkt > 0) {  /* never write zero-length packet */
     dinfo->packetbuf[0] = (char) dinfo->bytesinpkt++;
     if (JFWRITE(dinfo->pub.output_file, dinfo->packetbuf, dinfo->bytesinpkt)
-	!= (size_t) dinfo->bytesinpkt)
+        != (size_t) dinfo->bytesinpkt)
       ERREXIT(dinfo->cinfo, JERR_FILE_WRITE);
     dinfo->bytesinpkt = 0;
   }
@@ -93,10 +96,10 @@
 
 /* Add a character to current packet; flush to disk if necessary */
 #define CHAR_OUT(dinfo,c)  \
-	{ (dinfo)->packetbuf[++(dinfo)->bytesinpkt] = (char) (c);  \
-	    if ((dinfo)->bytesinpkt >= 255)  \
-	      flush_packet(dinfo);  \
-	}
+        { (dinfo)->packetbuf[++(dinfo)->bytesinpkt] = (char) (c);  \
+            if ((dinfo)->bytesinpkt >= 255)  \
+              flush_packet(dinfo);  \
+        }
 
 
 /* Routine to convert variable-width codes into a byte stream */
@@ -106,7 +109,7 @@
 /* Emit a code of n_bits bits */
 /* Uses cur_accum and cur_bits to reblock into 8-bit bytes */
 {
-  dinfo->cur_accum |= ((INT32) code) << dinfo->cur_bits;
+  dinfo->cur_accum |= ((long) code) << dinfo->cur_bits;
   dinfo->cur_bits += dinfo->n_bits;
 
   while (dinfo->cur_bits >= 8) {
@@ -173,7 +176,7 @@
     dinfo->code_counter++;
   } else {
     output(dinfo, dinfo->ClearCode);
-    dinfo->code_counter = dinfo->ClearCode + 2;	/* reset the counter */
+    dinfo->code_counter = dinfo->ClearCode + 2; /* reset the counter */
   }
 }
 
@@ -218,7 +221,7 @@
 LOCAL(void)
 emit_header (gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
 /* Output the GIF file header, including color map */
-/* If colormap==NULL, synthesize a gray-scale colormap */
+/* If colormap==NULL, synthesize a grayscale colormap */
 {
   int BitsPerPixel, ColorMapSize, InitCodeSize, FlagByte;
   int cshift = dinfo->cinfo->data_precision - 8;
@@ -248,9 +251,9 @@
   /* Write the Logical Screen Descriptor */
   put_word(dinfo, (unsigned int) dinfo->cinfo->output_width);
   put_word(dinfo, (unsigned int) dinfo->cinfo->output_height);
-  FlagByte = 0x80;		/* Yes, there is a global color table */
+  FlagByte = 0x80;              /* Yes, there is a global color table */
   FlagByte |= (BitsPerPixel-1) << 4; /* color resolution */
-  FlagByte |= (BitsPerPixel-1);	/* size of global color table */
+  FlagByte |= (BitsPerPixel-1); /* size of global color table */
   putc(FlagByte, dinfo->pub.output_file);
   putc(0, dinfo->pub.output_file); /* Background color index */
   putc(0, dinfo->pub.output_file); /* Reserved (aspect ratio in GIF89) */
@@ -260,18 +263,18 @@
   for (i=0; i < ColorMapSize; i++) {
     if (i < num_colors) {
       if (colormap != NULL) {
-	if (dinfo->cinfo->out_color_space == JCS_RGB) {
-	  /* Normal case: RGB color map */
-	  putc(GETJSAMPLE(colormap[0][i]) >> cshift, dinfo->pub.output_file);
-	  putc(GETJSAMPLE(colormap[1][i]) >> cshift, dinfo->pub.output_file);
-	  putc(GETJSAMPLE(colormap[2][i]) >> cshift, dinfo->pub.output_file);
-	} else {
-	  /* Grayscale "color map": possible if quantizing grayscale image */
-	  put_3bytes(dinfo, GETJSAMPLE(colormap[0][i]) >> cshift);
-	}
+        if (dinfo->cinfo->out_color_space == JCS_RGB) {
+          /* Normal case: RGB color map */
+          putc(GETJSAMPLE(colormap[0][i]) >> cshift, dinfo->pub.output_file);
+          putc(GETJSAMPLE(colormap[1][i]) >> cshift, dinfo->pub.output_file);
+          putc(GETJSAMPLE(colormap[2][i]) >> cshift, dinfo->pub.output_file);
+        } else {
+          /* Grayscale "color map": possible if quantizing grayscale image */
+          put_3bytes(dinfo, GETJSAMPLE(colormap[0][i]) >> cshift);
+        }
       } else {
-	/* Create a gray-scale map of num_colors values, range 0..255 */
-	put_3bytes(dinfo, (i * 255 + (num_colors-1)/2) / (num_colors-1));
+        /* Create a grayscale map of num_colors values, range 0..255 */
+        put_3bytes(dinfo, (i * 255 + (num_colors-1)/2) / (num_colors-1));
       }
     } else {
       /* fill out the map to a power of 2 */
@@ -280,7 +283,7 @@
   }
   /* Write image separator and Image Descriptor */
   putc(',', dinfo->pub.output_file); /* separator */
-  put_word(dinfo, 0);		/* left/top offset */
+  put_word(dinfo, 0);           /* left/top offset */
   put_word(dinfo, 0);
   put_word(dinfo, (unsigned int) dinfo->cinfo->output_width); /* image size */
   put_word(dinfo, (unsigned int) dinfo->cinfo->output_height);
@@ -317,7 +320,7 @@
 
 METHODDEF(void)
 put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		JDIMENSION rows_supplied)
+                JDIMENSION rows_supplied)
 {
   gif_dest_ptr dest = (gif_dest_ptr) dinfo;
   register JSAMPROW ptr;
@@ -364,8 +367,8 @@
   /* Create module interface object, fill in method pointers */
   dest = (gif_dest_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(gif_dest_struct));
-  dest->cinfo = cinfo;		/* make back link for subroutines */
+                                  sizeof(gif_dest_struct));
+  dest->cinfo = cinfo;          /* make back link for subroutines */
   dest->pub.start_output = start_output_gif;
   dest->pub.put_pixel_rows = put_pixel_rows;
   dest->pub.finish_output = finish_output_gif;
diff --git a/wrjpgcom.1 b/wrjpgcom.1
new file mode 100644
index 0000000..d419a99
--- /dev/null
+++ b/wrjpgcom.1
@@ -0,0 +1,103 @@
+.TH WRJPGCOM 1 "15 June 1995"
+.SH NAME
+wrjpgcom \- insert text comments into a JPEG file
+.SH SYNOPSIS
+.B wrjpgcom
+[
+.B \-replace
+]
+[
+.BI \-comment " text"
+]
+[
+.BI \-cfile " name"
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B wrjpgcom
+reads the named JPEG/JFIF file, or the standard input if no file is named,
+and generates a new JPEG/JFIF file on standard output.  A comment block is
+added to the file.
+.PP
+The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
+Although the standard doesn't actually define what COM blocks are for, they
+are widely used to hold user-supplied text strings.  This lets you add
+annotations, titles, index terms, etc. to your JPEG files, and later retrieve
+them as text.  COM blocks do not interfere with the image stored in the JPEG
+file.  The maximum size of a COM block is 64KB, but you can have as many of
+them as you like in one JPEG file.
+.PP
+.B wrjpgcom
+adds a COM block, containing text you provide, to a JPEG file.
+Ordinarily, the COM block is added after any existing COM blocks; but you
+can delete the old COM blocks if you wish.
+.SH OPTIONS
+Switch names may be abbreviated, and are not case sensitive.
+.TP
+.B \-replace
+Delete any existing COM blocks from the file.
+.TP
+.BI \-comment " text"
+Supply text for new COM block on command line.
+.TP
+.BI \-cfile " name"
+Read text for new COM block from named file.
+.PP
+If you have only one line of comment text to add, you can provide it on the
+command line with
+.BR \-comment .
+The comment text must be surrounded with quotes so that it is treated as a
+single argument.  Longer comments can be read from a text file.
+.PP
+If you give neither
+.B \-comment
+nor
+.BR \-cfile ,
+then
+.B wrjpgcom
+will read the comment text from standard input.  (In this case an input image
+file name MUST be supplied, so that the source JPEG file comes from somewhere
+else.)  You can enter multiple lines, up to 64KB worth.  Type an end-of-file
+indicator (usually control-D) to terminate the comment text entry.
+.PP
+.B wrjpgcom
+will not add a COM block if the provided comment string is empty.  Therefore
+\fB\-replace \-comment ""\fR can be used to delete all COM blocks from a file.
+.SH EXAMPLES
+.LP
+Add a short comment to in.jpg, producing out.jpg:
+.IP
+.B wrjpgcom \-c
+\fI"View of my back yard" in.jpg
+.B >
+.I out.jpg
+.PP
+Attach a long comment previously stored in comment.txt:
+.IP
+.B wrjpgcom
+.I in.jpg
+.B <
+.I comment.txt
+.B >
+.I out.jpg
+.PP
+or equivalently
+.IP
+.B wrjpgcom
+.B \-cfile
+.I comment.txt
+.B <
+.I in.jpg
+.B >
+.I out.jpg
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR djpeg (1),
+.BR jpegtran (1),
+.BR rdjpgcom (1)
+.SH AUTHOR
+Independent JPEG Group
diff --git a/wrjpgcom.c b/wrjpgcom.c
index 8c04b05..cd67afd 100644
--- a/wrjpgcom.c
+++ b/wrjpgcom.c
@@ -1,9 +1,12 @@
 /*
  * wrjpgcom.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2014, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains a very simple stand-alone application that inserts
  * user-supplied text as a COM (comment) marker in a JFIF file.
@@ -11,59 +14,50 @@
  * JPEG markers.
  */
 
-#define JPEG_CJPEG_DJPEG	/* to get the command-line config symbols */
-#include "jinclude.h"		/* get auto-config symbols, <stdio.h> */
+#define JPEG_CJPEG_DJPEG        /* to get the command-line config symbols */
+#include "jinclude.h"           /* get auto-config symbols, <stdio.h> */
 
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc() */
-extern void * malloc ();
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc() */
+extern void *malloc ();
 #endif
-#include <ctype.h>		/* to declare isupper(), tolower() */
+#include <ctype.h>              /* to declare isupper(), tolower() */
 #ifdef USE_SETMODE
-#include <fcntl.h>		/* to declare setmode()'s parameter macros */
+#include <fcntl.h>              /* to declare setmode()'s parameter macros */
 /* If you have setmode() but not <io.h>, just delete this line: */
-#include <io.h>			/* to declare setmode() */
+#include <io.h>                 /* to declare setmode() */
 #endif
 
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef USE_CCOMMAND             /* command-line reader for Macintosh */
 #ifdef __MWERKS__
 #include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
+#include <console.h>            /* ... and this */
 #endif
 #ifdef THINK_C
-#include <console.h>		/* Think declares it here */
+#include <console.h>            /* Think declares it here */
 #endif
 #endif
 
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#define WRITE_BINARY	"w"
+#ifdef DONT_USE_B_MODE          /* define mode parameters for fopen() */
+#define READ_BINARY     "r"
+#define WRITE_BINARY    "w"
 #else
-#ifdef VMS			/* VMS is very nonstandard */
-#define READ_BINARY	"rb", "ctx=stm"
-#define WRITE_BINARY	"wb", "ctx=stm"
-#else				/* standard ANSI-compliant case */
-#define READ_BINARY	"rb"
-#define WRITE_BINARY	"wb"
-#endif
+#define READ_BINARY     "rb"
+#define WRITE_BINARY    "wb"
 #endif
 
-#ifndef EXIT_FAILURE		/* define exit() codes if not provided */
+#ifndef EXIT_FAILURE            /* define exit() codes if not provided */
 #define EXIT_FAILURE  1
 #endif
 #ifndef EXIT_SUCCESS
-#ifdef VMS
-#define EXIT_SUCCESS  1		/* VMS is very nonstandard */
-#else
 #define EXIT_SUCCESS  0
 #endif
-#endif
 
 /* Reduce this value if your malloc() can't allocate blocks up to 64K.
  * On DOS, compiling in large model is usually a better solution.
  */
 
 #ifndef MAX_COM_LENGTH
-#define MAX_COM_LENGTH 65000L	/* must be <= 65533 in any case */
+#define MAX_COM_LENGTH 65000L   /* must be <= 65533 in any case */
 #endif
 
 
@@ -72,12 +66,12 @@
  * To reuse this code in another application, you might need to change these.
  */
 
-static FILE * infile;		/* input JPEG file */
+static FILE *infile;            /* input JPEG file */
 
 /* Return next input byte, or EOF if no more */
 #define NEXTBYTE()  getc(infile)
 
-static FILE * outfile;		/* output JPEG file */
+static FILE *outfile;           /* output JPEG file */
 
 /* Emit an output byte */
 #define PUTBYTE(x)  putc((x), outfile)
@@ -154,11 +148,11 @@
  * in this program.  (See jdmarker.c for a more complete list.)
  */
 
-#define M_SOF0  0xC0		/* Start Of Frame N */
-#define M_SOF1  0xC1		/* N indicates which compression process */
-#define M_SOF2  0xC2		/* Only SOF0-SOF2 are now in common use */
+#define M_SOF0  0xC0            /* Start Of Frame N */
+#define M_SOF1  0xC1            /* N indicates which compression process */
+#define M_SOF2  0xC2            /* Only SOF0-SOF2 are now in common use */
 #define M_SOF3  0xC3
-#define M_SOF5  0xC5		/* NB: codes C4 and CC are NOT SOF markers */
+#define M_SOF5  0xC5            /* NB: codes C4 and CC are NOT SOF markers */
 #define M_SOF6  0xC6
 #define M_SOF7  0xC7
 #define M_SOF9  0xC9
@@ -167,10 +161,10 @@
 #define M_SOF13 0xCD
 #define M_SOF14 0xCE
 #define M_SOF15 0xCF
-#define M_SOI   0xD8		/* Start Of Image (beginning of datastream) */
-#define M_EOI   0xD9		/* End Of Image (end of datastream) */
-#define M_SOS   0xDA		/* Start Of Scan (begins compressed data) */
-#define M_COM   0xFE		/* COMment */
+#define M_SOI   0xD8            /* Start Of Image (beginning of datastream) */
+#define M_EOI   0xD9            /* End Of Image (end of datastream) */
+#define M_SOS   0xDA            /* Start Of Scan (begins compressed data) */
+#define M_COM   0xFE            /* COMment */
 
 
 /*
@@ -302,40 +296,40 @@
       /* Note that marker codes 0xC4, 0xC8, 0xCC are not, and must not be,
        * treated as SOFn.  C4 in particular is actually DHT.
        */
-    case M_SOF0:		/* Baseline */
-    case M_SOF1:		/* Extended sequential, Huffman */
-    case M_SOF2:		/* Progressive, Huffman */
-    case M_SOF3:		/* Lossless, Huffman */
-    case M_SOF5:		/* Differential sequential, Huffman */
-    case M_SOF6:		/* Differential progressive, Huffman */
-    case M_SOF7:		/* Differential lossless, Huffman */
-    case M_SOF9:		/* Extended sequential, arithmetic */
-    case M_SOF10:		/* Progressive, arithmetic */
-    case M_SOF11:		/* Lossless, arithmetic */
-    case M_SOF13:		/* Differential sequential, arithmetic */
-    case M_SOF14:		/* Differential progressive, arithmetic */
-    case M_SOF15:		/* Differential lossless, arithmetic */
+    case M_SOF0:                /* Baseline */
+    case M_SOF1:                /* Extended sequential, Huffman */
+    case M_SOF2:                /* Progressive, Huffman */
+    case M_SOF3:                /* Lossless, Huffman */
+    case M_SOF5:                /* Differential sequential, Huffman */
+    case M_SOF6:                /* Differential progressive, Huffman */
+    case M_SOF7:                /* Differential lossless, Huffman */
+    case M_SOF9:                /* Extended sequential, arithmetic */
+    case M_SOF10:               /* Progressive, arithmetic */
+    case M_SOF11:               /* Lossless, arithmetic */
+    case M_SOF13:               /* Differential sequential, arithmetic */
+    case M_SOF14:               /* Differential progressive, arithmetic */
+    case M_SOF15:               /* Differential lossless, arithmetic */
       return marker;
 
-    case M_SOS:			/* should not see compressed data before SOF */
+    case M_SOS:                 /* should not see compressed data before SOF */
       ERREXIT("SOS without prior SOFn");
       break;
 
-    case M_EOI:			/* in case it's a tables-only JPEG stream */
+    case M_EOI:                 /* in case it's a tables-only JPEG stream */
       return marker;
 
-    case M_COM:			/* Existing COM: conditionally discard */
+    case M_COM:                 /* Existing COM: conditionally discard */
       if (keep_COM) {
-	write_marker(marker);
-	copy_variable();
+        write_marker(marker);
+        copy_variable();
       } else {
-	skip_variable();
+        skip_variable();
       }
       break;
 
-    default:			/* Anything else just gets copied */
+    default:                    /* Anything else just gets copied */
       write_marker(marker);
-      copy_variable();		/* we assume it has a parameter count... */
+      copy_variable();          /* we assume it has a parameter count... */
       break;
     }
   } /* end loop */
@@ -344,7 +338,7 @@
 
 /* Command line parsing code */
 
-static const char * progname;	/* program name for error messages */
+static const char *progname;    /* program name for error messages */
 
 
 static void
@@ -370,7 +364,7 @@
   fprintf(stderr, "If you do not give either -comment or -cfile on the command line,\n");
   fprintf(stderr, "then the comment text is read from standard input.\n");
   fprintf(stderr, "It can be multiple lines, up to %u characters total.\n",
-	  (unsigned int) MAX_COM_LENGTH);
+          (unsigned int) MAX_COM_LENGTH);
 #ifndef TWO_FILE_COMMANDLINE
   fprintf(stderr, "You must specify an input JPEG file name when supplying\n");
   fprintf(stderr, "comment text from standard input.\n");
@@ -381,7 +375,7 @@
 
 
 static int
-keymatch (char * arg, const char * keyword, int minchars)
+keymatch (char *arg, const char *keyword, int minchars)
 /* Case-insensitive matching of (possibly abbreviated) keyword switches. */
 /* keyword is the constant keyword (must be lower case already), */
 /* minchars is length of minimum legal abbreviation. */
@@ -391,17 +385,17 @@
 
   while ((ca = *arg++) != '\0') {
     if ((ck = *keyword++) == '\0')
-      return 0;			/* arg longer than keyword, no good */
-    if (isupper(ca))		/* force arg to lcase (assume ck is already) */
+      return 0;                 /* arg longer than keyword, no good */
+    if (isupper(ca))            /* force arg to lcase (assume ck is already) */
       ca = tolower(ca);
     if (ca != ck)
-      return 0;			/* no good */
-    nmatched++;			/* count matched characters */
+      return 0;                 /* no good */
+    nmatched++;                 /* count matched characters */
   }
   /* reached end of argument; fail if it's too short for unique abbrev */
   if (nmatched < minchars)
     return 0;
-  return 1;			/* A-OK */
+  return 1;                     /* A-OK */
 }
 
 
@@ -413,10 +407,10 @@
 main (int argc, char **argv)
 {
   int argn;
-  char * arg;
+  char *arg;
   int keep_COM = 1;
-  char * comment_arg = NULL;
-  FILE * comment_file = NULL;
+  char *comment_arg = NULL;
+  FILE *comment_file = NULL;
   unsigned int comment_length = 0;
   int marker;
 
@@ -427,21 +421,21 @@
 
   progname = argv[0];
   if (progname == NULL || progname[0] == 0)
-    progname = "wrjpgcom";	/* in case C library doesn't provide it */
+    progname = "wrjpgcom";      /* in case C library doesn't provide it */
 
   /* Parse switches, if any */
   for (argn = 1; argn < argc; argn++) {
     arg = argv[argn];
     if (arg[0] != '-')
-      break;			/* not switch, must be file name */
-    arg++;			/* advance over '-' */
+      break;                    /* not switch, must be file name */
+    arg++;                      /* advance over '-' */
     if (keymatch(arg, "replace", 1)) {
       keep_COM = 0;
     } else if (keymatch(arg, "cfile", 2)) {
       if (++argn >= argc) usage();
       if ((comment_file = fopen(argv[argn], "r")) == NULL) {
-	fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
-	exit(EXIT_FAILURE);
+        fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
+        exit(EXIT_FAILURE);
       }
     } else if (keymatch(arg, "comment", 1)) {
       if (++argn >= argc) usage();
@@ -450,21 +444,36 @@
        * under MS-DOG and must parse out the quoted string ourselves.  Sigh.
        */
       if (comment_arg[0] == '"') {
-	comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH);
-	if (comment_arg == NULL)
-	  ERREXIT("Insufficient memory");
-	strcpy(comment_arg, argv[argn]+1);
-	for (;;) {
-	  comment_length = (unsigned int) strlen(comment_arg);
-	  if (comment_length > 0 && comment_arg[comment_length-1] == '"') {
-	    comment_arg[comment_length-1] = '\0'; /* zap terminating quote */
-	    break;
-	  }
-	  if (++argn >= argc)
-	    ERREXIT("Missing ending quote mark");
-	  strcat(comment_arg, " ");
-	  strcat(comment_arg, argv[argn]);
-	}
+        comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH);
+        if (comment_arg == NULL)
+          ERREXIT("Insufficient memory");
+        if (strlen(argv[argn]) + 2 >= (size_t) MAX_COM_LENGTH) {
+          fprintf(stderr, "Comment text may not exceed %u bytes\n",
+                  (unsigned int) MAX_COM_LENGTH);
+          exit(EXIT_FAILURE);
+        }
+        strcpy(comment_arg, argv[argn]+1);
+        for (;;) {
+          comment_length = (unsigned int) strlen(comment_arg);
+          if (comment_length > 0 && comment_arg[comment_length-1] == '"') {
+            comment_arg[comment_length-1] = '\0'; /* zap terminating quote */
+            break;
+          }
+          if (++argn >= argc)
+            ERREXIT("Missing ending quote mark");
+          if (strlen(comment_arg) + strlen(argv[argn]) + 2 >=
+              (size_t) MAX_COM_LENGTH) {
+            fprintf(stderr, "Comment text may not exceed %u bytes\n",
+                    (unsigned int) MAX_COM_LENGTH);
+            exit(EXIT_FAILURE);
+          }
+          strcat(comment_arg, " ");
+          strcat(comment_arg, argv[argn]);
+        }
+      } else if (strlen(argv[argn]) >= (size_t) MAX_COM_LENGTH) {
+        fprintf(stderr, "Comment text may not exceed %u bytes\n",
+                (unsigned int) MAX_COM_LENGTH);
+        exit(EXIT_FAILURE);
       }
       comment_length = (unsigned int) strlen(comment_arg);
     } else
@@ -488,10 +497,10 @@
     }
   } else {
     /* default input file is stdin */
-#ifdef USE_SETMODE		/* need to hack file mode? */
+#ifdef USE_SETMODE              /* need to hack file mode? */
     setmode(fileno(stdin), O_BINARY);
 #endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
+#ifdef USE_FDOPEN               /* need to re-open in binary mode? */
     if ((infile = fdopen(fileno(stdin), READ_BINARY)) == NULL) {
       fprintf(stderr, "%s: can't open stdin\n", progname);
       exit(EXIT_FAILURE);
@@ -506,7 +515,7 @@
   /* Must have explicit output file name */
   if (argn != argc-2) {
     fprintf(stderr, "%s: must name one input and one output file\n",
-	    progname);
+            progname);
     usage();
   }
   if ((outfile = fopen(argv[argn+1], WRITE_BINARY)) == NULL) {
@@ -520,10 +529,10 @@
     usage();
   }
   /* default output file is stdout */
-#ifdef USE_SETMODE		/* need to hack file mode? */
+#ifdef USE_SETMODE              /* need to hack file mode? */
   setmode(fileno(stdout), O_BINARY);
 #endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
+#ifdef USE_FDOPEN               /* need to re-open in binary mode? */
   if ((outfile = fdopen(fileno(stdout), WRITE_BINARY)) == NULL) {
     fprintf(stderr, "%s: can't open stdout\n", progname);
     exit(EXIT_FAILURE);
@@ -535,7 +544,7 @@
 
   /* Collect comment text from comment_file or stdin, if necessary */
   if (comment_arg == NULL) {
-    FILE * src_file;
+    FILE *src_file;
     int c;
 
     comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH);
@@ -545,9 +554,9 @@
     src_file = (comment_file != NULL ? comment_file : stdin);
     while ((c = getc(src_file)) != EOF) {
       if (comment_length >= (unsigned int) MAX_COM_LENGTH) {
-	fprintf(stderr, "Comment text may not exceed %u bytes\n",
-		(unsigned int) MAX_COM_LENGTH);
-	exit(EXIT_FAILURE);
+        fprintf(stderr, "Comment text may not exceed %u bytes\n",
+                (unsigned int) MAX_COM_LENGTH);
+        exit(EXIT_FAILURE);
       }
       comment_arg[comment_length++] = (char) c;
     }
@@ -579,5 +588,5 @@
 
   /* All done. */
   exit(EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
+  return 0;                     /* suppress no-return-value warnings */
 }
diff --git a/wrppm.c b/wrppm.c
index 68e0c85..40fbf1f 100644
--- a/wrppm.c
+++ b/wrppm.c
@@ -1,10 +1,13 @@
 /*
  * wrppm.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * Modified 2009 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in PPM/PGM format.
  * The extended 2-byte-per-sample raw PPM/PGM formats are supported.
@@ -16,7 +19,8 @@
  * an ordinary stdio stream.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "wrppm.h"
 
 #ifdef PPM_SUPPORTED
 
@@ -42,11 +46,11 @@
 #define PPM_MAXVAL 255
 #else
 /* The word-per-sample format always puts the MSB first. */
-#define PUTPPMSAMPLE(ptr,v)			\
-	{ register int val_ = v;		\
-	  *ptr++ = (char) ((val_ >> 8) & 0xFF);	\
-	  *ptr++ = (char) (val_ & 0xFF);	\
-	}
+#define PUTPPMSAMPLE(ptr,v)                     \
+        { register int val_ = v;                \
+          *ptr++ = (char) ((val_ >> 8) & 0xFF); \
+          *ptr++ = (char) (val_ & 0xFF);        \
+        }
 #define BYTESPERSAMPLE 2
 #define PPM_MAXVAL ((1<<BITS_IN_JSAMPLE)-1)
 #endif
@@ -55,30 +59,10 @@
 
 /*
  * When JSAMPLE is the same size as char, we can just fwrite() the
- * decompressed data to the PPM or PGM file.  On PCs, in order to make this
- * work the output buffer must be allocated in near data space, because we are
- * assuming small-data memory model wherein fwrite() can't reach far memory.
- * If you need to process very wide images on a PC, you might have to compile
- * in large-memory model, or else replace fwrite() with a putc() loop ---
- * which will be much slower.
+ * decompressed data to the PPM or PGM file.
  */
 
 
-/* Private version of data destination object */
-
-typedef struct {
-  struct djpeg_dest_struct pub;	/* public fields */
-
-  /* Usually these two pointers point to the same place: */
-  char *iobuffer;		/* fwrite's I/O buffer */
-  JSAMPROW pixrow;		/* decompressor output buffer */
-  size_t buffer_width;		/* width of I/O buffer */
-  JDIMENSION samples_per_row;	/* JSAMPLEs per output row */
-} ppm_dest_struct;
-
-typedef ppm_dest_struct * ppm_dest_ptr;
-
-
 /*
  * Write some pixel data.
  * In this module rows_supplied will always be 1.
@@ -89,7 +73,7 @@
 
 METHODDEF(void)
 put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		JDIMENSION rows_supplied)
+                JDIMENSION rows_supplied)
 {
   ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
 
@@ -104,10 +88,10 @@
 
 METHODDEF(void)
 copy_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		 JDIMENSION rows_supplied)
+                 JDIMENSION rows_supplied)
 {
   ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
-  register char * bufferptr;
+  register char *bufferptr;
   register JSAMPROW ptr;
   register JDIMENSION col;
 
@@ -127,10 +111,10 @@
 
 METHODDEF(void)
 put_demapped_rgb (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		  JDIMENSION rows_supplied)
+                  JDIMENSION rows_supplied)
 {
   ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
-  register char * bufferptr;
+  register char *bufferptr;
   register int pixval;
   register JSAMPROW ptr;
   register JSAMPROW color_map0 = cinfo->colormap[0];
@@ -152,10 +136,10 @@
 
 METHODDEF(void)
 put_demapped_gray (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		   JDIMENSION rows_supplied)
+                   JDIMENSION rows_supplied)
 {
   ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
-  register char * bufferptr;
+  register char *bufferptr;
   register JSAMPROW ptr;
   register JSAMPROW color_map = cinfo->colormap[0];
   register JDIMENSION col;
@@ -183,14 +167,14 @@
   case JCS_GRAYSCALE:
     /* emit header for raw PGM format */
     fprintf(dest->pub.output_file, "P5\n%ld %ld\n%d\n",
-	    (long) cinfo->output_width, (long) cinfo->output_height,
-	    PPM_MAXVAL);
+            (long) cinfo->output_width, (long) cinfo->output_height,
+            PPM_MAXVAL);
     break;
   case JCS_RGB:
     /* emit header for raw PPM format */
     fprintf(dest->pub.output_file, "P6\n%ld %ld\n%d\n",
-	    (long) cinfo->output_width, (long) cinfo->output_height,
-	    PPM_MAXVAL);
+            (long) cinfo->output_width, (long) cinfo->output_height,
+            PPM_MAXVAL);
     break;
   default:
     ERREXIT(cinfo, JERR_PPM_COLORSPACE);
@@ -224,21 +208,21 @@
   /* Create module interface object, fill in method pointers */
   dest = (ppm_dest_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(ppm_dest_struct));
+                                  sizeof(ppm_dest_struct));
   dest->pub.start_output = start_output_ppm;
   dest->pub.finish_output = finish_output_ppm;
 
   /* Calculate output image dimensions so we can allocate space */
   jpeg_calc_output_dimensions(cinfo);
 
-  /* Create physical I/O buffer.  Note we make this near on a PC. */
+  /* Create physical I/O buffer */
   dest->samples_per_row = cinfo->output_width * cinfo->out_color_components;
-  dest->buffer_width = dest->samples_per_row * (BYTESPERSAMPLE * SIZEOF(char));
+  dest->buffer_width = dest->samples_per_row * (BYTESPERSAMPLE * sizeof(char));
   dest->iobuffer = (char *) (*cinfo->mem->alloc_small)
     ((j_common_ptr) cinfo, JPOOL_IMAGE, dest->buffer_width);
 
   if (cinfo->quantize_colors || BITS_IN_JSAMPLE != 8 ||
-      SIZEOF(JSAMPLE) != SIZEOF(char)) {
+      sizeof(JSAMPLE) != sizeof(char)) {
     /* When quantizing, we need an output buffer for colormap indexes
      * that's separate from the physical I/O buffer.  We also need a
      * separate buffer if pixel format translation must take place.
@@ -256,7 +240,6 @@
   } else {
     /* We will fwrite() directly from decompressor output buffer. */
     /* Synthesize a JSAMPARRAY pointer structure */
-    /* Cast here implies near->far pointer conversion on PCs */
     dest->pixrow = (JSAMPROW) dest->iobuffer;
     dest->pub.buffer = & dest->pixrow;
     dest->pub.buffer_height = 1;
diff --git a/wrppm.h b/wrppm.h
new file mode 100644
index 0000000..aa6c562
--- /dev/null
+++ b/wrppm.h
@@ -0,0 +1,26 @@
+/*
+ * wrppm.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#ifdef PPM_SUPPORTED
+
+/* Private version of data destination object */
+
+typedef struct {
+  struct djpeg_dest_struct pub; /* public fields */
+
+  /* Same memory when rows are fwrite()'d directly; else separate buffers: */
+  char *iobuffer;               /* fwrite's I/O buffer (buffer_width bytes) */
+  JSAMPROW pixrow;              /* decompressor output buffer */
+  size_t buffer_width;          /* size of iobuffer, in bytes */
+  JDIMENSION samples_per_row;   /* JSAMPLEs per output row */
+} ppm_dest_struct;
+
+typedef ppm_dest_struct *ppm_dest_ptr;
+
+#endif
diff --git a/wrrle.c b/wrrle.c
index a4e7337..cc95b41 100644
--- a/wrrle.c
+++ b/wrrle.c
@@ -1,9 +1,12 @@
 /*
  * wrrle.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in RLE format.
  * The Utah Raster Toolkit library is required (version 3.1 or later).
@@ -16,7 +19,7 @@
  * with updates from Robert Hutchinson.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef RLE_SUPPORTED
 
@@ -47,24 +50,24 @@
  * though not all of the entries need be used.
  */
 
-#define CMAPBITS	8
-#define CMAPLENGTH	(1<<(CMAPBITS))
+#define CMAPBITS        8
+#define CMAPLENGTH      (1<<(CMAPBITS))
 
 typedef struct {
   struct djpeg_dest_struct pub; /* public fields */
 
-  jvirt_sarray_ptr image;	/* virtual array to store the output image */
-  rle_map *colormap;	 	/* RLE-style color map, or NULL if none */
-  rle_pixel **rle_row;		/* To pass rows to rle_putrow() */
+  jvirt_sarray_ptr image;       /* virtual array to store the output image */
+  rle_map *colormap;            /* RLE-style color map, or NULL if none */
+  rle_pixel **rle_row;          /* To pass rows to rle_putrow() */
 
 } rle_dest_struct;
 
-typedef rle_dest_struct * rle_dest_ptr;
+typedef rle_dest_struct *rle_dest_ptr;
 
 /* Forward declarations */
 METHODDEF(void) rle_put_pixel_rows
-    JPP((j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-	 JDIMENSION rows_supplied));
+        (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+         JDIMENSION rows_supplied);
 
 
 /*
@@ -97,8 +100,8 @@
    */
 
   if (cinfo->output_width > 32767 || cinfo->output_height > 32767)
-    ERREXIT2(cinfo, JERR_RLE_DIMENSIONS, cinfo->output_width, 
-	     cinfo->output_height);
+    ERREXIT2(cinfo, JERR_RLE_DIMENSIONS, cinfo->output_width,
+             cinfo->output_height);
 
   if (cinfo->out_color_space != JCS_GRAYSCALE &&
       cinfo->out_color_space != JCS_RGB)
@@ -113,7 +116,7 @@
 
   if (cinfo->quantize_colors) {
     /* Allocate storage for RLE-style cmap, zero any extra entries */
-    cmapsize = cinfo->out_color_components * CMAPLENGTH * SIZEOF(rle_map);
+    cmapsize = cinfo->out_color_components * CMAPLENGTH * sizeof(rle_map);
     dest->colormap = (rle_map *) (*cinfo->mem->alloc_small)
       ((j_common_ptr) cinfo, JPOOL_IMAGE, cmapsize);
     MEMZERO(dest->colormap, cmapsize);
@@ -151,7 +154,7 @@
 
 METHODDEF(void)
 rle_put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		    JDIMENSION rows_supplied)
+                    JDIMENSION rows_supplied)
 {
   rle_dest_ptr dest = (rle_dest_ptr) dinfo;
 
@@ -172,7 +175,7 @@
 finish_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
   rle_dest_ptr dest = (rle_dest_ptr) dinfo;
-  rle_hdr header;		/* Output file information */
+  rle_hdr header;               /* Output file information */
   rle_pixel **rle_row, *red, *green, *blue;
   JSAMPROW output_row;
   char cmapcomment[80];
@@ -207,8 +210,7 @@
   rle_put_setup(&header);
 
   /* Now output the RLE data from our virtual array.
-   * We assume here that (a) rle_pixel is represented the same as JSAMPLE,
-   * and (b) we are not on a machine where FAR pointers differ from regular.
+   * We assume here that rle_pixel is represented the same as JSAMPLE.
    */
 
 #ifdef PROGRESS_REPORT
@@ -223,7 +225,7 @@
     for (row = cinfo->output_height-1; row >= 0; row--) {
       rle_row = (rle_pixel **) (*cinfo->mem->access_virt_sarray)
         ((j_common_ptr) cinfo, dest->image,
-	 (JDIMENSION) row, (JDIMENSION) 1, FALSE);
+         (JDIMENSION) row, (JDIMENSION) 1, FALSE);
       rle_putrow(rle_row, (int) cinfo->output_width, &header);
 #ifdef PROGRESS_REPORT
       if (progress != NULL) {
@@ -235,9 +237,9 @@
   } else {
     for (row = cinfo->output_height-1; row >= 0; row--) {
       rle_row = (rle_pixel **) dest->rle_row;
-      output_row = * (*cinfo->mem->access_virt_sarray)
+      output_row = *(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr) cinfo, dest->image,
-	 (JDIMENSION) row, (JDIMENSION) 1, FALSE);
+         (JDIMENSION) row, (JDIMENSION) 1, FALSE);
       red = rle_row[0];
       green = rle_row[1];
       blue = rle_row[2];
@@ -281,7 +283,7 @@
   /* Create module interface object, fill in method pointers */
   dest = (rle_dest_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  SIZEOF(rle_dest_struct));
+                                  sizeof(rle_dest_struct));
   dest->pub.start_output = start_output_rle;
   dest->pub.finish_output = finish_output_rle;
 
diff --git a/wrtarga.c b/wrtarga.c
index cf104d2..c02b332 100644
--- a/wrtarga.c
+++ b/wrtarga.c
@@ -1,9 +1,12 @@
 /*
  * wrtarga.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
+ * It was modified by The libjpeg-turbo Project to include only code and
+ * information relevant to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
  *
  * This file contains routines to write output images in Targa format.
  *
@@ -14,7 +17,7 @@
  * Based on code contributed by Lee Daniel Crocker.
  */
 
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef TARGA_SUPPORTED
 
@@ -28,26 +31,17 @@
   Sorry, this code only copes with 8-bit JSAMPLEs. /* deliberate syntax err */
 #endif
 
-/*
- * The output buffer needs to be writable by fwrite().  On PCs, we must
- * allocate the buffer in near data space, because we are assuming small-data
- * memory model, wherein fwrite() can't reach far memory.  If you need to
- * process very wide images on a PC, you might have to compile in large-memory
- * model, or else replace fwrite() with a putc() loop --- which will be much
- * slower.
- */
-
 
 /* Private version of data destination object */
 
 typedef struct {
-  struct djpeg_dest_struct pub;	/* public fields */
+  struct djpeg_dest_struct pub; /* public fields */
 
-  char *iobuffer;		/* physical I/O buffer */
-  JDIMENSION buffer_width;	/* width of one row */
+  char *iobuffer;               /* physical I/O buffer */
+  JDIMENSION buffer_width;      /* width of one row */
 } tga_dest_struct;
 
-typedef tga_dest_struct * tga_dest_ptr;
+typedef tga_dest_struct *tga_dest_ptr;
 
 
 LOCAL(void)
@@ -57,30 +51,30 @@
   char targaheader[18];
 
   /* Set unused fields of header to 0 */
-  MEMZERO(targaheader, SIZEOF(targaheader));
+  MEMZERO(targaheader, sizeof(targaheader));
 
   if (num_colors > 0) {
-    targaheader[1] = 1;		/* color map type 1 */
+    targaheader[1] = 1;         /* color map type 1 */
     targaheader[5] = (char) (num_colors & 0xFF);
     targaheader[6] = (char) (num_colors >> 8);
-    targaheader[7] = 24;	/* 24 bits per cmap entry */
+    targaheader[7] = 24;        /* 24 bits per cmap entry */
   }
 
   targaheader[12] = (char) (cinfo->output_width & 0xFF);
   targaheader[13] = (char) (cinfo->output_width >> 8);
   targaheader[14] = (char) (cinfo->output_height & 0xFF);
   targaheader[15] = (char) (cinfo->output_height >> 8);
-  targaheader[17] = 0x20;	/* Top-down, non-interlaced */
+  targaheader[17] = 0x20;       /* Top-down, non-interlaced */
 
   if (cinfo->out_color_space == JCS_GRAYSCALE) {
-    targaheader[2] = 3;		/* image type = uncompressed gray-scale */
-    targaheader[16] = 8;	/* bits per pixel */
-  } else {			/* must be RGB */
+    targaheader[2] = 3;         /* image type = uncompressed grayscale */
+    targaheader[16] = 8;        /* bits per pixel */
+  } else {                      /* must be RGB */
     if (num_colors > 0) {
-      targaheader[2] = 1;	/* image type = colormapped RGB */
+      targaheader[2] = 1;       /* image type = colormapped RGB */
       targaheader[16] = 8;
     } else {
-      targaheader[2] = 2;	/* image type = uncompressed RGB */
+      targaheader[2] = 2;       /* image type = uncompressed RGB */
       targaheader[16] = 24;
     }
   }
@@ -97,12 +91,12 @@
 
 METHODDEF(void)
 put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		JDIMENSION rows_supplied)
+                JDIMENSION rows_supplied)
 /* used for unquantized full-color output */
 {
   tga_dest_ptr dest = (tga_dest_ptr) dinfo;
   register JSAMPROW inptr;
-  register char * outptr;
+  register char *outptr;
   register JDIMENSION col;
 
   inptr = dest->pub.buffer[0];
@@ -118,12 +112,12 @@
 
 METHODDEF(void)
 put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-	       JDIMENSION rows_supplied)
+               JDIMENSION rows_supplied)
 /* used for grayscale OR quantized color output */
 {
   tga_dest_ptr dest = (tga_dest_ptr) dinfo;
   register JSAMPROW inptr;
-  register char * outptr;
+  register char *outptr;
   register JDIMENSION col;
 
   inptr = dest->pub.buffer[0];
@@ -142,11 +136,11 @@
 
 METHODDEF(void)
 put_demapped_gray (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-		   JDIMENSION rows_supplied)
+                   JDIMENSION rows_supplied)
 {
   tga_dest_ptr dest = (tga_dest_ptr) dinfo;
   register JSAMPROW inptr;
-  register char * outptr;
+  register char *outptr;
   register JSAMPROW color_map0 = cinfo->colormap[0];
   register JDIMENSION col;
 
@@ -183,14 +177,14 @@
       /* We only support 8-bit colormap indexes, so only 256 colors */
       num_colors = cinfo->actual_number_of_colors;
       if (num_colors > 256)
-	ERREXIT1(cinfo, JERR_TOO_MANY_COLORS, num_colors);
+        ERREXIT1(cinfo, JERR_TOO_MANY_COLORS, num_colors);
       write_header(cinfo, dinfo, num_colors);
       /* Write the colormap.  Note Targa uses BGR byte order */
       outfile = dest->pub.output_file;
       for (i = 0; i < num_colors; i++) {
-	putc(GETJSAMPLE(cinfo->colormap[2][i]), outfile);
-	putc(GETJSAMPLE(cinfo->colormap[1][i]), outfile);
-	putc(GETJSAMPLE(cinfo->colormap[0][i]), outfile);
+        putc(GETJSAMPLE(cinfo->colormap[2][i]), outfile);
+        putc(GETJSAMPLE(cinfo->colormap[1][i]), outfile);
+        putc(GETJSAMPLE(cinfo->colormap[0][i]), outfile);
       }
       dest->pub.put_pixel_rows = put_gray_rows;
     } else {
@@ -229,18 +223,18 @@
   /* Create module interface object, fill in method pointers */
   dest = (tga_dest_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(tga_dest_struct));
+                                  sizeof(tga_dest_struct));
   dest->pub.start_output = start_output_tga;
   dest->pub.finish_output = finish_output_tga;
 
   /* Calculate output image dimensions so we can allocate space */
   jpeg_calc_output_dimensions(cinfo);
 
-  /* Create I/O buffer.  Note we make this near on a PC. */
+  /* Create I/O buffer. */
   dest->buffer_width = cinfo->output_width * cinfo->output_components;
   dest->iobuffer = (char *)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(size_t) (dest->buffer_width * SIZEOF(char)));
+                                (size_t) (dest->buffer_width * sizeof(char)));
 
   /* Create decompressor output buffer. */
   dest->pub.buffer = (*cinfo->mem->alloc_sarray)