Re-add relevant files and tests from libjpeg v6b git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@81 632fc199-4ca6-4c93-a231-07263d6284db

commit: 60fa0600c0b32383de6880dd9bf2218e0eb1b537 [log] [tgz]
author: DRC <dcommander@users.sourceforge.net> Fri Feb 12 06:01:49 2010 +0000
committer: DRC <dcommander@users.sourceforge.net> Fri Feb 12 06:01:49 2010 +0000
tree: bc592d8107cd9fab120112e3daf33a062289e56b
parent: 60cddeb849b803274914dbe1706cfc801f914319 [diff]
parent: 5ead57a34a398aa798f35bd7a6abad19b2e453e2 [diff]
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..43b5f98
--- /dev/null
+++ b/Makefile.am

@@ -0,0 +1,86 @@
+lib_LTLIBRARIES = libjpeg.la libturbojpeg.la
+libjpeg_la_LDFLAGS = -version-number 62:0:0 -no-undefined
+libturbojpeg_la_LDFLAGS = -avoid-version -no-undefined
+include_HEADERS = jconfig.h jerror.h jmorecfg.h jpeglib.h turbojpeg.h
+
+HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+	jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h
+
+libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
+	jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
+	jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \
+	jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
+	jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \
+	jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c \
+	jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
+	jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
+
+libturbojpeg_la_SOURCES = $(libjpeg_la_SOURCES) turbojpegl.c turbojpeg.h
+
+if WITH_SIMD
+
+SUBDIRS = simd
+libjpeg_la_LIBADD = simd/libsimd.la
+libturbojpeg_la_LIBADD = simd/libsimd.la
+
+else
+
+libjpeg_la_SOURCES += jsimd_none.c
+
+endif
+
+TSTHDRS = rrutil.h rrtimer.h
+
+noinst_PROGRAMS = jpgtest jpegut cjpeg djpeg jpegtran rdjpgcom wrjpgcom
+
+jpgtest_SOURCES = $(TSTHDRS) jpgtest.cxx bmp.h bmp.c
+
+jpgtest_LDADD = $(top_srcdir)/libturbojpeg.la
+
+jpegut_SOURCES = $(TSTHDRS) jpegut.c bmp.h bmp.c
+
+jpegut_LDADD = $(top_srcdir)/libturbojpeg.la
+
+cjpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c cjpeg.c rdbmp.c rdgif.c \
+	rdppm.c rdswitch.c rdtarga.c 
+
+cjpeg_LDADD = $(top_srcdir)/libjpeg.la
+
+cjpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
+	-DTARGA_SUPPORTED
+
+djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
+	wrbmp.c wrgif.c wrppm.c wrtarga.c
+
+djpeg_LDADD = $(top_srcdir)/libjpeg.la
+
+djpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
+	-DTARGA_SUPPORTED
+
+jpegtran_SOURCES = jpegtran.c rdswitch.c cdjpeg.c transupp.c
+
+jpegtran_LDADD = $(top_srcdir)/libjpeg.la
+
+rdjpgcom_SOURCES = wrjpgcom.c
+
+rdjpgcom_LDADD = $(top_srcdir)/libjpeg.la
+
+wrjpgcom_SOURCES = wrjpgcom.c
+
+wrjpgcom_LDADD = $(top_srcdir)/libjpeg.la
+
+test: cjpeg djpeg jpegtran
+	$(RM) testout*
+	$(top_srcdir)/jpegut
+	$(top_srcdir)/djpeg -dct int -ppm -outfile testout.ppm  $(top_srcdir)/testorig.jpg
+	$(top_srcdir)/djpeg -dct int -bmp -colors 256 -outfile testout.bmp  $(top_srcdir)/testorig.jpg
+	$(top_srcdir)/cjpeg -dct int -outfile testout.jpg  $(top_srcdir)/testimg.ppm
+	$(top_srcdir)/djpeg -dct int -ppm -outfile testoutp.ppm $(top_srcdir)/testprog.jpg
+	$(top_srcdir)/cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(top_srcdir)/testimg.ppm
+	$(top_srcdir)/jpegtran -outfile testoutt.jpg $(top_srcdir)/testprog.jpg
+	cmp $(top_srcdir)/testimg.ppm testout.ppm
+	cmp $(top_srcdir)/testimg.bmp testout.bmp
+	cmp $(top_srcdir)/testimg.jpg testout.jpg
+	cmp $(top_srcdir)/testimg.ppm testoutp.ppm
+	cmp $(top_srcdir)/testimgp.jpg testoutp.jpg
+	cmp $(top_srcdir)/testorig.jpg testoutt.jpg

diff --git a/acinclude.m4 b/acinclude.m4
new file mode 100644
index 0000000..e8d7d08
--- /dev/null
+++ b/acinclude.m4

@@ -0,0 +1,121 @@
+# AC_PROG_NASM
+# --------------------------
+# Check that NASM exists and determine flags
+AC_DEFUN([AC_PROG_NASM],[
+
+AC_CHECK_PROGS(NASM, [nasm nasmw])
+test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
+
+AC_MSG_CHECKING([for object file format of host system])
+case "$host_os" in
+  cygwin* | mingw* | pw32* | interix*)
+    objfmt='Win32-COFF'
+  ;;
+  msdosdjgpp* | go32*)
+    objfmt='COFF'
+  ;;
+  os2-emx*)			# not tested
+    objfmt='MSOMF'		# obj
+  ;;
+  linux*coff* | linux*oldld*)
+    objfmt='COFF'		# ???
+  ;;
+  linux*aout*)
+    objfmt='a.out'
+  ;;
+  linux*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='ELF64'
+        ;;
+      *)
+        objfmt='ELF'
+        ;;
+    esac
+  ;;
+  freebsd* | netbsd* | openbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+      objfmt='BSD-a.out'
+    else
+      objfmt='ELF'
+    fi
+  ;;
+  solaris* | sunos* | sysv* | sco*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='ELF64'
+        ;;
+      *)
+        objfmt='ELF'
+        ;;
+    esac
+  ;;
+  darwin* | rhapsody* | nextstep* | openstep* | macos*)
+    case "$host_cpu" in
+      x86_64)
+        objfmt='Mach-O64'
+        ;;
+      *)
+        objfmt='Mach-O'
+        ;;
+    esac
+  ;;
+  *)
+    objfmt='ELF ?'
+  ;;
+esac
+
+AC_MSG_RESULT([$objfmt])
+if test "$objfmt" = 'ELF ?'; then
+  objfmt='ELF'
+  AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
+fi
+
+AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
+case "$objfmt" in
+  MSOMF)      NAFLAGS='-fobj -DOBJ32';;
+  Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+  COFF)       NAFLAGS='-fcoff -DCOFF';;
+  a.out)      NAFLAGS='-faout -DAOUT';;
+  BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
+  ELF)        NAFLAGS='-felf -DELF';;
+  ELF64)      NAFLAGS='-felf64 -DELF -D__x86_64__';;
+  RDF)        NAFLAGS='-frdf -DRDF';;
+  Mach-O)     NAFLAGS='-fmacho -DMACHO';;
+  Mach-O64)   NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';;
+esac
+AC_MSG_RESULT([$NAFLAGS])
+AC_SUBST([NAFLAGS])
+
+AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+        section .text
+        global  _main,main
+_main:
+main:   xor     eax,eax
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
+fi
+
+AC_MSG_CHECKING([whether the linker accepts assembler output])
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
+if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
+fi
+
+])

diff --git a/ansi2knr.1 b/ansi2knr.1
deleted file mode 100644
index f9ee5a6..0000000
--- a/ansi2knr.1
+++ /dev/null

@@ -1,36 +0,0 @@
-.TH ANSI2KNR 1 "19 Jan 1996"
-.SH NAME
-ansi2knr \- convert ANSI C to Kernighan & Ritchie C
-.SH SYNOPSIS
-.I ansi2knr
-[--varargs] input_file [output_file]
-.SH DESCRIPTION
-If no output_file is supplied, output goes to stdout.
-.br
-There are no error messages.
-.sp
-.I ansi2knr
-recognizes function definitions by seeing a non-keyword identifier at the left
-margin, followed by a left parenthesis, with a right parenthesis as the last
-character on the line, and with a left brace as the first token on the
-following line (ignoring possible intervening comments).  It will recognize a
-multi-line header provided that no intervening line ends with a left or right
-brace or a semicolon.  These algorithms ignore whitespace and comments, except
-that the function name must be the first thing on the line.
-.sp
-The following constructs will confuse it:
-.br
-     - Any other construct that starts at the left margin and follows the
-above syntax (such as a macro or function call).
-.br
-     - Some macros that tinker with the syntax of the function header.
-.sp
-The --varargs switch is obsolete, and is recognized only for
-backwards compatibility.  The present version of
-.I ansi2knr
-will always attempt to convert a ... argument to va_alist and va_dcl.
-.SH AUTHOR
-L. Peter Deutsch <ghost@aladdin.com> wrote the original ansi2knr and
-continues to maintain the current version; most of the code in the current
-version is his work.  ansi2knr also includes contributions by Francois
-Pinard <pinard@iro.umontreal.ca> and Jim Avera <jima@netcom.com>.

diff --git a/ansi2knr.c b/ansi2knr.c
deleted file mode 100644
index 4e05fc2..0000000
--- a/ansi2knr.c
+++ /dev/null

@@ -1,693 +0,0 @@
-/* ansi2knr.c */
-/* Convert ANSI C function definitions to K&R ("traditional C") syntax */
-
-/*
-ansi2knr is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY.  No author or distributor accepts responsibility to anyone for the
-consequences of using it or for whether it serves any particular purpose or
-works at all, unless he says so in writing.  Refer to the GNU General Public
-License (the "GPL") for full details.
-
-Everyone is granted permission to copy, modify and redistribute ansi2knr,
-but only under the conditions described in the GPL.  A copy of this license
-is supposed to have been given to you along with ansi2knr so you can know
-your rights and responsibilities.  It should be in a file named COPYLEFT.
-[In the IJG distribution, the GPL appears below, not in a separate file.]
-Among other things, the copyright notice and this notice must be preserved
-on all copies.
-
-We explicitly state here what we believe is already implied by the GPL: if
-the ansi2knr program is distributed as a separate set of sources and a
-separate executable file which are aggregated on a storage medium together
-with another program, this in itself does not bring the other program under
-the GPL, nor does the mere fact that such a program or the procedures for
-constructing it invoke the ansi2knr executable bring any other part of the
-program under the GPL.
-*/
-
-/*
----------- Here is the GNU GPL file COPYLEFT, referred to above ----------
------ These terms do NOT apply to the JPEG software itself; see README ------
-
-		    GHOSTSCRIPT GENERAL PUBLIC LICENSE
-		    (Clarified 11 Feb 1988)
-
- Copyright (C) 1988 Richard M. Stallman
- Everyone is permitted to copy and distribute verbatim copies of this
- license, but changing it is not allowed.  You can also use this wording
- to make the terms for other programs.
-
-  The license agreements of most software companies keep you at the
-mercy of those companies.  By contrast, our general public license is
-intended to give everyone the right to share Ghostscript.  To make sure
-that you get the rights we want you to have, we need to make
-restrictions that forbid anyone to deny you these rights or to ask you
-to surrender the rights.  Hence this license agreement.
-
-  Specifically, we want to make sure that you have the right to give
-away copies of Ghostscript, that you receive source code or else can get
-it if you want it, that you can change Ghostscript or use pieces of it
-in new free programs, and that you know you can do these things.
-
-  To make sure that everyone has such rights, we have to forbid you to
-deprive anyone else of these rights.  For example, if you distribute
-copies of Ghostscript, you must give the recipients all the rights that
-you have.  You must make sure that they, too, receive or can get the
-source code.  And you must tell them their rights.
-
-  Also, for our own protection, we must make certain that everyone finds
-out that there is no warranty for Ghostscript.  If Ghostscript is
-modified by someone else and passed on, we want its recipients to know
-that what they have is not what we distributed, so that any problems
-introduced by others will not reflect on our reputation.
-
-  Therefore we (Richard M. Stallman and the Free Software Foundation,
-Inc.) make the following terms which say what you must do to be allowed
-to distribute or change Ghostscript.
-
-
-			COPYING POLICIES
-
-  1. You may copy and distribute verbatim copies of Ghostscript source
-code as you receive it, in any medium, provided that you conspicuously
-and appropriately publish on each copy a valid copyright and license
-notice "Copyright (C) 1989 Aladdin Enterprises.  All rights reserved.
-Distributed by Free Software Foundation, Inc." (or with whatever year is
-appropriate); keep intact the notices on all files that refer to this
-License Agreement and to the absence of any warranty; and give any other
-recipients of the Ghostscript program a copy of this License Agreement
-along with the program.  You may charge a distribution fee for the
-physical act of transferring a copy.
-
-  2. You may modify your copy or copies of Ghostscript or any portion of
-it, and copy and distribute such modifications under the terms of
-Paragraph 1 above, provided that you also do the following:
-
-    a) cause the modified files to carry prominent notices stating
-    that you changed the files and the date of any change; and
-
-    b) cause the whole of any work that you distribute or publish,
-    that in whole or in part contains or is a derivative of Ghostscript
-    or any part thereof, to be licensed at no charge to all third
-    parties on terms identical to those contained in this License
-    Agreement (except that you may choose to grant more extensive
-    warranty protection to some or all third parties, at your option).
-
-    c) You may charge a distribution fee for the physical act of
-    transferring a copy, and you may at your option offer warranty
-    protection in exchange for a fee.
-
-Mere aggregation of another unrelated program with this program (or its
-derivative) on a volume of a storage or distribution medium does not bring
-the other program under the scope of these terms.
-
-  3. You may copy and distribute Ghostscript (or a portion or derivative
-of it, under Paragraph 2) in object code or executable form under the
-terms of Paragraphs 1 and 2 above provided that you also do one of the
-following:
-
-    a) accompany it with the complete corresponding machine-readable
-    source code, which must be distributed under the terms of
-    Paragraphs 1 and 2 above; or,
-
-    b) accompany it with a written offer, valid for at least three
-    years, to give any third party free (except for a nominal
-    shipping charge) a complete machine-readable copy of the
-    corresponding source code, to be distributed under the terms of
-    Paragraphs 1 and 2 above; or,
-
-    c) accompany it with the information you received as to where the
-    corresponding source code may be obtained.  (This alternative is
-    allowed only for noncommercial distribution and only if you
-    received the program in object code or executable form alone.)
-
-For an executable file, complete source code means all the source code for
-all modules it contains; but, as a special exception, it need not include
-source code for modules which are standard libraries that accompany the
-operating system on which the executable file runs.
-
-  4. You may not copy, sublicense, distribute or transfer Ghostscript
-except as expressly provided under this License Agreement.  Any attempt
-otherwise to copy, sublicense, distribute or transfer Ghostscript is
-void and your rights to use the program under this License agreement
-shall be automatically terminated.  However, parties who have received
-computer software programs from you with this License Agreement will not
-have their licenses terminated so long as such parties remain in full
-compliance.
-
-  5. If you wish to incorporate parts of Ghostscript into other free
-programs whose distribution conditions are different, write to the Free
-Software Foundation at 675 Mass Ave, Cambridge, MA 02139.  We have not
-yet worked out a simple rule that can be stated here, but we will often
-permit this.  We will be guided by the two goals of preserving the free
-status of all derivatives of our free software and of promoting the
-sharing and reuse of software.
-
-Your comments and suggestions about our licensing policies and our
-software are welcome!  Please contact the Free Software Foundation,
-Inc., 675 Mass Ave, Cambridge, MA 02139, or call (617) 876-3296.
-
-		       NO WARRANTY
-
-  BECAUSE GHOSTSCRIPT IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
-NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW.  EXCEPT
-WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC, RICHARD
-M. STALLMAN, ALADDIN ENTERPRISES, L. PETER DEUTSCH, AND/OR OTHER PARTIES
-PROVIDE GHOSTSCRIPT "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
-EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE
-ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF GHOSTSCRIPT IS WITH
-YOU.  SHOULD GHOSTSCRIPT PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
-NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
-STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., L. PETER DEUTSCH, ALADDIN
-ENTERPRISES, AND/OR ANY OTHER PARTY WHO MAY MODIFY AND REDISTRIBUTE
-GHOSTSCRIPT AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING
-ANY LOST PROFITS, LOST MONIES, OR OTHER SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
-(INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
-INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR A FAILURE OF THE
-PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) GHOSTSCRIPT, EVEN IF YOU
-HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES, OR FOR ANY CLAIM
-BY ANY OTHER PARTY.
-
--------------------- End of file COPYLEFT ------------------------------
-*/
-
-/*
- * Usage:
-	ansi2knr input_file [output_file]
- * If no output_file is supplied, output goes to stdout.
- * There are no error messages.
- *
- * ansi2knr recognizes function definitions by seeing a non-keyword
- * identifier at the left margin, followed by a left parenthesis,
- * with a right parenthesis as the last character on the line,
- * and with a left brace as the first token on the following line
- * (ignoring possible intervening comments).
- * It will recognize a multi-line header provided that no intervening
- * line ends with a left or right brace or a semicolon.
- * These algorithms ignore whitespace and comments, except that
- * the function name must be the first thing on the line.
- * The following constructs will confuse it:
- *	- Any other construct that starts at the left margin and
- *	    follows the above syntax (such as a macro or function call).
- *	- Some macros that tinker with the syntax of the function header.
- */
-
-/*
- * The original and principal author of ansi2knr is L. Peter Deutsch
- * <ghost@aladdin.com>.  Other authors are noted in the change history
- * that follows (in reverse chronological order):
-	lpd 96-01-21 added code to cope with not HAVE_CONFIG_H and with
-		compilers that don't understand void, as suggested by
-		Tom Lane
-	lpd 96-01-15 changed to require that the first non-comment token
-		on the line following a function header be a left brace,
-		to reduce sensitivity to macros, as suggested by Tom Lane
-		<tgl@sss.pgh.pa.us>
-	lpd 95-06-22 removed #ifndefs whose sole purpose was to define
-		undefined preprocessor symbols as 0; changed all #ifdefs
-		for configuration symbols to #ifs
-	lpd 95-04-05 changed copyright notice to make it clear that
-		including ansi2knr in a program does not bring the entire
-		program under the GPL
-	lpd 94-12-18 added conditionals for systems where ctype macros
-		don't handle 8-bit characters properly, suggested by
-		Francois Pinard <pinard@iro.umontreal.ca>;
-		removed --varargs switch (this is now the default)
-	lpd 94-10-10 removed CONFIG_BROKETS conditional
-	lpd 94-07-16 added some conditionals to help GNU `configure',
-		suggested by Francois Pinard <pinard@iro.umontreal.ca>;
-		properly erase prototype args in function parameters,
-		contributed by Jim Avera <jima@netcom.com>;
-		correct error in writeblanks (it shouldn't erase EOLs)
-	lpd 89-xx-xx original version
- */
-
-/* Most of the conditionals here are to make ansi2knr work with */
-/* or without the GNU configure machinery. */
-
-#if HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#include <stdio.h>
-#include <ctype.h>
-
-#if HAVE_CONFIG_H
-
-/*
-   For properly autoconfiguring ansi2knr, use AC_CONFIG_HEADER(config.h).
-   This will define HAVE_CONFIG_H and so, activate the following lines.
- */
-
-# if STDC_HEADERS || HAVE_STRING_H
-#  include <string.h>
-# else
-#  include <strings.h>
-# endif
-
-#else /* not HAVE_CONFIG_H */
-
-/* Otherwise do it the hard way */
-
-# ifdef BSD
-#  include <strings.h>
-# else
-#  ifdef VMS
-    extern int strlen(), strncmp();
-#  else
-#   include <string.h>
-#  endif
-# endif
-
-#endif /* not HAVE_CONFIG_H */
-
-#if STDC_HEADERS
-# include <stdlib.h>
-#else
-/*
-   malloc and free should be declared in stdlib.h,
-   but if you've got a K&R compiler, they probably aren't.
- */
-# ifdef MSDOS
-#  include <malloc.h>
-# else
-#  ifdef VMS
-     extern char *malloc();
-     extern void free();
-#  else
-     extern char *malloc();
-     extern int free();
-#  endif
-# endif
-
-#endif
-
-/*
- * The ctype macros don't always handle 8-bit characters correctly.
- * Compensate for this here.
- */
-#ifdef isascii
-#  undef HAVE_ISASCII		/* just in case */
-#  define HAVE_ISASCII 1
-#else
-#endif
-#if STDC_HEADERS || !HAVE_ISASCII
-#  define is_ascii(c) 1
-#else
-#  define is_ascii(c) isascii(c)
-#endif
-
-#define is_space(c) (is_ascii(c) && isspace(c))
-#define is_alpha(c) (is_ascii(c) && isalpha(c))
-#define is_alnum(c) (is_ascii(c) && isalnum(c))
-
-/* Scanning macros */
-#define isidchar(ch) (is_alnum(ch) || (ch) == '_')
-#define isidfirstchar(ch) (is_alpha(ch) || (ch) == '_')
-
-/* Forward references */
-char *skipspace();
-int writeblanks();
-int test1();
-int convert1();
-
-/* The main program */
-int
-main(argc, argv)
-    int argc;
-    char *argv[];
-{	FILE *in, *out;
-#define bufsize 5000			/* arbitrary size */
-	char *buf;
-	char *line;
-	char *more;
-	/*
-	 * In previous versions, ansi2knr recognized a --varargs switch.
-	 * If this switch was supplied, ansi2knr would attempt to convert
-	 * a ... argument to va_alist and va_dcl; if this switch was not
-	 * supplied, ansi2knr would simply drop any such arguments.
-	 * Now, ansi2knr always does this conversion, and we only
-	 * check for this switch for backward compatibility.
-	 */
-	int convert_varargs = 1;
-
-	if ( argc > 1 && argv[1][0] == '-' )
-	  {	if ( !strcmp(argv[1], "--varargs") )
-		  {	convert_varargs = 1;
-			argc--;
-			argv++;
-		  }
-		else
-		  {	fprintf(stderr, "Unrecognized switch: %s\n", argv[1]);
-			exit(1);
-		  }
-	  }
-	switch ( argc )
-	   {
-	default:
-		printf("Usage: ansi2knr input_file [output_file]\n");
-		exit(0);
-	case 2:
-		out = stdout;
-		break;
-	case 3:
-		out = fopen(argv[2], "w");
-		if ( out == NULL )
-		   {	fprintf(stderr, "Cannot open output file %s\n", argv[2]);
-			exit(1);
-		   }
-	   }
-	in = fopen(argv[1], "r");
-	if ( in == NULL )
-	   {	fprintf(stderr, "Cannot open input file %s\n", argv[1]);
-		exit(1);
-	   }
-	fprintf(out, "#line 1 \"%s\"\n", argv[1]);
-	buf = malloc(bufsize);
-	line = buf;
-	while ( fgets(line, (unsigned)(buf + bufsize - line), in) != NULL )
-	   {
-test:		line += strlen(line);
-		switch ( test1(buf) )
-		   {
-		case 2:			/* a function header */
-			convert1(buf, out, 1, convert_varargs);
-			break;
-		case 1:			/* a function */
-			/* Check for a { at the start of the next line. */
-			more = ++line;
-f:			if ( line >= buf + (bufsize - 1) ) /* overflow check */
-			  goto wl;
-			if ( fgets(line, (unsigned)(buf + bufsize - line), in) == NULL )
-			  goto wl;
-			switch ( *skipspace(more, 1) )
-			  {
-			  case '{':
-			    /* Definitely a function header. */
-			    convert1(buf, out, 0, convert_varargs);
-			    fputs(more, out);
-			    break;
-			  case 0:
-			    /* The next line was blank or a comment: */
-			    /* keep scanning for a non-comment. */
-			    line += strlen(line);
-			    goto f;
-			  default:
-			    /* buf isn't a function header, but */
-			    /* more might be. */
-			    fputs(buf, out);
-			    strcpy(buf, more);
-			    line = buf;
-			    goto test;
-			  }
-			break;
-		case -1:		/* maybe the start of a function */
-			if ( line != buf + (bufsize - 1) ) /* overflow check */
-			  continue;
-			/* falls through */
-		default:		/* not a function */
-wl:			fputs(buf, out);
-			break;
-		   }
-		line = buf;
-	   }
-	if ( line != buf )
-	  fputs(buf, out);
-	free(buf);
-	fclose(out);
-	fclose(in);
-	return 0;
-}
-
-/* Skip over space and comments, in either direction. */
-char *
-skipspace(p, dir)
-    register char *p;
-    register int dir;			/* 1 for forward, -1 for backward */
-{	for ( ; ; )
-	   {	while ( is_space(*p) )
-		  p += dir;
-		if ( !(*p == '/' && p[dir] == '*') )
-		  break;
-		p += dir;  p += dir;
-		while ( !(*p == '*' && p[dir] == '/') )
-		   {	if ( *p == 0 )
-			  return p;	/* multi-line comment?? */
-			p += dir;
-		   }
-		p += dir;  p += dir;
-	   }
-	return p;
-}
-
-/*
- * Write blanks over part of a string.
- * Don't overwrite end-of-line characters.
- */
-int
-writeblanks(start, end)
-    char *start;
-    char *end;
-{	char *p;
-	for ( p = start; p < end; p++ )
-	  if ( *p != '\r' && *p != '\n' )
-	    *p = ' ';
-	return 0;
-}
-
-/*
- * Test whether the string in buf is a function definition.
- * The string may contain and/or end with a newline.
- * Return as follows:
- *	0 - definitely not a function definition;
- *	1 - definitely a function definition;
- *	2 - definitely a function prototype (NOT USED);
- *	-1 - may be the beginning of a function definition,
- *		append another line and look again.
- * The reason we don't attempt to convert function prototypes is that
- * Ghostscript's declaration-generating macros look too much like
- * prototypes, and confuse the algorithms.
- */
-int
-test1(buf)
-    char *buf;
-{	register char *p = buf;
-	char *bend;
-	char *endfn;
-	int contin;
-
-	if ( !isidfirstchar(*p) )
-	  return 0;		/* no name at left margin */
-	bend = skipspace(buf + strlen(buf) - 1, -1);
-	switch ( *bend )
-	   {
-	   case ';': contin = 0 /*2*/; break;
-	   case ')': contin = 1; break;
-	   case '{': return 0;		/* not a function */
-	   case '}': return 0;		/* not a function */
-	   default: contin = -1;
-	   }
-	while ( isidchar(*p) )
-	  p++;
-	endfn = p;
-	p = skipspace(p, 1);
-	if ( *p++ != '(' )
-	  return 0;		/* not a function */
-	p = skipspace(p, 1);
-	if ( *p == ')' )
-	  return 0;		/* no parameters */
-	/* Check that the apparent function name isn't a keyword. */
-	/* We only need to check for keywords that could be followed */
-	/* by a left parenthesis (which, unfortunately, is most of them). */
-	   {	static char *words[] =
-		   {	"asm", "auto", "case", "char", "const", "double",
-			"extern", "float", "for", "if", "int", "long",
-			"register", "return", "short", "signed", "sizeof",
-			"static", "switch", "typedef", "unsigned",
-			"void", "volatile", "while", 0
-		   };
-		char **key = words;
-		char *kp;
-		int len = endfn - buf;
-
-		while ( (kp = *key) != 0 )
-		   {	if ( strlen(kp) == len && !strncmp(kp, buf, len) )
-			  return 0;	/* name is a keyword */
-			key++;
-		   }
-	   }
-	return contin;
-}
-
-/* Convert a recognized function definition or header to K&R syntax. */
-int
-convert1(buf, out, header, convert_varargs)
-    char *buf;
-    FILE *out;
-    int header;			/* Boolean */
-    int convert_varargs;	/* Boolean */
-{	char *endfn;
-	register char *p;
-	char **breaks;
-	unsigned num_breaks = 2;	/* for testing */
-	char **btop;
-	char **bp;
-	char **ap;
-	char *vararg = 0;
-
-	/* Pre-ANSI implementations don't agree on whether strchr */
-	/* is called strchr or index, so we open-code it here. */
-	for ( endfn = buf; *(endfn++) != '('; )
-	  ;
-top:	p = endfn;
-	breaks = (char **)malloc(sizeof(char *) * num_breaks * 2);
-	if ( breaks == 0 )
-	   {	/* Couldn't allocate break table, give up */
-		fprintf(stderr, "Unable to allocate break table!\n");
-		fputs(buf, out);
-		return -1;
-	   }
-	btop = breaks + num_breaks * 2 - 2;
-	bp = breaks;
-	/* Parse the argument list */
-	do
-	   {	int level = 0;
-		char *lp = NULL;
-		char *rp;
-		char *end = NULL;
-
-		if ( bp >= btop )
-		   {	/* Filled up break table. */
-			/* Allocate a bigger one and start over. */
-			free((char *)breaks);
-			num_breaks <<= 1;
-			goto top;
-		   }
-		*bp++ = p;
-		/* Find the end of the argument */
-		for ( ; end == NULL; p++ )
-		   {	switch(*p)
-			   {
-			   case ',':
-				if ( !level ) end = p;
-				break;
-			   case '(':
-				if ( !level ) lp = p;
-				level++;
-				break;
-			   case ')':
-				if ( --level < 0 ) end = p;
-				else rp = p;
-				break;
-			   case '/':
-				p = skipspace(p, 1) - 1;
-				break;
-			   default:
-				;
-			   }
-		   }
-		/* Erase any embedded prototype parameters. */
-		if ( lp )
-		  writeblanks(lp + 1, rp);
-		p--;			/* back up over terminator */
-		/* Find the name being declared. */
-		/* This is complicated because of procedure and */
-		/* array modifiers. */
-		for ( ; ; )
-		   {	p = skipspace(p - 1, -1);
-			switch ( *p )
-			   {
-			   case ']':	/* skip array dimension(s) */
-			   case ')':	/* skip procedure args OR name */
-			   {	int level = 1;
-				while ( level )
-				 switch ( *--p )
-				   {
-				   case ']': case ')': level++; break;
-				   case '[': case '(': level--; break;
-				   case '/': p = skipspace(p, -1) + 1; break;
-				   default: ;
-				   }
-			   }
-				if ( *p == '(' && *skipspace(p + 1, 1) == '*' )
-				   {	/* We found the name being declared */
-					while ( !isidfirstchar(*p) )
-					  p = skipspace(p, 1) + 1;
-					goto found;
-				   }
-				break;
-			   default:
-				goto found;
-			   }
-		   }
-found:		if ( *p == '.' && p[-1] == '.' && p[-2] == '.' )
-		  {	if ( convert_varargs )
-			  {	*bp++ = "va_alist";
-				vararg = p-2;
-			  }
-			else
-			  {	p++;
-				if ( bp == breaks + 1 )	/* sole argument */
-				  writeblanks(breaks[0], p);
-				else
-				  writeblanks(bp[-1] - 1, p);
-				bp--;
-			  }
-		   }
-		else
-		   {	while ( isidchar(*p) ) p--;
-			*bp++ = p+1;
-		   }
-		p = end;
-	   }
-	while ( *p++ == ',' );
-	*bp = p;
-	/* Make a special check for 'void' arglist */
-	if ( bp == breaks+2 )
-	   {	p = skipspace(breaks[0], 1);
-		if ( !strncmp(p, "void", 4) )
-		   {	p = skipspace(p+4, 1);
-			if ( p == breaks[2] - 1 )
-			   {	bp = breaks;	/* yup, pretend arglist is empty */
-				writeblanks(breaks[0], p + 1);
-			   }
-		   }
-	   }
-	/* Put out the function name and left parenthesis. */
-	p = buf;
-	while ( p != endfn ) putc(*p, out), p++;
-	/* Put out the declaration. */
-	if ( header )
-	  {	fputs(");", out);
-		for ( p = breaks[0]; *p; p++ )
-		  if ( *p == '\r' || *p == '\n' )
-		    putc(*p, out);
-	  }
-	else
-	  {	for ( ap = breaks+1; ap < bp; ap += 2 )
-		  {	p = *ap;
-			while ( isidchar(*p) )
-			  putc(*p, out), p++;
-			if ( ap < bp - 1 )
-			  fputs(", ", out);
-		  }
-		fputs(")  ", out);
-		/* Put out the argument declarations */
-		for ( ap = breaks+2; ap <= bp; ap += 2 )
-		  (*ap)[-1] = ';';
-		if ( vararg != 0 )
-		  {	*vararg = 0;
-			fputs(breaks[0], out);		/* any prior args */
-			fputs("va_dcl", out);		/* the final arg */
-			fputs(bp[0], out);
-		  }
-		else
-		  fputs(breaks[0], out);
-	  }
-	free((char *)breaks);
-	return 0;
-}

diff --git a/bmp.c b/bmp.c
new file mode 100644
index 0000000..3ccc877
--- /dev/null
+++ b/bmp.c

@@ -0,0 +1,370 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+*/
+
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifdef _WIN32
+ #include <io.h>
+#else
+ #include <unistd.h>
+#endif
+#include "./rrutil.h"
+#include "./bmp.h"
+
+#ifndef BI_BITFIELDS
+#define BI_BITFIELDS 3L
+#endif
+#ifndef BI_RGB
+#define BI_RGB 0L
+#endif
+
+#define BMPHDRSIZE 54
+typedef struct _bmphdr
+{
+	unsigned short bfType;
+	unsigned int bfSize;
+	unsigned short bfReserved1, bfReserved2;
+	unsigned int bfOffBits;
+
+	unsigned int biSize;
+	int biWidth, biHeight;
+	unsigned short biPlanes, biBitCount;
+	unsigned int biCompression, biSizeImage;
+	int biXPelsPerMeter, biYPelsPerMeter;
+	unsigned int biClrUsed, biClrImportant;
+} bmphdr;
+
+static const char *__bmperr="No error";
+
+static const int ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4};
+static const int roffset[BMPPIXELFORMATS]={0, 0, 2, 2, 3, 1};
+static const int goffset[BMPPIXELFORMATS]={1, 1, 1, 1, 2, 2};
+static const int boffset[BMPPIXELFORMATS]={2, 2, 0, 0, 1, 3};
+
+#define _throw(m) {__bmperr=m;  retcode=-1;  goto finally;}
+#define _unix(f) {if((f)==-1) _throw(strerror(errno));}
+#define _catch(f) {if((f)==-1) {retcode=-1;  goto finally;}}
+
+#define readme(fd, addr, size) \
+	if((bytesread=read(fd, addr, (size)))==-1) _throw(strerror(errno)); \
+	if(bytesread!=(size)) _throw("Read error");
+
+void pixelconvert(unsigned char *srcbuf, enum BMPPIXELFORMAT srcformat,
+	int srcpitch, unsigned char *dstbuf, enum BMPPIXELFORMAT dstformat, int dstpitch,
+	int w, int h, int flip)
+{
+	unsigned char *srcptr, *srcptr0, *dstptr, *dstptr0;
+	int i, j;
+
+	srcptr=flip? &srcbuf[srcpitch*(h-1)]:srcbuf;
+	for(j=0, dstptr=dstbuf; j<h; j++,
+		srcptr+=flip? -srcpitch:srcpitch, dstptr+=dstpitch)
+	{
+		for(i=0, srcptr0=srcptr, dstptr0=dstptr; i<w; i++,
+			srcptr0+=ps[srcformat], dstptr0+=ps[dstformat])
+		{
+			dstptr0[roffset[dstformat]]=srcptr0[roffset[srcformat]];
+			dstptr0[goffset[dstformat]]=srcptr0[goffset[srcformat]];
+			dstptr0[boffset[dstformat]]=srcptr0[boffset[srcformat]];
+		}
+	}
+}
+
+int loadppm(int *fd, unsigned char **buf, int *w, int *h,
+	enum BMPPIXELFORMAT f, int align, int dstbottomup, int ascii)
+{
+	FILE *fs=NULL;  int retcode=0, scalefactor, dstpitch;
+	unsigned char *tempbuf=NULL;  char temps[255], temps2[255];
+	int numread=0, totalread=0, pixel[3], i, j;
+
+	if((fs=fdopen(*fd, "r"))==NULL) _throw(strerror(errno));
+
+	do
+	{
+		if(!fgets(temps, 255, fs)) _throw("Read error");
+		if(strlen(temps)==0 || temps[0]=='\n') continue;
+		if(sscanf(temps, "%s", temps2)==1 && temps2[1]=='#') continue;
+		switch(totalread)
+		{
+			case 0:
+				if((numread=sscanf(temps, "%d %d %d", w, h, &scalefactor))==EOF)
+					_throw("Read error");
+				break;
+			case 1:
+				if((numread=sscanf(temps, "%d %d", h, &scalefactor))==EOF)
+					_throw("Read error");
+				break;
+			case 2:
+				if((numread=sscanf(temps, "%d", &scalefactor))==EOF)
+					_throw("Read error");
+				break;
+		}
+		totalread+=numread;
+	} while(totalread<3);
+	if((*w)<1 || (*h)<1 || scalefactor<1) _throw("Corrupt PPM header");
+
+	dstpitch=(((*w)*ps[f])+(align-1))&(~(align-1));
+	if((*buf=(unsigned char *)malloc(dstpitch*(*h)))==NULL)
+		_throw("Memory allocation error");
+	if(ascii)
+	{
+		for(j=0; j<*h; j++)
+		{
+			for(i=0; i<*w; i++)
+			{
+				if(fscanf(fs, "%d%d%d", &pixel[0], &pixel[1], &pixel[2])!=3)
+					_throw("Read error");
+				(*buf)[j*dstpitch+i*ps[f]+roffset[f]]=(unsigned char)(pixel[0]*255/scalefactor);
+				(*buf)[j*dstpitch+i*ps[f]+goffset[f]]=(unsigned char)(pixel[1]*255/scalefactor);
+				(*buf)[j*dstpitch+i*ps[f]+boffset[f]]=(unsigned char)(pixel[2]*255/scalefactor);
+			}
+		}
+	}
+	else
+	{
+		if(scalefactor!=255)
+			_throw("Binary PPMs must have 8-bit components");
+		if((tempbuf=(unsigned char *)malloc((*w)*(*h)*3))==NULL)
+			_throw("Memory allocation error");
+		if(fread(tempbuf, (*w)*(*h)*3, 1, fs)!=1) _throw("Read error");
+		pixelconvert(tempbuf, BMP_RGB, (*w)*3, *buf, f, dstpitch, *w, *h, dstbottomup);
+	}
+
+	finally:
+	if(fs) {fclose(fs);  *fd=-1;}
+	if(tempbuf) free(tempbuf);
+	return retcode;
+}
+
+
+int loadbmp(char *filename, unsigned char **buf, int *w, int *h, 
+	enum BMPPIXELFORMAT f, int align, int dstbottomup)
+{
+	int fd=-1, bytesread, srcpitch, srcbottomup=1, srcps, dstpitch,
+		retcode=0;
+	unsigned char *tempbuf=NULL;
+	bmphdr bh;  int flags=O_RDONLY;
+
+	dstbottomup=dstbottomup? 1:0;
+	#ifdef _WIN32
+	flags|=O_BINARY;
+	#endif
+	if(!filename || !buf || !w || !h || f<0 || f>BMPPIXELFORMATS-1 || align<1)
+		_throw("invalid argument to loadbmp()");
+	if((align&(align-1))!=0)
+		_throw("Alignment must be a power of 2");
+	_unix(fd=open(filename, flags));
+
+	readme(fd, &bh.bfType, sizeof(unsigned short));
+	if(!littleendian())	bh.bfType=byteswap16(bh.bfType);
+
+	if(bh.bfType==0x3650)
+	{
+		_catch(loadppm(&fd, buf, w, h, f, align, dstbottomup, 0));
+		goto finally;
+	}
+	if(bh.bfType==0x3350)
+	{
+		_catch(loadppm(&fd, buf, w, h, f, align, dstbottomup, 1));
+		goto finally;
+	}
+
+	readme(fd, &bh.bfSize, sizeof(unsigned int));
+	readme(fd, &bh.bfReserved1, sizeof(unsigned short));
+	readme(fd, &bh.bfReserved2, sizeof(unsigned short));
+	readme(fd, &bh.bfOffBits, sizeof(unsigned int));
+	readme(fd, &bh.biSize, sizeof(unsigned int));
+	readme(fd, &bh.biWidth, sizeof(int));
+	readme(fd, &bh.biHeight, sizeof(int));
+	readme(fd, &bh.biPlanes, sizeof(unsigned short));
+	readme(fd, &bh.biBitCount, sizeof(unsigned short));
+	readme(fd, &bh.biCompression, sizeof(unsigned int));
+	readme(fd, &bh.biSizeImage, sizeof(unsigned int));
+	readme(fd, &bh.biXPelsPerMeter, sizeof(int));
+	readme(fd, &bh.biYPelsPerMeter, sizeof(int));
+	readme(fd, &bh.biClrUsed, sizeof(unsigned int));
+	readme(fd, &bh.biClrImportant, sizeof(unsigned int));
+
+	if(!littleendian())
+	{
+		bh.bfSize=byteswap(bh.bfSize);
+		bh.bfOffBits=byteswap(bh.bfOffBits);
+		bh.biSize=byteswap(bh.biSize);
+		bh.biWidth=byteswap(bh.biWidth);
+		bh.biHeight=byteswap(bh.biHeight);
+		bh.biPlanes=byteswap16(bh.biPlanes);
+		bh.biBitCount=byteswap16(bh.biBitCount);
+		bh.biCompression=byteswap(bh.biCompression);
+		bh.biSizeImage=byteswap(bh.biSizeImage);
+		bh.biXPelsPerMeter=byteswap(bh.biXPelsPerMeter);
+		bh.biYPelsPerMeter=byteswap(bh.biYPelsPerMeter);
+		bh.biClrUsed=byteswap(bh.biClrUsed);
+		bh.biClrImportant=byteswap(bh.biClrImportant);
+	}
+
+	if(bh.bfType!=0x4d42 || bh.bfOffBits<BMPHDRSIZE
+	|| bh.biWidth<1 || bh.biHeight==0)
+		_throw("Corrupt bitmap header");
+	if((bh.biBitCount!=24 && bh.biBitCount!=32) || bh.biCompression!=BI_RGB)
+		_throw("Only uncompessed RGB bitmaps are supported");
+
+	*w=bh.biWidth;  *h=bh.biHeight;  srcps=bh.biBitCount/8;
+	if(*h<0) {*h=-(*h);  srcbottomup=0;}
+	srcpitch=(((*w)*srcps)+3)&(~3);
+	dstpitch=(((*w)*ps[f])+(align-1))&(~(align-1));
+
+	if(srcpitch*(*h)+bh.bfOffBits!=bh.bfSize) _throw("Corrupt bitmap header");
+	if((tempbuf=(unsigned char *)malloc(srcpitch*(*h)))==NULL
+	|| (*buf=(unsigned char *)malloc(dstpitch*(*h)))==NULL)
+		_throw("Memory allocation error");
+	if(lseek(fd, (long)bh.bfOffBits, SEEK_SET)!=(long)bh.bfOffBits)
+		_throw(strerror(errno));
+	_unix(bytesread=read(fd, tempbuf, srcpitch*(*h)));
+	if(bytesread!=srcpitch*(*h)) _throw("Read error");
+
+	pixelconvert(tempbuf, BMP_BGR, srcpitch, *buf, f, dstpitch, *w, *h, 
+		srcbottomup!=dstbottomup);
+
+	finally:
+	if(tempbuf) free(tempbuf);
+	if(fd!=-1) close(fd);
+	return retcode;
+}
+
+#define writeme(fd, addr, size) \
+	if((byteswritten=write(fd, addr, (size)))==-1) _throw(strerror(errno)); \
+	if(byteswritten!=(size)) _throw("Write error");
+
+int saveppm(char *filename, unsigned char *buf, int w, int h,
+	enum BMPPIXELFORMAT f, int srcpitch, int srcbottomup)
+{
+	FILE *fs=NULL;  int retcode=0;
+	unsigned char *tempbuf=NULL;
+
+	if((fs=fopen(filename, "wb"))==NULL) _throw(strerror(errno));
+	if(fprintf(fs, "P6\n")<1) _throw("Write error");
+	if(fprintf(fs, "%d %d\n", w, h)<1) _throw("Write error");
+	if(fprintf(fs, "255\n")<1) _throw("Write error");
+
+	if((tempbuf=(unsigned char *)malloc(w*h*3))==NULL)
+		_throw("Memory allocation error");
+
+	pixelconvert(buf, f, srcpitch, tempbuf, BMP_RGB, w*3, w, h, 
+		srcbottomup);
+
+	if((fwrite(tempbuf, w*h*3, 1, fs))!=1) _throw("Write error");
+
+	finally:
+	if(tempbuf) free(tempbuf);
+	if(fs) fclose(fs);
+	return retcode;
+}
+
+int savebmp(char *filename, unsigned char *buf, int w, int h,
+	enum BMPPIXELFORMAT f, int srcpitch, int srcbottomup)
+{
+	int fd=-1, byteswritten, dstpitch, retcode=0;
+	int flags=O_RDWR|O_CREAT|O_TRUNC;
+	unsigned char *tempbuf=NULL;  char *temp;
+	bmphdr bh;  int mode;
+
+	#ifdef _WIN32
+	flags|=O_BINARY;  mode=_S_IREAD|_S_IWRITE;
+	#else
+	mode=S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH;
+	#endif
+	if(!filename || !buf || w<1 || h<1 || f<0 || f>BMPPIXELFORMATS-1 || srcpitch<0)
+		_throw("bad argument to savebmp()");
+
+	if(srcpitch==0) srcpitch=w*ps[f];
+
+	if((temp=strrchr(filename, '.'))!=NULL)
+	{
+		if(!stricmp(temp, ".ppm"))
+			return saveppm(filename, buf, w, h, f, srcpitch, srcbottomup);
+	}
+
+	_unix(fd=open(filename, flags, mode));
+	dstpitch=((w*3)+3)&(~3);
+
+	bh.bfType=0x4d42;
+	bh.bfSize=BMPHDRSIZE+dstpitch*h;
+	bh.bfReserved1=0;  bh.bfReserved2=0;
+	bh.bfOffBits=BMPHDRSIZE;
+	bh.biSize=40;
+	bh.biWidth=w;  bh.biHeight=h;
+	bh.biPlanes=0;  bh.biBitCount=24;
+	bh.biCompression=BI_RGB;  bh.biSizeImage=0;
+	bh.biXPelsPerMeter=0;  bh.biYPelsPerMeter=0;
+	bh.biClrUsed=0;  bh.biClrImportant=0;
+
+	if(!littleendian())
+	{
+		bh.bfType=byteswap16(bh.bfType);
+		bh.bfSize=byteswap(bh.bfSize);
+		bh.bfOffBits=byteswap(bh.bfOffBits);
+		bh.biSize=byteswap(bh.biSize);
+		bh.biWidth=byteswap(bh.biWidth);
+		bh.biHeight=byteswap(bh.biHeight);
+		bh.biPlanes=byteswap16(bh.biPlanes);
+		bh.biBitCount=byteswap16(bh.biBitCount);
+		bh.biCompression=byteswap(bh.biCompression);
+		bh.biSizeImage=byteswap(bh.biSizeImage);
+		bh.biXPelsPerMeter=byteswap(bh.biXPelsPerMeter);
+		bh.biYPelsPerMeter=byteswap(bh.biYPelsPerMeter);
+		bh.biClrUsed=byteswap(bh.biClrUsed);
+		bh.biClrImportant=byteswap(bh.biClrImportant);
+	}
+
+	writeme(fd, &bh.bfType, sizeof(unsigned short));
+	writeme(fd, &bh.bfSize, sizeof(unsigned int));
+	writeme(fd, &bh.bfReserved1, sizeof(unsigned short));
+	writeme(fd, &bh.bfReserved2, sizeof(unsigned short));
+	writeme(fd, &bh.bfOffBits, sizeof(unsigned int));
+	writeme(fd, &bh.biSize, sizeof(unsigned int));
+	writeme(fd, &bh.biWidth, sizeof(int));
+	writeme(fd, &bh.biHeight, sizeof(int));
+	writeme(fd, &bh.biPlanes, sizeof(unsigned short));
+	writeme(fd, &bh.biBitCount, sizeof(unsigned short));
+	writeme(fd, &bh.biCompression, sizeof(unsigned int));
+	writeme(fd, &bh.biSizeImage, sizeof(unsigned int));
+	writeme(fd, &bh.biXPelsPerMeter, sizeof(int));
+	writeme(fd, &bh.biYPelsPerMeter, sizeof(int));
+	writeme(fd, &bh.biClrUsed, sizeof(unsigned int));
+	writeme(fd, &bh.biClrImportant, sizeof(unsigned int));
+
+	if((tempbuf=(unsigned char *)malloc(dstpitch*h))==NULL)
+		_throw("Memory allocation error");
+
+	pixelconvert(buf, f, srcpitch, tempbuf, BMP_BGR, dstpitch, w, h, 
+		!srcbottomup);
+
+	if((byteswritten=write(fd, tempbuf, dstpitch*h))!=dstpitch*h)
+		_throw(strerror(errno));
+
+	finally:
+	if(tempbuf) free(tempbuf);
+	if(fd!=-1) close(fd);
+	return retcode;
+}
+
+const char *bmpgeterr(void)
+{
+	return __bmperr;
+}

diff --git a/bmp.h b/bmp.h
new file mode 100644
index 0000000..437d327
--- /dev/null
+++ b/bmp.h

@@ -0,0 +1,48 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+*/
+
+// This provides rudimentary facilities for loading and saving true color
+// BMP and PPM files
+
+#ifndef __BMP_H__
+#define __BMP_H__
+
+#define BMPPIXELFORMATS 6
+enum BMPPIXELFORMAT {BMP_RGB=0, BMP_RGBA, BMP_BGR, BMP_BGRA, BMP_ABGR, BMP_ARGB};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This will load a Windows bitmap from a file and return a buffer with the
+// specified pixel format, scanline alignment, and orientation.  The width and
+// height are returned in w and h.
+
+int loadbmp(char *filename, unsigned char **buf, int *w, int *h,
+	enum BMPPIXELFORMAT f, int align, int dstbottomup);
+
+// This will save a buffer with the specified pixel format, pitch, orientation,
+// width, and height as a 24-bit Windows bitmap or PPM (the filename determines
+// which format to use)
+
+int savebmp(char *filename, unsigned char *buf, int w, int h,
+	enum BMPPIXELFORMAT f, int srcpitch, int srcbottomup);
+
+const char *bmpgeterr(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

diff --git a/cjpeg.1 b/cjpeg.1
deleted file mode 100644
index d175a96..0000000
--- a/cjpeg.1
+++ /dev/null

@@ -1,292 +0,0 @@
-.TH CJPEG 1 "20 March 1998"
-.SH NAME
-cjpeg \- compress an image file to a JPEG file
-.SH SYNOPSIS
-.B cjpeg
-[
-.I options
-]
-[
-.I filename
-]
-.LP
-.SH DESCRIPTION
-.LP
-.B cjpeg
-compresses the named image file, or the standard input if no file is
-named, and produces a JPEG/JFIF file on the standard output.
-The currently supported input file formats are: PPM (PBMPLUS color
-format), PGM (PBMPLUS gray-scale format), BMP, Targa, and RLE (Utah Raster
-Toolkit format).  (RLE is supported only if the URT library is available.)
-.SH OPTIONS
-All switch names may be abbreviated; for example,
-.B \-grayscale
-may be written
-.B \-gray
-or
-.BR \-gr .
-Most of the "basic" switches can be abbreviated to as little as one letter.
-Upper and lower case are equivalent (thus
-.B \-BMP
-is the same as
-.BR \-bmp ).
-British spellings are also accepted (e.g.,
-.BR \-greyscale ),
-though for brevity these are not mentioned below.
-.PP
-The basic switches are:
-.TP
-.BI \-quality " N"
-Scale quantization tables to adjust image quality.  Quality is 0 (worst) to
-100 (best); default is 75.  (See below for more info.)
-.TP
-.B \-grayscale
-Create monochrome JPEG file from color input.  Be sure to use this switch when
-compressing a grayscale BMP file, because
-.B cjpeg
-isn't bright enough to notice whether a BMP file uses only shades of gray.
-By saying
-.BR \-grayscale ,
-you'll get a smaller JPEG file that takes less time to process.
-.TP
-.B \-optimize
-Perform optimization of entropy encoding parameters.  Without this, default
-encoding parameters are used.
-.B \-optimize
-usually makes the JPEG file a little smaller, but
-.B cjpeg
-runs somewhat slower and needs much more memory.  Image quality and speed of
-decompression are unaffected by
-.BR \-optimize .
-.TP
-.B \-progressive
-Create progressive JPEG file (see below).
-.TP
-.B \-targa
-Input file is Targa format.  Targa files that contain an "identification"
-field will not be automatically recognized by
-.BR cjpeg ;
-for such files you must specify
-.B \-targa
-to make
-.B cjpeg
-treat the input as Targa format.
-For most Targa files, you won't need this switch.
-.PP
-The
-.B \-quality
-switch lets you trade off compressed file size against quality of the
-reconstructed image: the higher the quality setting, the larger the JPEG file,
-and the closer the output image will be to the original input.  Normally you
-want to use the lowest quality setting (smallest file) that decompresses into
-something visually indistinguishable from the original image.  For this
-purpose the quality setting should be between 50 and 95; the default of 75 is
-often about right.  If you see defects at
-.B \-quality
-75, then go up 5 or 10 counts at a time until you are happy with the output
-image.  (The optimal setting will vary from one image to another.)
-.PP
-.B \-quality
-100 will generate a quantization table of all 1's, minimizing loss in the
-quantization step (but there is still information loss in subsampling, as well
-as roundoff error).  This setting is mainly of interest for experimental
-purposes.  Quality values above about 95 are
-.B not
-recommended for normal use; the compressed file size goes up dramatically for
-hardly any gain in output image quality.
-.PP
-In the other direction, quality values below 50 will produce very small files
-of low image quality.  Settings around 5 to 10 might be useful in preparing an
-index of a large image library, for example.  Try
-.B \-quality
-2 (or so) for some amusing Cubist effects.  (Note: quality
-values below about 25 generate 2-byte quantization tables, which are
-considered optional in the JPEG standard.
-.B cjpeg
-emits a warning message when you give such a quality value, because some
-other JPEG programs may be unable to decode the resulting file.  Use
-.B \-baseline
-if you need to ensure compatibility at low quality values.)
-.PP
-The
-.B \-progressive
-switch creates a "progressive JPEG" file.  In this type of JPEG file, the data
-is stored in multiple scans of increasing quality.  If the file is being
-transmitted over a slow communications link, the decoder can use the first
-scan to display a low-quality image very quickly, and can then improve the
-display with each subsequent scan.  The final image is exactly equivalent to a
-standard JPEG file of the same quality setting, and the total file size is
-about the same --- often a little smaller.
-.B Caution:
-progressive JPEG is not yet widely implemented, so many decoders will be
-unable to view a progressive JPEG file at all.
-.PP
-Switches for advanced users:
-.TP
-.B \-dct int
-Use integer DCT method (default).
-.TP
-.B \-dct fast
-Use fast integer DCT (less accurate).
-.TP
-.B \-dct float
-Use floating-point DCT method.
-The float method is very slightly more accurate than the int method, but is
-much slower unless your machine has very fast floating-point hardware.  Also
-note that results of the floating-point method may vary slightly across
-machines, while the integer methods should give the same results everywhere.
-The fast integer method is much less accurate than the other two.
-.TP
-.BI \-restart " N"
-Emit a JPEG restart marker every N MCU rows, or every N MCU blocks if "B" is
-attached to the number.
-.B \-restart 0
-(the default) means no restart markers.
-.TP
-.BI \-smooth " N"
-Smooth the input image to eliminate dithering noise.  N, ranging from 1 to
-100, indicates the strength of smoothing.  0 (the default) means no smoothing.
-.TP
-.BI \-maxmemory " N"
-Set limit for amount of memory to use in processing large images.  Value is
-in thousands of bytes, or millions of bytes if "M" is attached to the
-number.  For example,
-.B \-max 4m
-selects 4000000 bytes.  If more space is needed, temporary files will be used.
-.TP
-.BI \-outfile " name"
-Send output image to the named file, not to standard output.
-.TP
-.B \-verbose
-Enable debug printout.  More
-.BR \-v 's
-give more output.  Also, version information is printed at startup.
-.TP
-.B \-debug
-Same as
-.BR \-verbose .
-.PP
-The
-.B \-restart
-option inserts extra markers that allow a JPEG decoder to resynchronize after
-a transmission error.  Without restart markers, any damage to a compressed
-file will usually ruin the image from the point of the error to the end of the
-image; with restart markers, the damage is usually confined to the portion of
-the image up to the next restart marker.  Of course, the restart markers
-occupy extra space.  We recommend
-.B \-restart 1
-for images that will be transmitted across unreliable networks such as Usenet.
-.PP
-The
-.B \-smooth
-option filters the input to eliminate fine-scale noise.  This is often useful
-when converting dithered images to JPEG: a moderate smoothing factor of 10 to
-50 gets rid of dithering patterns in the input file, resulting in a smaller
-JPEG file and a better-looking image.  Too large a smoothing factor will
-visibly blur the image, however.
-.PP
-Switches for wizards:
-.TP
-.B \-baseline
-Force baseline-compatible quantization tables to be generated.  This clamps
-quantization values to 8 bits even at low quality settings.  (This switch is
-poorly named, since it does not ensure that the output is actually baseline
-JPEG.  For example, you can use
-.B \-baseline
-and
-.B \-progressive
-together.)
-.TP
-.BI \-qtables " file"
-Use the quantization tables given in the specified text file.
-.TP
-.BI \-qslots " N[,...]"
-Select which quantization table to use for each color component.
-.TP
-.BI \-sample " HxV[,...]"
-Set JPEG sampling factors for each color component.
-.TP
-.BI \-scans " file"
-Use the scan script given in the specified text file.
-.PP
-The "wizard" switches are intended for experimentation with JPEG.  If you
-don't know what you are doing, \fBdon't use them\fR.  These switches are
-documented further in the file wizard.doc.
-.SH EXAMPLES
-.LP
-This example compresses the PPM file foo.ppm with a quality factor of
-60 and saves the output as foo.jpg:
-.IP
-.B cjpeg \-quality
-.I 60 foo.ppm
-.B >
-.I foo.jpg
-.SH HINTS
-Color GIF files are not the ideal input for JPEG; JPEG is really intended for
-compressing full-color (24-bit) images.  In particular, don't try to convert
-cartoons, line drawings, and other images that have only a few distinct
-colors.  GIF works great on these, JPEG does not.  If you want to convert a
-GIF to JPEG, you should experiment with
-.BR cjpeg 's
-.B \-quality
-and
-.B \-smooth
-options to get a satisfactory conversion.
-.B \-smooth 10
-or so is often helpful.
-.PP
-Avoid running an image through a series of JPEG compression/decompression
-cycles.  Image quality loss will accumulate; after ten or so cycles the image
-may be noticeably worse than it was after one cycle.  It's best to use a
-lossless format while manipulating an image, then convert to JPEG format when
-you are ready to file the image away.
-.PP
-The
-.B \-optimize
-option to
-.B cjpeg
-is worth using when you are making a "final" version for posting or archiving.
-It's also a win when you are using low quality settings to make very small
-JPEG files; the percentage improvement is often a lot more than it is on
-larger files.  (At present,
-.B \-optimize
-mode is always selected when generating progressive JPEG files.)
-.SH ENVIRONMENT
-.TP
-.B JPEGMEM
-If this environment variable is set, its value is the default memory limit.
-The value is specified as described for the
-.B \-maxmemory
-switch.
-.B JPEGMEM
-overrides the default value specified when the program was compiled, and
-itself is overridden by an explicit
-.BR \-maxmemory .
-.SH SEE ALSO
-.BR djpeg (1),
-.BR jpegtran (1),
-.BR rdjpgcom (1),
-.BR wrjpgcom (1)
-.br
-.BR ppm (5),
-.BR pgm (5)
-.br
-Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
-Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
-.SH AUTHOR
-Independent JPEG Group
-.SH BUGS
-Arithmetic coding is not supported for legal reasons.
-.PP
-GIF input files are no longer supported, to avoid the Unisys LZW patent.
-Use a Unisys-licensed program if you need to read a GIF file.  (Conversion
-of GIF files to JPEG is usually a bad idea anyway.)
-.PP
-Not all variants of BMP and Targa file formats are supported.
-.PP
-The
-.B \-targa
-switch is not a bug, it's a feature.  (It would be a bug if the Targa format
-designers had not been clueless.)
-.PP
-Still not as fast as we'd like.

diff --git a/ckconfig.c b/ckconfig.c
deleted file mode 100644
index 34baf79..0000000
--- a/ckconfig.c
+++ /dev/null

@@ -1,402 +0,0 @@
-/*
- * ckconfig.c
- *
- * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- */
-
-/*
- * This program is intended to help you determine how to configure the JPEG
- * software for installation on a particular system.  The idea is to try to
- * compile and execute this program.  If your compiler fails to compile the
- * program, make changes as indicated in the comments below.  Once you can
- * compile the program, run it, and it will produce a "jconfig.h" file for
- * your system.
- *
- * As a general rule, each time you try to compile this program,
- * pay attention only to the *first* error message you get from the compiler.
- * Many C compilers will issue lots of spurious error messages once they
- * have gotten confused.  Go to the line indicated in the first error message,
- * and read the comments preceding that line to see what to change.
- *
- * Almost all of the edits you may need to make to this program consist of
- * changing a line that reads "#define SOME_SYMBOL" to "#undef SOME_SYMBOL",
- * or vice versa.  This is called defining or undefining that symbol.
- */
-
-
-/* First we must see if your system has the include files we need.
- * We start out with the assumption that your system has all the ANSI-standard
- * include files.  If you get any error trying to include one of these files,
- * undefine the corresponding HAVE_xxx symbol.
- */
-
-#define HAVE_STDDEF_H		/* replace 'define' by 'undef' if error here */
-#ifdef HAVE_STDDEF_H		/* next line will be skipped if you undef... */
-#include <stddef.h>
-#endif
-
-#define HAVE_STDLIB_H		/* same thing for stdlib.h */
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-
-#include <stdio.h>		/* If you ain't got this, you ain't got C. */
-
-/* We have to see if your string functions are defined by
- * strings.h (old BSD convention) or string.h (everybody else).
- * We try the non-BSD convention first; define NEED_BSD_STRINGS
- * if the compiler says it can't find string.h.
- */
-
-#undef NEED_BSD_STRINGS
-
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#else
-#include <string.h>
-#endif
-
-/* On some systems (especially older Unix machines), type size_t is
- * defined only in the include file <sys/types.h>.  If you get a failure
- * on the size_t test below, try defining NEED_SYS_TYPES_H.
- */
-
-#undef NEED_SYS_TYPES_H		/* start by assuming we don't need it */
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
-
-/* Usually type size_t is defined in one of the include files we've included
- * above.  If not, you'll get an error on the "typedef size_t my_size_t;" line.
- * In that case, first try defining NEED_SYS_TYPES_H just above.
- * If that doesn't work, you'll have to search through your system library
- * to figure out which include file defines "size_t".  Look for a line that
- * says "typedef something-or-other size_t;".  Then, change the line below
- * that says "#include <someincludefile.h>" to instead include the file
- * you found size_t in, and define NEED_SPECIAL_INCLUDE.  If you can't find
- * type size_t anywhere, try replacing "#include <someincludefile.h>" with
- * "typedef unsigned int size_t;".
- */
-
-#undef NEED_SPECIAL_INCLUDE	/* assume we DON'T need it, for starters */
-
-#ifdef NEED_SPECIAL_INCLUDE
-#include <someincludefile.h>
-#endif
-
-typedef size_t my_size_t;	/* The payoff: do we have size_t now? */
-
-
-/* The next question is whether your compiler supports ANSI-style function
- * prototypes.  You need to know this in order to choose between using
- * makefile.ansi and using makefile.unix.
- * The #define line below is set to assume you have ANSI function prototypes.
- * If you get an error in this group of lines, undefine HAVE_PROTOTYPES.
- */
-
-#define HAVE_PROTOTYPES
-
-#ifdef HAVE_PROTOTYPES
-int testfunction (int arg1, int * arg2); /* check prototypes */
-
-struct methods_struct {		/* check method-pointer declarations */
-  int (*error_exit) (char *msgtext);
-  int (*trace_message) (char *msgtext);
-  int (*another_method) (void);
-};
-
-int testfunction (int arg1, int * arg2) /* check definitions */
-{
-  return arg2[arg1];
-}
-
-int test2function (void)	/* check void arg list */
-{
-  return 0;
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned char" means.
- * If you get an error on the "unsigned char un_char;" line,
- * then undefine HAVE_UNSIGNED_CHAR.
- */
-
-#define HAVE_UNSIGNED_CHAR
-
-#ifdef HAVE_UNSIGNED_CHAR
-unsigned char un_char;
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned short" means.
- * If you get an error on the "unsigned short un_short;" line,
- * then undefine HAVE_UNSIGNED_SHORT.
- */
-
-#define HAVE_UNSIGNED_SHORT
-
-#ifdef HAVE_UNSIGNED_SHORT
-unsigned short un_short;
-#endif
-
-
-/* Now we want to find out if your compiler understands type "void".
- * If you get an error anywhere in here, undefine HAVE_VOID.
- */
-
-#define HAVE_VOID
-
-#ifdef HAVE_VOID
-/* Caution: a C++ compiler will insist on complete prototypes */
-typedef void * void_ptr;	/* check void * */
-#ifdef HAVE_PROTOTYPES		/* check ptr to function returning void */
-typedef void (*void_func) (int a, int b);
-#else
-typedef void (*void_func) ();
-#endif
-
-#ifdef HAVE_PROTOTYPES		/* check void function result */
-void test3function (void_ptr arg1, void_func arg2)
-#else
-void test3function (arg1, arg2)
-     void_ptr arg1;
-     void_func arg2;
-#endif
-{
-  char * locptr = (char *) arg1; /* check casting to and from void * */
-  arg1 = (void *) locptr;
-  (*arg2) (1, 2);		/* check call of fcn returning void */
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "const" means.
- * If you get an error here, undefine HAVE_CONST.
- */
-
-#define HAVE_CONST
-
-#ifdef HAVE_CONST
-static const int carray[3] = {1, 2, 3};
-
-#ifdef HAVE_PROTOTYPES
-int test4function (const int arg1)
-#else
-int test4function (arg1)
-     const int arg1;
-#endif
-{
-  return carray[arg1];
-}
-#endif
-
-
-/* If you get an error or warning about this structure definition,
- * define INCOMPLETE_TYPES_BROKEN.
- */
-
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifndef INCOMPLETE_TYPES_BROKEN
-typedef struct undefined_structure * undef_struct_ptr;
-#endif
-
-
-/* If you get an error about duplicate names,
- * define NEED_SHORT_EXTERNAL_NAMES.
- */
-
-#undef NEED_SHORT_EXTERNAL_NAMES
-
-#ifndef NEED_SHORT_EXTERNAL_NAMES
-
-int possibly_duplicate_function ()
-{
-  return 0;
-}
-
-int possibly_dupli_function ()
-{
-  return 1;
-}
-
-#endif
-
-
-
-/************************************************************************
- *  OK, that's it.  You should not have to change anything beyond this
- *  point in order to compile and execute this program.  (You might get
- *  some warnings, but you can ignore them.)
- *  When you run the program, it will make a couple more tests that it
- *  can do automatically, and then it will create jconfig.h and print out
- *  any additional suggestions it has.
- ************************************************************************
- */
-
-
-#ifdef HAVE_PROTOTYPES
-int is_char_signed (int arg)
-#else
-int is_char_signed (arg)
-     int arg;
-#endif
-{
-  if (arg == 189) {		/* expected result for unsigned char */
-    return 0;			/* type char is unsigned */
-  }
-  else if (arg != -67) {	/* expected result for signed char */
-    printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
-    printf("I fear the JPEG software will not work at all.\n\n");
-  }
-  return 1;			/* assume char is signed otherwise */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int is_shifting_signed (long arg)
-#else
-int is_shifting_signed (arg)
-     long arg;
-#endif
-/* See whether right-shift on a long is signed or not. */
-{
-  long res = arg >> 4;
-
-  if (res == -0x7F7E80CL) {	/* expected result for signed shift */
-    return 1;			/* right shift is signed */
-  }
-  /* see if unsigned-shift hack will fix it. */
-  /* we can't just test exact value since it depends on width of long... */
-  res |= (~0L) << (32-4);
-  if (res == -0x7F7E80CL) {	/* expected result now? */
-    return 0;			/* right shift is unsigned */
-  }
-  printf("Right shift isn't acting as I expect it to.\n");
-  printf("I fear the JPEG software will not work at all.\n\n");
-  return 0;			/* try it with unsigned anyway */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int main (int argc, char ** argv)
-#else
-int main (argc, argv)
-     int argc;
-     char ** argv;
-#endif
-{
-  char signed_char_check = (char) (-67);
-  FILE *outfile;
-
-  /* Attempt to write jconfig.h */
-  if ((outfile = fopen("jconfig.h", "w")) == NULL) {
-    printf("Failed to write jconfig.h\n");
-    return 1;
-  }
-
-  /* Write out all the info */
-  fprintf(outfile, "/* jconfig.h --- generated by ckconfig.c */\n");
-  fprintf(outfile, "/* see jconfig.doc for explanations */\n\n");
-#ifdef HAVE_PROTOTYPES
-  fprintf(outfile, "#define HAVE_PROTOTYPES\n");
-#else
-  fprintf(outfile, "#undef HAVE_PROTOTYPES\n");
-#endif
-#ifdef HAVE_UNSIGNED_CHAR
-  fprintf(outfile, "#define HAVE_UNSIGNED_CHAR\n");
-#else
-  fprintf(outfile, "#undef HAVE_UNSIGNED_CHAR\n");
-#endif
-#ifdef HAVE_UNSIGNED_SHORT
-  fprintf(outfile, "#define HAVE_UNSIGNED_SHORT\n");
-#else
-  fprintf(outfile, "#undef HAVE_UNSIGNED_SHORT\n");
-#endif
-#ifdef HAVE_VOID
-  fprintf(outfile, "/* #define void char */\n");
-#else
-  fprintf(outfile, "#define void char\n");
-#endif
-#ifdef HAVE_CONST
-  fprintf(outfile, "/* #define const */\n");
-#else
-  fprintf(outfile, "#define const\n");
-#endif
-  if (is_char_signed((int) signed_char_check))
-    fprintf(outfile, "#undef CHAR_IS_UNSIGNED\n");
-  else
-    fprintf(outfile, "#define CHAR_IS_UNSIGNED\n");
-#ifdef HAVE_STDDEF_H
-  fprintf(outfile, "#define HAVE_STDDEF_H\n");
-#else
-  fprintf(outfile, "#undef HAVE_STDDEF_H\n");
-#endif
-#ifdef HAVE_STDLIB_H
-  fprintf(outfile, "#define HAVE_STDLIB_H\n");
-#else
-  fprintf(outfile, "#undef HAVE_STDLIB_H\n");
-#endif
-#ifdef NEED_BSD_STRINGS
-  fprintf(outfile, "#define NEED_BSD_STRINGS\n");
-#else
-  fprintf(outfile, "#undef NEED_BSD_STRINGS\n");
-#endif
-#ifdef NEED_SYS_TYPES_H
-  fprintf(outfile, "#define NEED_SYS_TYPES_H\n");
-#else
-  fprintf(outfile, "#undef NEED_SYS_TYPES_H\n");
-#endif
-  fprintf(outfile, "#undef NEED_FAR_POINTERS\n");
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-  fprintf(outfile, "#define NEED_SHORT_EXTERNAL_NAMES\n");
-#else
-  fprintf(outfile, "#undef NEED_SHORT_EXTERNAL_NAMES\n");
-#endif
-#ifdef INCOMPLETE_TYPES_BROKEN
-  fprintf(outfile, "#define INCOMPLETE_TYPES_BROKEN\n");
-#else
-  fprintf(outfile, "#undef INCOMPLETE_TYPES_BROKEN\n");
-#endif
-  fprintf(outfile, "\n#ifdef JPEG_INTERNALS\n\n");
-  if (is_shifting_signed(-0x7F7E80B1L))
-    fprintf(outfile, "#undef RIGHT_SHIFT_IS_UNSIGNED\n");
-  else
-    fprintf(outfile, "#define RIGHT_SHIFT_IS_UNSIGNED\n");
-  fprintf(outfile, "\n#endif /* JPEG_INTERNALS */\n");
-  fprintf(outfile, "\n#ifdef JPEG_CJPEG_DJPEG\n\n");
-  fprintf(outfile, "#define BMP_SUPPORTED		/* BMP image file format */\n");
-  fprintf(outfile, "#define GIF_SUPPORTED		/* GIF image file format */\n");
-  fprintf(outfile, "#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */\n");
-  fprintf(outfile, "#undef RLE_SUPPORTED		/* Utah RLE image file format */\n");
-  fprintf(outfile, "#define TARGA_SUPPORTED		/* Targa image file format */\n\n");
-  fprintf(outfile, "#undef TWO_FILE_COMMANDLINE	/* You may need this on non-Unix systems */\n");
-  fprintf(outfile, "#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */\n");
-  fprintf(outfile, "#undef DONT_USE_B_MODE\n");
-  fprintf(outfile, "/* #define PROGRESS_REPORT */	/* optional */\n");
-  fprintf(outfile, "\n#endif /* JPEG_CJPEG_DJPEG */\n");
-
-  /* Close the jconfig.h file */
-  fclose(outfile);
-
-  /* User report */
-  printf("Configuration check for Independent JPEG Group's software done.\n");
-  printf("\nI have written the jconfig.h file for you.\n\n");
-#ifdef HAVE_PROTOTYPES
-  printf("You should use makefile.ansi as the starting point for your Makefile.\n");
-#else
-  printf("You should use makefile.unix as the starting point for your Makefile.\n");
-#endif
-
-#ifdef NEED_SPECIAL_INCLUDE
-  printf("\nYou'll need to change jconfig.h to include the system include file\n");
-  printf("that you found type size_t in, or add a direct definition of type\n");
-  printf("size_t if that's what you used.  Just add it to the end.\n");
-#endif
-
-  return 0;
-}

diff --git a/config.guess b/config.guess
deleted file mode 100755
index 413ed41..0000000
--- a/config.guess
+++ /dev/null

@@ -1,883 +0,0 @@
-#! /bin/sh
-# Attempt to guess a canonical system name.
-#   Copyright (C) 1992, 93, 94, 95, 96, 1997 Free Software Foundation, Inc.
-#
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Written by Per Bothner <bothner@cygnus.com>.
-# The master version of this file is at the FSF in /home/gd/gnu/lib.
-#
-# This script attempts to guess a canonical system name similar to
-# config.sub.  If it succeeds, it prints the system name on stdout, and
-# exits with 0.  Otherwise, it exits with 1.
-#
-# The plan is that this can be called by configure scripts if you
-# don't specify an explicit system type (host/target name).
-#
-# Only a few systems have been added to this list; please add others
-# (but try to keep the structure clean).
-#
-
-# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
-# (ghazi@noc.rutgers.edu 8/24/94.)
-if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
-	PATH=$PATH:/.attbin ; export PATH
-fi
-
-UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
-UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
-UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
-UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
-
-trap 'rm -f dummy.c dummy.o dummy; exit 1' 1 2 15
-
-# Note: order is significant - the case branches are not exclusive.
-
-case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
-    alpha:OSF1:*:*)
-	if test $UNAME_RELEASE = "V4.0"; then
-		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
-	fi
-	# A Vn.n version is a released version.
-	# A Tn.n version is a released field test version.
-	# A Xn.n version is an unreleased experimental baselevel.
-	# 1.2 uses "1.2" for uname -r.
-	cat <<EOF >dummy.s
-	.globl main
-	.ent main
-main:
-	.frame \$30,0,\$26,0
-	.prologue 0
-	.long 0x47e03d80 # implver $0
-	lda \$2,259
-	.long 0x47e20c21 # amask $2,$1
-	srl \$1,8,\$2
-	sll \$2,2,\$2
-	sll \$0,3,\$0
-	addl \$1,\$0,\$0
-	addl \$2,\$0,\$0
-	ret \$31,(\$26),1
-	.end main
-EOF
-	${CC-cc} dummy.s -o dummy 2>/dev/null
-	if test "$?" = 0 ; then
-		./dummy
-		case "$?" in
-			7)
-				UNAME_MACHINE="alpha"
-				;;
-			15)
-				UNAME_MACHINE="alphaev5"
-				;;
-			14)
-				UNAME_MACHINE="alphaev56"
-				;;
-			10)
-				UNAME_MACHINE="alphapca56"
-				;;
-			16)
-				UNAME_MACHINE="alphaev6"
-				;;
-		esac
-	fi
-	rm -f dummy.s dummy
-	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr [[A-Z]] [[a-z]]`
-	exit 0 ;;
-    21064:Windows_NT:50:3)
-	echo alpha-dec-winnt3.5
-	exit 0 ;;
-    Amiga*:UNIX_System_V:4.0:*)
-	echo m68k-cbm-sysv4
-	exit 0;;
-    amiga:NetBSD:*:*)
-      echo m68k-cbm-netbsd${UNAME_RELEASE}
-      exit 0 ;;
-    amiga:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    arc64:OpenBSD:*:*)
-	echo mips64el-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    arc:OpenBSD:*:*)
-	echo mipsel-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    hkmips:OpenBSD:*:*)
-	echo mips-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    pmax:OpenBSD:*:*)
-	echo mipsel-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    sgi:OpenBSD:*:*)
-	echo mips-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    wgrisc:OpenBSD:*:*)
-	echo mipsel-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
-	echo arm-acorn-riscix${UNAME_RELEASE}
-	exit 0;;
-    arm32:NetBSD:*:*)
-	echo arm-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-	exit 0 ;;
-    SR2?01:HI-UX/MPP:*:*)
-	echo hppa1.1-hitachi-hiuxmpp
-	exit 0;;
-    Pyramid*:OSx*:*:*|MIS*:OSx*:*:*)
-	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
-	if test "`(/bin/universe) 2>/dev/null`" = att ; then
-		echo pyramid-pyramid-sysv3
-	else
-		echo pyramid-pyramid-bsd
-	fi
-	exit 0 ;;
-    NILE:*:*:dcosx)
-	echo pyramid-pyramid-svr4
-	exit 0 ;;
-    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
-	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
-    i86pc:SunOS:5.*:*)
-	echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
-    sun4*:SunOS:6*:*)
-	# According to config.sub, this is the proper way to canonicalize
-	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
-	# it's likely to be more like Solaris than SunOS4.
-	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
-    sun4*:SunOS:*:*)
-	case "`/usr/bin/arch -k`" in
-	    Series*|S4*)
-		UNAME_RELEASE=`uname -v`
-		;;
-	esac
-	# Japanese Language versions have a version number like `4.1.3-JL'.
-	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
-	exit 0 ;;
-    sun3*:SunOS:*:*)
-	echo m68k-sun-sunos${UNAME_RELEASE}
-	exit 0 ;;
-    sun*:*:4.2BSD:*)
-	UNAME_RELEASE=`(head -1 /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
-	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
-	case "`/bin/arch`" in
-	    sun3)
-		echo m68k-sun-sunos${UNAME_RELEASE}
-		;;
-	    sun4)
-		echo sparc-sun-sunos${UNAME_RELEASE}
-		;;
-	esac
-	exit 0 ;;
-    aushp:SunOS:*:*)
-	echo sparc-auspex-sunos${UNAME_RELEASE}
-	exit 0 ;;
-    atari*:NetBSD:*:*)
-	echo m68k-atari-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    atari*:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    sun3*:NetBSD:*:*)
-	echo m68k-sun-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    sun3*:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mac68k:NetBSD:*:*)
-	echo m68k-apple-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mac68k:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mvme68k:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mvme88k:OpenBSD:*:*)
-	echo m88k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    powerpc:machten:*:*)
-	echo powerpc-apple-machten${UNAME_RELEASE}
-	exit 0 ;;
-    RISC*:Mach:*:*)
-	echo mips-dec-mach_bsd4.3
-	exit 0 ;;
-    RISC*:ULTRIX:*:*)
-	echo mips-dec-ultrix${UNAME_RELEASE}
-	exit 0 ;;
-    VAX*:ULTRIX*:*:*)
-	echo vax-dec-ultrix${UNAME_RELEASE}
-	exit 0 ;;
-    2020:CLIX:*:*)
-	echo clipper-intergraph-clix${UNAME_RELEASE}
-	exit 0 ;;
-    mips:*:*:UMIPS | mips:*:*:RISCos)
-	sed 's/^	//' << EOF >dummy.c
-	int main (argc, argv) int argc; char **argv; {
-	#if defined (host_mips) && defined (MIPSEB)
-	#if defined (SYSTYPE_SYSV)
-	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_SVR4)
-	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
-	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
-	#endif
-	#endif
-	  exit (-1);
-	}
-EOF
-	${CC-cc} dummy.c -o dummy \
-	  && ./dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \
-	  && rm dummy.c dummy && exit 0
-	rm -f dummy.c dummy
-	echo mips-mips-riscos${UNAME_RELEASE}
-	exit 0 ;;
-    Night_Hawk:Power_UNIX:*:*)
-	echo powerpc-harris-powerunix
-	exit 0 ;;
-    m88k:CX/UX:7*:*)
-	echo m88k-harris-cxux7
-	exit 0 ;;
-    m88k:*:4*:R4*)
-	echo m88k-motorola-sysv4
-	exit 0 ;;
-    m88k:*:3*:R3*)
-	echo m88k-motorola-sysv3
-	exit 0 ;;
-    AViiON:dgux:*:*)
-        # DG/UX returns AViiON for all architectures
-        UNAME_PROCESSOR=`/usr/bin/uname -p`
-        if [ $UNAME_PROCESSOR = mc88100 -o $UNAME_PROCESSOR = mc88110 ] ; then
-	if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx \
-	     -o ${TARGET_BINARY_INTERFACE}x = x ] ; then
-		echo m88k-dg-dgux${UNAME_RELEASE}
-	else
-		echo m88k-dg-dguxbcs${UNAME_RELEASE}
-	fi
-        else echo i586-dg-dgux${UNAME_RELEASE}
-        fi
- 	exit 0 ;;
-    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
-	echo m88k-dolphin-sysv3
-	exit 0 ;;
-    M88*:*:R3*:*)
-	# Delta 88k system running SVR3
-	echo m88k-motorola-sysv3
-	exit 0 ;;
-    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
-	echo m88k-tektronix-sysv3
-	exit 0 ;;
-    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
-	echo m68k-tektronix-bsd
-	exit 0 ;;
-    *:IRIX*:*:*)
-	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
-	exit 0 ;;
-    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
-	echo romp-ibm-aix      # uname -m gives an 8 hex-code CPU id
-	exit 0 ;;              # Note that: echo "'`uname -s`'" gives 'AIX '
-    i?86:AIX:*:*)
-	echo i386-ibm-aix
-	exit 0 ;;
-    *:AIX:2:3)
-	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
-		sed 's/^		//' << EOF >dummy.c
-		#include <sys/systemcfg.h>
-
-		main()
-			{
-			if (!__power_pc())
-				exit(1);
-			puts("powerpc-ibm-aix3.2.5");
-			exit(0);
-			}
-EOF
-		${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0
-		rm -f dummy.c dummy
-		echo rs6000-ibm-aix3.2.5
-	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
-		echo rs6000-ibm-aix3.2.4
-	else
-		echo rs6000-ibm-aix3.2
-	fi
-	exit 0 ;;
-    *:AIX:*:4)
-	if /usr/sbin/lsattr -EHl proc0 | grep POWER >/dev/null 2>&1; then
-		IBM_ARCH=rs6000
-	else
-		IBM_ARCH=powerpc
-	fi
-	if [ -x /usr/bin/oslevel ] ; then
-		IBM_REV=`/usr/bin/oslevel`
-	else
-		IBM_REV=4.${UNAME_RELEASE}
-	fi
-	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
-	exit 0 ;;
-    *:AIX:*:*)
-	echo rs6000-ibm-aix
-	exit 0 ;;
-    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
-	echo romp-ibm-bsd4.4
-	exit 0 ;;
-    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC NetBSD and
-	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
-	exit 0 ;;                           # report: romp-ibm BSD 4.3
-    *:BOSX:*:*)
-	echo rs6000-bull-bosx
-	exit 0 ;;
-    DPX/2?00:B.O.S.:*:*)
-	echo m68k-bull-sysv3
-	exit 0 ;;
-    9000/[34]??:4.3bsd:1.*:*)
-	echo m68k-hp-bsd
-	exit 0 ;;
-    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
-	echo m68k-hp-bsd4.4
-	exit 0 ;;
-    9000/[3478]??:HP-UX:*:*)
-	case "${UNAME_MACHINE}" in
-	    9000/31? )            HP_ARCH=m68000 ;;
-	    9000/[34]?? )         HP_ARCH=m68k ;;
-	    9000/7?? | 9000/8?[1679] ) HP_ARCH=hppa1.1 ;;
-	    9000/8?? )            HP_ARCH=hppa1.0 ;;
-	esac
-	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
-	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
-	exit 0 ;;
-    3050*:HI-UX:*:*)
-	sed 's/^	//' << EOF >dummy.c
-	#include <unistd.h>
-	int
-	main ()
-	{
-	  long cpu = sysconf (_SC_CPU_VERSION);
-	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
-	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
-	     results, however.  */
-	  if (CPU_IS_PA_RISC (cpu))
-	    {
-	      switch (cpu)
-		{
-		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
-		  default: puts ("hppa-hitachi-hiuxwe2"); break;
-		}
-	    }
-	  else if (CPU_IS_HP_MC68K (cpu))
-	    puts ("m68k-hitachi-hiuxwe2");
-	  else puts ("unknown-hitachi-hiuxwe2");
-	  exit (0);
-	}
-EOF
-	${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0
-	rm -f dummy.c dummy
-	echo unknown-hitachi-hiuxwe2
-	exit 0 ;;
-    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
-	echo hppa1.1-hp-bsd
-	exit 0 ;;
-    9000/8??:4.3bsd:*:*)
-	echo hppa1.0-hp-bsd
-	exit 0 ;;
-    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
-	echo hppa1.1-hp-osf
-	exit 0 ;;
-    hp8??:OSF1:*:*)
-	echo hppa1.0-hp-osf
-	exit 0 ;;
-    i?86:OSF1:*:*)
-	if [ -x /usr/sbin/sysversion ] ; then
-	    echo ${UNAME_MACHINE}-unknown-osf1mk
-	else
-	    echo ${UNAME_MACHINE}-unknown-osf1
-	fi
-	exit 0 ;;
-    parisc*:Lites*:*:*)
-	echo hppa1.1-hp-lites
-	exit 0 ;;
-    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
-	echo c1-convex-bsd
-        exit 0 ;;
-    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-        exit 0 ;;
-    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
-	echo c34-convex-bsd
-        exit 0 ;;
-    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
-	echo c38-convex-bsd
-        exit 0 ;;
-    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
-	echo c4-convex-bsd
-        exit 0 ;;
-    CRAY*X-MP:*:*:*)
-	echo xmp-cray-unicos
-        exit 0 ;;
-    CRAY*Y-MP:*:*:*)
-	echo ymp-cray-unicos${UNAME_RELEASE}
-	exit 0 ;;
-    CRAY*[A-Z]90:*:*:*)
-	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
-	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
-	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/
-	exit 0 ;;
-    CRAY*TS:*:*:*)
-	echo t90-cray-unicos${UNAME_RELEASE}
-	exit 0 ;;
-    CRAY-2:*:*:*)
-	echo cray2-cray-unicos
-        exit 0 ;;
-    F300:UNIX_System_V:*:*)
-        FUJITSU_SYS=`uname -p | tr [A-Z] [a-z] | sed -e 's/\///'`
-        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
-        echo "f300-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-        exit 0 ;;
-    F301:UNIX_System_V:*:*)
-       echo f301-fujitsu-uxpv`echo $UNAME_RELEASE | sed 's/ .*//'`
-       exit 0 ;;
-    hp3[0-9][05]:NetBSD:*:*)
-	echo m68k-hp-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    hp300:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    i?86:BSD/386:*:* | *:BSD/OS:*:*)
-	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
-	exit 0 ;;
-    *:FreeBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
-	exit 0 ;;
-    *:NetBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-	exit 0 ;;
-    *:OpenBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-	exit 0 ;;
-    i*:CYGWIN*:*)
-	echo i386-pc-cygwin32
-	exit 0 ;;
-    i*:MINGW*:*)
-	echo i386-pc-mingw32
-	exit 0 ;;
-    p*:CYGWIN*:*)
-	echo powerpcle-unknown-cygwin32
-	exit 0 ;;
-    prep*:SunOS:5.*:*)
-	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
-    *:GNU:*:*)
-	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
-	exit 0 ;;
-    *:Linux:*:*)
-	# The BFD linker knows what the default object file format is, so
-	# first see if it will tell us.
-	ld_help_string=`ld --help 2>&1`
-	ld_supported_emulations=`echo $ld_help_string \
-			 | sed -ne '/supported emulations:/!d
-				    s/[ 	][ 	]*/ /g
-				    s/.*supported emulations: *//
-				    s/ .*//
-				    p'`
-        case "$ld_supported_emulations" in
-	  i?86linux)  echo "${UNAME_MACHINE}-pc-linux-gnuaout"      ; exit 0 ;;
-	  i?86coff)   echo "${UNAME_MACHINE}-pc-linux-gnucoff"      ; exit 0 ;;
-	  sparclinux) echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;;
-	  m68klinux)  echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;;
-	  elf32ppc)   echo "powerpc-unknown-linux-gnu"              ; exit 0 ;;
-	esac
-
-	if test "${UNAME_MACHINE}" = "alpha" ; then
-		sed 's/^	//'  <<EOF >dummy.s
-		.globl main
-		.ent main
-	main:
-		.frame \$30,0,\$26,0
-		.prologue 0
-		.long 0x47e03d80 # implver $0
-		lda \$2,259
-		.long 0x47e20c21 # amask $2,$1
-		srl \$1,8,\$2
-		sll \$2,2,\$2
-		sll \$0,3,\$0
-		addl \$1,\$0,\$0
-		addl \$2,\$0,\$0
-		ret \$31,(\$26),1
-		.end main
-EOF
-		LIBC=""
-		${CC-cc} dummy.s -o dummy 2>/dev/null
-		if test "$?" = 0 ; then
-			./dummy
-			case "$?" in
-			7)
-				UNAME_MACHINE="alpha"
-				;;
-			15)
-				UNAME_MACHINE="alphaev5"
-				;;
-			14)
-				UNAME_MACHINE="alphaev56"
-				;;
-			10)
-				UNAME_MACHINE="alphapca56"
-				;;
-			16)
-				UNAME_MACHINE="alphaev6"
-				;;
-			esac	
-
-			objdump --private-headers dummy | \
-			  grep ld.so.1 > /dev/null
-			if test "$?" = 0 ; then
-				LIBC="libc1"
-			fi
-		fi	
-		rm -f dummy.s dummy
-		echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} ; exit 0
-	elif test "${UNAME_MACHINE}" = "mips" ; then
-	  cat >dummy.c <<EOF
-main(argc, argv)
-     int argc;
-     char *argv[];
-{
-#ifdef __MIPSEB__
-  printf ("%s-unknown-linux-gnu\n", argv[1]);
-#endif
-#ifdef __MIPSEL__
-  printf ("%sel-unknown-linux-gnu\n", argv[1]);
-#endif
-  return 0;
-}
-EOF
-	  ${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0
-	  rm -f dummy.c dummy
-	else
-	  # Either a pre-BFD a.out linker (linux-gnuoldld)
-	  # or one that does not give us useful --help.
-	  # GCC wants to distinguish between linux-gnuoldld and linux-gnuaout.
-	  # If ld does not provide *any* "supported emulations:"
-	  # that means it is gnuoldld.
-	  echo "$ld_help_string" | grep >/dev/null 2>&1 "supported emulations:"
-	  test $? != 0 && echo "${UNAME_MACHINE}-pc-linux-gnuoldld" && exit 0
-
-	  case "${UNAME_MACHINE}" in
-	  i?86)
-	    VENDOR=pc;
-	    ;;
-	  *)
-	    VENDOR=unknown;
-	    ;;
-	  esac
-	  # Determine whether the default compiler is a.out or elf
-	  cat >dummy.c <<EOF
-#include <features.h>
-main(argc, argv)
-     int argc;
-     char *argv[];
-{
-#ifdef __ELF__
-# ifdef __GLIBC__
-#  if __GLIBC__ >= 2
-    printf ("%s-${VENDOR}-linux-gnu\n", argv[1]);
-#  else
-    printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
-#  endif
-# else
-   printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
-# endif
-#else
-  printf ("%s-${VENDOR}-linux-gnuaout\n", argv[1]);
-#endif
-  return 0;
-}
-EOF
-	  ${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0
-	  rm -f dummy.c dummy
-	fi ;;
-# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.  earlier versions
-# are messed up and put the nodename in both sysname and nodename.
-    i?86:DYNIX/ptx:4*:*)
-	echo i386-sequent-sysv4
-	exit 0 ;;
-    i?86:UNIX_SV:4.2MP:2.*)
-        # Unixware is an offshoot of SVR4, but it has its own version
-        # number series starting with 2...
-        # I am not positive that other SVR4 systems won't match this,
-	# I just have to hope.  -- rms.
-        # Use sysv4.2uw... so that sysv4* matches it.
-	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
-	exit 0 ;;
-    i?86:*:4.*:* | i?86:SYSTEM_V:4.*:*)
-	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
-		echo ${UNAME_MACHINE}-univel-sysv${UNAME_RELEASE}
-	else
-		echo ${UNAME_MACHINE}-pc-sysv${UNAME_RELEASE}
-	fi
-	exit 0 ;;
-    i?86:*:3.2:*)
-	if test -f /usr/options/cb.name; then
-		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
-		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
-	elif /bin/uname -X 2>/dev/null >/dev/null ; then
-		UNAME_REL=`(/bin/uname -X|egrep Release|sed -e 's/.*= //')`
-		(/bin/uname -X|egrep i80486 >/dev/null) && UNAME_MACHINE=i486
-		(/bin/uname -X|egrep '^Machine.*Pentium' >/dev/null) \
-			&& UNAME_MACHINE=i586
-		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
-	else
-		echo ${UNAME_MACHINE}-pc-sysv32
-	fi
-	exit 0 ;;
-    pc:*:*:*)
-        # uname -m prints for DJGPP always 'pc', but it prints nothing about
-        # the processor, so we play safe by assuming i386.
-	echo i386-pc-msdosdjgpp
-        exit 0 ;;
-    Intel:Mach:3*:*)
-	echo i386-pc-mach3
-	exit 0 ;;
-    paragon:*:*:*)
-	echo i860-intel-osf1
-	exit 0 ;;
-    i860:*:4.*:*) # i860-SVR4
-	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
-	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
-	else # Add other i860-SVR4 vendors below as they are discovered.
-	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
-	fi
-	exit 0 ;;
-    mini*:CTIX:SYS*5:*)
-	# "miniframe"
-	echo m68010-convergent-sysv
-	exit 0 ;;
-    M68*:*:R3V[567]*:*)
-	test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;;
-    3[34]??:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 4850:*:4.0:3.0)
-	OS_REL=''
-	test -r /etc/.relid \
-	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && echo i486-ncr-sysv4.3${OS_REL} && exit 0
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	  && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;;
-    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
-        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-          && echo i486-ncr-sysv4 && exit 0 ;;
-    m68*:LynxOS:2.*:*)
-	echo m68k-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
-    mc68030:UNIX_System_V:4.*:*)
-	echo m68k-atari-sysv4
-	exit 0 ;;
-    i?86:LynxOS:2.*:*)
-	echo i386-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
-    TSUNAMI:LynxOS:2.*:*)
-	echo sparc-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
-    rs6000:LynxOS:2.*:* | PowerPC:LynxOS:2.*:*)
-	echo rs6000-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
-    SM[BE]S:UNIX_SV:*:*)
-	echo mips-dde-sysv${UNAME_RELEASE}
-	exit 0 ;;
-    RM*:SINIX-*:*:*)
-	echo mips-sni-sysv4
-	exit 0 ;;
-    *:SINIX-*:*:*)
-	if uname -p 2>/dev/null >/dev/null ; then
-		UNAME_MACHINE=`(uname -p) 2>/dev/null`
-		echo ${UNAME_MACHINE}-sni-sysv4
-	else
-		echo ns32k-sni-sysv
-	fi
-	exit 0 ;;
-    PENTIUM:CPunix:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
-                           # says <Richard.M.Bartel@ccMail.Census.GOV>
-        echo i586-unisys-sysv4
-        exit 0 ;;
-    *:UNIX_System_V:4*:FTX*)
-	# From Gerald Hewes <hewes@openmarket.com>.
-	# How about differentiating between stratus architectures? -djm
-	echo hppa1.1-stratus-sysv4
-	exit 0 ;;
-    *:*:*:FTX*)
-	# From seanf@swdc.stratus.com.
-	echo i860-stratus-sysv4
-	exit 0 ;;
-    mc68*:A/UX:*:*)
-	echo m68k-apple-aux${UNAME_RELEASE}
-	exit 0 ;;
-    news*:NEWS-OS:*:6*)
-	echo mips-sony-newsos6
-	exit 0 ;;
-    R3000:*System_V*:*:* | R4000:UNIX_SYSV:*:*)
-	if [ -d /usr/nec ]; then
-	        echo mips-nec-sysv${UNAME_RELEASE}
-	else
-	        echo mips-unknown-sysv${UNAME_RELEASE}
-	fi
-        exit 0 ;;
-esac
-
-#echo '(No uname command or uname output not recognized.)' 1>&2
-#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
-
-cat >dummy.c <<EOF
-#ifdef _SEQUENT_
-# include <sys/types.h>
-# include <sys/utsname.h>
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
-  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
-     I don't know....  */
-  printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
-  printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
-          "4"
-#else
-	  ""
-#endif
-         ); exit (0);
-#endif
-#endif
-
-#if defined (__arm) && defined (__acorn) && defined (__unix)
-  printf ("arm-acorn-riscix"); exit (0);
-#endif
-
-#if defined (hp300) && !defined (hpux)
-  printf ("m68k-hp-bsd\n"); exit (0);
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
-  int version;
-  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
-  printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
-  exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
-  printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
-  printf ("ns32k-encore-mach\n"); exit (0);
-#else
-  printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
-  printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
-  printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
-  printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
-    struct utsname un;
-
-    uname(&un);
-
-    if (strncmp(un.version, "V2", 2) == 0) {
-	printf ("i386-sequent-ptx2\n"); exit (0);
-    }
-    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
-	printf ("i386-sequent-ptx1\n"); exit (0);
-    }
-    printf ("i386-sequent-ptx\n"); exit (0);
-
-#endif
-
-#if defined (vax)
-#if !defined (ultrix)
-  printf ("vax-dec-bsd\n"); exit (0);
-#else
-  printf ("vax-dec-ultrix\n"); exit (0);
-#endif
-#endif
-
-#if defined (alliant) && defined (i860)
-  printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
-  exit (1);
-}
-EOF
-
-${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy && rm dummy.c dummy && exit 0
-rm -f dummy.c dummy
-
-# Apollos put the system type in the environment.
-
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; }
-
-# Convex versions that predate uname can use getsysinfo(1)
-
-if [ -x /usr/convex/getsysinfo ]
-then
-    case `getsysinfo -f cpu_type` in
-    c1*)
-	echo c1-convex-bsd
-	exit 0 ;;
-    c2*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit 0 ;;
-    c34*)
-	echo c34-convex-bsd
-	exit 0 ;;
-    c38*)
-	echo c38-convex-bsd
-	exit 0 ;;
-    c4*)
-	echo c4-convex-bsd
-	exit 0 ;;
-    esac
-fi
-
-#echo '(Unable to guess system type)' 1>&2
-
-exit 1

diff --git a/config.sub b/config.sub
deleted file mode 100755
index 213a6d4..0000000
--- a/config.sub
+++ /dev/null

@@ -1,954 +0,0 @@
-#! /bin/sh
-# Configuration validation subroutine script, version 1.1.
-#   Copyright (C) 1991, 92, 93, 94, 95, 96, 1997 Free Software Foundation, Inc.
-# This file is (in principle) common to ALL GNU software.
-# The presence of a machine in this file suggests that SOME GNU software
-# can handle that machine.  It does not imply ALL GNU software can.
-#
-# This file is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330,
-# Boston, MA 02111-1307, USA.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Configuration subroutine to validate and canonicalize a configuration type.
-# Supply the specified configuration type as an argument.
-# If it is invalid, we print an error message on stderr and exit with code 1.
-# Otherwise, we print the canonical config type on stdout and succeed.
-
-# This file is supposed to be the same for all GNU packages
-# and recognize all the CPU types, system types and aliases
-# that are meaningful with *any* GNU software.
-# Each package is responsible for reporting which valid configurations
-# it does not support.  The user should be able to distinguish
-# a failure to support a valid configuration from a meaningless
-# configuration.
-
-# The goal of this file is to map all the various variations of a given
-# machine specification into a single specification in the form:
-#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
-# or in some cases, the newer four-part form:
-#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
-# It is wrong to echo any other type of specification.
-
-if [ x$1 = x ]
-then
-	echo Configuration name missing. 1>&2
-	echo "Usage: $0 CPU-MFR-OPSYS" 1>&2
-	echo "or     $0 ALIAS" 1>&2
-	echo where ALIAS is a recognized configuration type. 1>&2
-	exit 1
-fi
-
-# First pass through any local machine types.
-case $1 in
-	*local*)
-		echo $1
-		exit 0
-		;;
-	*)
-	;;
-esac
-
-# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
-# Here we must recognize all the valid KERNEL-OS combinations.
-maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
-case $maybe_os in
-  linux-gnu*)
-    os=-$maybe_os
-    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
-    ;;
-  *)
-    basic_machine=`echo $1 | sed 's/-[^-]*$//'`
-    if [ $basic_machine != $1 ]
-    then os=`echo $1 | sed 's/.*-/-/'`
-    else os=; fi
-    ;;
-esac
-
-### Let's recognize common machines as not being operating systems so
-### that things like config.sub decstation-3100 work.  We also
-### recognize some manufacturers as not being operating systems, so we
-### can provide default operating systems below.
-case $os in
-	-sun*os*)
-		# Prevent following clause from handling this invalid input.
-		;;
-	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
-	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
-	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
-	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple)
-		os=
-		basic_machine=$1
-		;;
-	-hiux*)
-		os=-hiuxwe2
-		;;
-	-sco5)
-		os=sco3.2v5
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco4)
-		os=-sco3.2v4
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco3.2.[4-9]*)
-		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco3.2v[4-9]*)
-		# Don't forget version if it is 3.2v4 or newer.
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-sco*)
-		os=-sco3.2v2
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-isc)
-		os=-isc2.2
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-clix*)
-		basic_machine=clipper-intergraph
-		;;
-	-isc*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-		;;
-	-lynx*)
-		os=-lynxos
-		;;
-	-ptx*)
-		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
-		;;
-	-windowsnt*)
-		os=`echo $os | sed -e 's/windowsnt/winnt/'`
-		;;
-	-psos*)
-		os=-psos
-		;;
-esac
-
-# Decode aliases for certain CPU-COMPANY combinations.
-case $basic_machine in
-	# Recognize the basic CPU types without company name.
-	# Some are omitted here because they have special meanings below.
-	tahoe | i860 | m32r | m68k | m68000 | m88k | ns32k | arc | arm \
-		| arme[lb] | pyramid | mn10200 | mn10300 \
-		| tron | a29k | 580 | i960 | h8300 | hppa | hppa1.0 | hppa1.1 \
-		| alpha | alphaev5 | alphaev56 | we32k | ns16k | clipper \
-		| i370 | sh | powerpc | powerpcle | 1750a | dsp16xx | pdp11 \
-		| mips64 | mipsel | mips64el | mips64orion | mips64orionel \
-		| mipstx39 | mipstx39el \
-		| sparc | sparclet | sparclite | sparc64 | v850)
-		basic_machine=$basic_machine-unknown
-		;;
-	# We use `pc' rather than `unknown'
-	# because (1) that's what they normally are, and
-	# (2) the word "unknown" tends to confuse beginning users.
-	i[3456]86)
-	  basic_machine=$basic_machine-pc
-	  ;;
-	# Object if more than one company name word.
-	*-*-*)
-		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-		exit 1
-		;;
-	# Recognize the basic CPU types with company name.
-	vax-* | tahoe-* | i[3456]86-* | i860-* | m32r-* | m68k-* | m68000-* \
-	      | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | arm-* | c[123]* \
-	      | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \
-	      | power-* | none-* | 580-* | cray2-* | h8300-* | i960-* \
-	      | xmp-* | ymp-* | hppa-* | hppa1.0-* | hppa1.1-* \
-	      | alpha-* | alphaev5-* | alphaev56-* | we32k-* | cydra-* \
-	      | ns16k-* | pn-* | np1-* | xps100-* | clipper-* | orion-* \
-	      | sparclite-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \
-	      | sparc64-* | mips64-* | mipsel-* \
-	      | mips64el-* | mips64orion-* | mips64orionel-*  \
-	      | mipstx39-* | mipstx39el-* \
-	      | f301-*)
-		;;
-	# Recognize the various machine names and aliases which stand
-	# for a CPU type and a company and sometimes even an OS.
-	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
-		basic_machine=m68000-att
-		;;
-	3b*)
-		basic_machine=we32k-att
-		;;
-	alliant | fx80)
-		basic_machine=fx80-alliant
-		;;
-	altos | altos3068)
-		basic_machine=m68k-altos
-		;;
-	am29k)
-		basic_machine=a29k-none
-		os=-bsd
-		;;
-	amdahl)
-		basic_machine=580-amdahl
-		os=-sysv
-		;;
-	amiga | amiga-*)
-		basic_machine=m68k-cbm
-		;;
-	amigaos | amigados)
-		basic_machine=m68k-cbm
-		os=-amigaos
-		;;
-	amigaunix | amix)
-		basic_machine=m68k-cbm
-		os=-sysv4
-		;;
-	apollo68)
-		basic_machine=m68k-apollo
-		os=-sysv
-		;;
-	aux)
-		basic_machine=m68k-apple
-		os=-aux
-		;;
-	balance)
-		basic_machine=ns32k-sequent
-		os=-dynix
-		;;
-	convex-c1)
-		basic_machine=c1-convex
-		os=-bsd
-		;;
-	convex-c2)
-		basic_machine=c2-convex
-		os=-bsd
-		;;
-	convex-c32)
-		basic_machine=c32-convex
-		os=-bsd
-		;;
-	convex-c34)
-		basic_machine=c34-convex
-		os=-bsd
-		;;
-	convex-c38)
-		basic_machine=c38-convex
-		os=-bsd
-		;;
-	cray | ymp)
-		basic_machine=ymp-cray
-		os=-unicos
-		;;
-	cray2)
-		basic_machine=cray2-cray
-		os=-unicos
-		;;
-	[ctj]90-cray)
-		basic_machine=c90-cray
-		os=-unicos
-		;;
-	crds | unos)
-		basic_machine=m68k-crds
-		;;
-	da30 | da30-*)
-		basic_machine=m68k-da30
-		;;
-	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
-		basic_machine=mips-dec
-		;;
-	delta | 3300 | motorola-3300 | motorola-delta \
-	      | 3300-motorola | delta-motorola)
-		basic_machine=m68k-motorola
-		;;
-	delta88)
-		basic_machine=m88k-motorola
-		os=-sysv3
-		;;
-	dpx20 | dpx20-*)
-		basic_machine=rs6000-bull
-		os=-bosx
-		;;
-	dpx2* | dpx2*-bull)
-		basic_machine=m68k-bull
-		os=-sysv3
-		;;
-	ebmon29k)
-		basic_machine=a29k-amd
-		os=-ebmon
-		;;
-	elxsi)
-		basic_machine=elxsi-elxsi
-		os=-bsd
-		;;
-	encore | umax | mmax)
-		basic_machine=ns32k-encore
-		;;
-	fx2800)
-		basic_machine=i860-alliant
-		;;
-	genix)
-		basic_machine=ns32k-ns
-		;;
-	gmicro)
-		basic_machine=tron-gmicro
-		os=-sysv
-		;;
-	h3050r* | hiux*)
-		basic_machine=hppa1.1-hitachi
-		os=-hiuxwe2
-		;;
-	h8300hms)
-		basic_machine=h8300-hitachi
-		os=-hms
-		;;
-	harris)
-		basic_machine=m88k-harris
-		os=-sysv3
-		;;
-	hp300-*)
-		basic_machine=m68k-hp
-		;;
-	hp300bsd)
-		basic_machine=m68k-hp
-		os=-bsd
-		;;
-	hp300hpux)
-		basic_machine=m68k-hp
-		os=-hpux
-		;;
-	hp9k2[0-9][0-9] | hp9k31[0-9])
-		basic_machine=m68000-hp
-		;;
-	hp9k3[2-9][0-9])
-		basic_machine=m68k-hp
-		;;
-	hp9k7[0-9][0-9] | hp7[0-9][0-9] | hp9k8[0-9]7 | hp8[0-9]7)
-		basic_machine=hppa1.1-hp
-		;;
-	hp9k8[0-9][0-9] | hp8[0-9][0-9])
-		basic_machine=hppa1.0-hp
-		;;
-	hppa-next)
-		os=-nextstep3
-		;;
-	i370-ibm* | ibm*)
-		basic_machine=i370-ibm
-		os=-mvs
-		;;
-# I'm not sure what "Sysv32" means.  Should this be sysv3.2?
-	i[3456]86v32)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv32
-		;;
-	i[3456]86v4*)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv4
-		;;
-	i[3456]86v)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-sysv
-		;;
-	i[3456]86sol2)
-		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-		os=-solaris2
-		;;
-	iris | iris4d)
-		basic_machine=mips-sgi
-		case $os in
-		    -irix*)
-			;;
-		    *)
-			os=-irix4
-			;;
-		esac
-		;;
-	isi68 | isi)
-		basic_machine=m68k-isi
-		os=-sysv
-		;;
-	m88k-omron*)
-		basic_machine=m88k-omron
-		;;
-	magnum | m3230)
-		basic_machine=mips-mips
-		os=-sysv
-		;;
-	merlin)
-		basic_machine=ns32k-utek
-		os=-sysv
-		;;
-	miniframe)
-		basic_machine=m68000-convergent
-		;;
-	mipsel*-linux*)
-		basic_machine=mipsel-unknown
-		os=-linux-gnu
-		;;
-	mips*-linux*)
-		basic_machine=mips-unknown
-		os=-linux-gnu
-		;;
-	mips3*-*)
-		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
-		;;
-	mips3*)
-		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
-		;;
-	ncr3000)
-		basic_machine=i486-ncr
-		os=-sysv4
-		;;
-	news | news700 | news800 | news900)
-		basic_machine=m68k-sony
-		os=-newsos
-		;;
-	news1000)
-		basic_machine=m68030-sony
-		os=-newsos
-		;;
-	news-3600 | risc-news)
-		basic_machine=mips-sony
-		os=-newsos
-		;;
-	next | m*-next )
-		basic_machine=m68k-next
-		case $os in
-		    -nextstep* )
-			;;
-		    -ns2*)
-		      os=-nextstep2
-			;;
-		    *)
-		      os=-nextstep3
-			;;
-		esac
-		;;
-	nh3000)
-		basic_machine=m68k-harris
-		os=-cxux
-		;;
-	nh[45]000)
-		basic_machine=m88k-harris
-		os=-cxux
-		;;
-	nindy960)
-		basic_machine=i960-intel
-		os=-nindy
-		;;
-	np1)
-		basic_machine=np1-gould
-		;;
-	pa-hitachi)
-		basic_machine=hppa1.1-hitachi
-		os=-hiuxwe2
-		;;
-	paragon)
-		basic_machine=i860-intel
-		os=-osf
-		;;
-	pbd)
-		basic_machine=sparc-tti
-		;;
-	pbb)
-		basic_machine=m68k-tti
-		;;
-        pc532 | pc532-*)
-		basic_machine=ns32k-pc532
-		;;
-	pentium | p5)
-		basic_machine=i586-intel
-		;;
-	pentiumpro | p6)
-		basic_machine=i686-intel
-		;;
-	pentium-* | p5-*)
-		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	pentiumpro-* | p6-*)
-		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	k5)
-		# We don't have specific support for AMD's K5 yet, so just call it a Pentium
-		basic_machine=i586-amd
-		;;
-	nexen)
-		# We don't have specific support for Nexgen yet, so just call it a Pentium
-		basic_machine=i586-nexgen
-		;;
-	pn)
-		basic_machine=pn-gould
-		;;
-	power)	basic_machine=rs6000-ibm
-		;;
-	ppc)	basic_machine=powerpc-unknown
-	        ;;
-	ppc-*)	basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ppcle | powerpclittle | ppc-le | powerpc-little)
-		basic_machine=powerpcle-unknown
-	        ;;
-	ppcle-* | powerpclittle-*)
-		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
-		;;
-	ps2)
-		basic_machine=i386-ibm
-		;;
-	rm[46]00)
-		basic_machine=mips-siemens
-		;;
-	rtpc | rtpc-*)
-		basic_machine=romp-ibm
-		;;
-	sequent)
-		basic_machine=i386-sequent
-		;;
-	sh)
-		basic_machine=sh-hitachi
-		os=-hms
-		;;
-	sps7)
-		basic_machine=m68k-bull
-		os=-sysv2
-		;;
-	spur)
-		basic_machine=spur-unknown
-		;;
-	sun2)
-		basic_machine=m68000-sun
-		;;
-	sun2os3)
-		basic_machine=m68000-sun
-		os=-sunos3
-		;;
-	sun2os4)
-		basic_machine=m68000-sun
-		os=-sunos4
-		;;
-	sun3os3)
-		basic_machine=m68k-sun
-		os=-sunos3
-		;;
-	sun3os4)
-		basic_machine=m68k-sun
-		os=-sunos4
-		;;
-	sun4os3)
-		basic_machine=sparc-sun
-		os=-sunos3
-		;;
-	sun4os4)
-		basic_machine=sparc-sun
-		os=-sunos4
-		;;
-	sun4sol2)
-		basic_machine=sparc-sun
-		os=-solaris2
-		;;
-	sun3 | sun3-*)
-		basic_machine=m68k-sun
-		;;
-	sun4)
-		basic_machine=sparc-sun
-		;;
-	sun386 | sun386i | roadrunner)
-		basic_machine=i386-sun
-		;;
-	symmetry)
-		basic_machine=i386-sequent
-		os=-dynix
-		;;
-	tx39)
-		basic_machine=mipstx39-unknown
-		;;
-	tx39el)
-		basic_machine=mipstx39el-unknown
-		;;
-	tower | tower-32)
-		basic_machine=m68k-ncr
-		;;
-	udi29k)
-		basic_machine=a29k-amd
-		os=-udi
-		;;
-	ultra3)
-		basic_machine=a29k-nyu
-		os=-sym1
-		;;
-	vaxv)
-		basic_machine=vax-dec
-		os=-sysv
-		;;
-	vms)
-		basic_machine=vax-dec
-		os=-vms
-		;;
-	vpp*|vx|vx-*)
-               basic_machine=f301-fujitsu
-               ;;
-	vxworks960)
-		basic_machine=i960-wrs
-		os=-vxworks
-		;;
-	vxworks68)
-		basic_machine=m68k-wrs
-		os=-vxworks
-		;;
-	vxworks29k)
-		basic_machine=a29k-wrs
-		os=-vxworks
-		;;
-	xmp)
-		basic_machine=xmp-cray
-		os=-unicos
-		;;
-        xps | xps100)
-		basic_machine=xps100-honeywell
-		;;
-	none)
-		basic_machine=none-none
-		os=-none
-		;;
-
-# Here we handle the default manufacturer of certain CPU types.  It is in
-# some cases the only manufacturer, in others, it is the most popular.
-	mips)
-		if [ x$os = x-linux-gnu ]; then
-			basic_machine=mips-unknown
-		else
-			basic_machine=mips-mips
-		fi
-		;;
-	romp)
-		basic_machine=romp-ibm
-		;;
-	rs6000)
-		basic_machine=rs6000-ibm
-		;;
-	vax)
-		basic_machine=vax-dec
-		;;
-	pdp11)
-		basic_machine=pdp11-dec
-		;;
-	we32k)
-		basic_machine=we32k-att
-		;;
-	sparc)
-		basic_machine=sparc-sun
-		;;
-        cydra)
-		basic_machine=cydra-cydrome
-		;;
-	orion)
-		basic_machine=orion-highlevel
-		;;
-	orion105)
-		basic_machine=clipper-highlevel
-		;;
-	*)
-		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-		exit 1
-		;;
-esac
-
-# Here we canonicalize certain aliases for manufacturers.
-case $basic_machine in
-	*-digital*)
-		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
-		;;
-	*-commodore*)
-		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
-		;;
-	*)
-		;;
-esac
-
-# Decode manufacturer-specific aliases for certain operating systems.
-
-if [ x"$os" != x"" ]
-then
-case $os in
-        # First match some system type aliases
-        # that might get confused with valid system types.
-	# -solaris* is a basic system type, with this one exception.
-	-solaris1 | -solaris1.*)
-		os=`echo $os | sed -e 's|solaris1|sunos4|'`
-		;;
-	-solaris)
-		os=-solaris2
-		;;
-	-svr4*)
-		os=-sysv4
-		;;
-	-unixware*)
-		os=-sysv4.2uw
-		;;
-	-gnu/linux*)
-		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
-		;;
-	# First accept the basic system types.
-	# The portable systems comes first.
-	# Each alternative MUST END IN A *, to match a version number.
-	# -sysv* is not here because it comes later, after sysvr4.
-	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
-	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
-	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
-	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-	      | -aos* \
-	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
-	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
-	      | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
-	      | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* \
-	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
-	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
-	      | -cygwin32* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
-	      | -mingw32* | -linux-gnu* | -uxpv*)
-	# Remember, each alternative MUST END IN *, to match a version number.
-		;;
-	-linux*)
-		os=`echo $os | sed -e 's|linux|linux-gnu|'`
-		;;
-	-sunos5*)
-		os=`echo $os | sed -e 's|sunos5|solaris2|'`
-		;;
-	-sunos6*)
-		os=`echo $os | sed -e 's|sunos6|solaris3|'`
-		;;
-	-osfrose*)
-		os=-osfrose
-		;;
-	-osf*)
-		os=-osf
-		;;
-	-utek*)
-		os=-bsd
-		;;
-	-dynix*)
-		os=-bsd
-		;;
-	-acis*)
-		os=-aos
-		;;
-	-ctix* | -uts*)
-		os=-sysv
-		;;
-	-ns2 )
-	        os=-nextstep2
-		;;
-	# Preserve the version number of sinix5.
-	-sinix5.*)
-		os=`echo $os | sed -e 's|sinix|sysv|'`
-		;;
-	-sinix*)
-		os=-sysv4
-		;;
-	-triton*)
-		os=-sysv3
-		;;
-	-oss*)
-		os=-sysv3
-		;;
-	-svr4)
-		os=-sysv4
-		;;
-	-svr3)
-		os=-sysv3
-		;;
-	-sysvr4)
-		os=-sysv4
-		;;
-	# This must come after -sysvr4.
-	-sysv*)
-		;;
-	-xenix)
-		os=-xenix
-		;;
-	-none)
-		;;
-	*)
-		# Get rid of the `-' at the beginning of $os.
-		os=`echo $os | sed 's/[^-]*-//'`
-		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
-		exit 1
-		;;
-esac
-else
-
-# Here we handle the default operating systems that come with various machines.
-# The value should be what the vendor currently ships out the door with their
-# machine or put another way, the most popular os provided with the machine.
-
-# Note that if you're going to try to match "-MANUFACTURER" here (say,
-# "-sun"), then you have to tell the case statement up towards the top
-# that MANUFACTURER isn't an operating system.  Otherwise, code above
-# will signal an error saying that MANUFACTURER isn't an operating
-# system, and we'll never get to this point.
-
-case $basic_machine in
-	*-acorn)
-		os=-riscix1.2
-		;;
-	arm*-semi)
-		os=-aout
-		;;
-        pdp11-*)
-		os=-none
-		;;
-	*-dec | vax-*)
-		os=-ultrix4.2
-		;;
-	m68*-apollo)
-		os=-domain
-		;;
-	i386-sun)
-		os=-sunos4.0.2
-		;;
-	m68000-sun)
-		os=-sunos3
-		# This also exists in the configure program, but was not the
-		# default.
-		# os=-sunos4
-		;;
-	*-tti)	# must be before sparc entry or we get the wrong os.
-		os=-sysv3
-		;;
-	sparc-* | *-sun)
-		os=-sunos4.1.1
-		;;
-	*-ibm)
-		os=-aix
-		;;
-	*-hp)
-		os=-hpux
-		;;
-	*-hitachi)
-		os=-hiux
-		;;
-	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
-		os=-sysv
-		;;
-	*-cbm)
-		os=-amigaos
-		;;
-	*-dg)
-		os=-dgux
-		;;
-	*-dolphin)
-		os=-sysv3
-		;;
-	m68k-ccur)
-		os=-rtu
-		;;
-	m88k-omron*)
-		os=-luna
-		;;
-	*-next )
-		os=-nextstep
-		;;
-	*-sequent)
-		os=-ptx
-		;;
-	*-crds)
-		os=-unos
-		;;
-	*-ns)
-		os=-genix
-		;;
-	i370-*)
-		os=-mvs
-		;;
-	*-next)
-		os=-nextstep3
-		;;
-        *-gould)
-		os=-sysv
-		;;
-        *-highlevel)
-		os=-bsd
-		;;
-	*-encore)
-		os=-bsd
-		;;
-        *-sgi)
-		os=-irix
-		;;
-        *-siemens)
-		os=-sysv4
-		;;
-	*-masscomp)
-		os=-rtu
-		;;
-	f301-fujitsu)
-		os=-uxpv
-		;;
-	*)
-		os=-none
-		;;
-esac
-fi
-
-# Here we handle the case where we know the os, and the CPU type, but not the
-# manufacturer.  We pick the logical manufacturer.
-vendor=unknown
-case $basic_machine in
-	*-unknown)
-		case $os in
-			-riscix*)
-				vendor=acorn
-				;;
-			-sunos*)
-				vendor=sun
-				;;
-			-aix*)
-				vendor=ibm
-				;;
-			-hpux*)
-				vendor=hp
-				;;
-			-hiux*)
-				vendor=hitachi
-				;;
-			-unos*)
-				vendor=crds
-				;;
-			-dgux*)
-				vendor=dg
-				;;
-			-luna*)
-				vendor=omron
-				;;
-			-genix*)
-				vendor=ns
-				;;
-			-mvs*)
-				vendor=ibm
-				;;
-			-ptx*)
-				vendor=sequent
-				;;
-			-vxsim* | -vxworks*)
-				vendor=wrs
-				;;
-			-aux*)
-				vendor=apple
-				;;
-		esac
-		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
-		;;
-esac
-
-echo $basic_machine$os

diff --git a/configure b/configure
deleted file mode 100755
index 35c9db5..0000000
--- a/configure
+++ /dev/null

@@ -1,2011 +0,0 @@
-#! /bin/sh
-
-# Guess values for system-dependent variables and create Makefiles.
-# Generated automatically using autoconf version 2.12 
-# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc.
-#
-# This configure script is free software; the Free Software Foundation
-# gives unlimited permission to copy, distribute and modify it.
-
-# Defaults:
-ac_help=
-ac_default_prefix=/usr/local
-# Any additions from configure.in:
-ac_help="$ac_help
-  --enable-shared         build shared library using GNU libtool"
-ac_help="$ac_help
-  --enable-static         build static library using GNU libtool"
-ac_help="$ac_help
-  --enable-maxmem[=N]     enable use of temp files, set max mem usage to N MB"
-ac_help="$ac_help
-"
-
-# Initialize some variables set by options.
-# The variables have the same names as the options, with
-# dashes changed to underlines.
-build=NONE
-cache_file=./config.cache
-exec_prefix=NONE
-host=NONE
-no_create=
-nonopt=NONE
-no_recursion=
-prefix=NONE
-program_prefix=NONE
-program_suffix=NONE
-program_transform_name=s,x,x,
-silent=
-site=
-srcdir=
-target=NONE
-verbose=
-x_includes=NONE
-x_libraries=NONE
-bindir='${exec_prefix}/bin'
-sbindir='${exec_prefix}/sbin'
-libexecdir='${exec_prefix}/libexec'
-datadir='${prefix}/share'
-sysconfdir='${prefix}/etc'
-sharedstatedir='${prefix}/com'
-localstatedir='${prefix}/var'
-libdir='${exec_prefix}/lib'
-includedir='${prefix}/include'
-oldincludedir='/usr/include'
-infodir='${prefix}/info'
-mandir='${prefix}/man'
-
-# Initialize some other variables.
-subdirs=
-MFLAGS= MAKEFLAGS=
-# Maximum number of lines to put in a shell here document.
-ac_max_here_lines=12
-
-ac_prev=
-for ac_option
-do
-
-  # If the previous option needs an argument, assign it.
-  if test -n "$ac_prev"; then
-    eval "$ac_prev=\$ac_option"
-    ac_prev=
-    continue
-  fi
-
-  case "$ac_option" in
-  -*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
-  *) ac_optarg= ;;
-  esac
-
-  # Accept the important Cygnus configure options, so we can diagnose typos.
-
-  case "$ac_option" in
-
-  -bindir | --bindir | --bindi | --bind | --bin | --bi)
-    ac_prev=bindir ;;
-  -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
-    bindir="$ac_optarg" ;;
-
-  -build | --build | --buil | --bui | --bu)
-    ac_prev=build ;;
-  -build=* | --build=* | --buil=* | --bui=* | --bu=*)
-    build="$ac_optarg" ;;
-
-  -cache-file | --cache-file | --cache-fil | --cache-fi \
-  | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
-    ac_prev=cache_file ;;
-  -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
-  | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
-    cache_file="$ac_optarg" ;;
-
-  -datadir | --datadir | --datadi | --datad | --data | --dat | --da)
-    ac_prev=datadir ;;
-  -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \
-  | --da=*)
-    datadir="$ac_optarg" ;;
-
-  -disable-* | --disable-*)
-    ac_feature=`echo $ac_option|sed -e 's/-*disable-//'`
-    # Reject names that are not valid shell variable names.
-    if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then
-      { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; }
-    fi
-    ac_feature=`echo $ac_feature| sed 's/-/_/g'`
-    eval "enable_${ac_feature}=no" ;;
-
-  -enable-* | --enable-*)
-    ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'`
-    # Reject names that are not valid shell variable names.
-    if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then
-      { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; }
-    fi
-    ac_feature=`echo $ac_feature| sed 's/-/_/g'`
-    case "$ac_option" in
-      *=*) ;;
-      *) ac_optarg=yes ;;
-    esac
-    eval "enable_${ac_feature}='$ac_optarg'" ;;
-
-  -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
-  | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
-  | --exec | --exe | --ex)
-    ac_prev=exec_prefix ;;
-  -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
-  | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
-  | --exec=* | --exe=* | --ex=*)
-    exec_prefix="$ac_optarg" ;;
-
-  -gas | --gas | --ga | --g)
-    # Obsolete; use --with-gas.
-    with_gas=yes ;;
-
-  -help | --help | --hel | --he)
-    # Omit some internal or obsolete options to make the list less imposing.
-    # This message is too long to be a string in the A/UX 3.1 sh.
-    cat << EOF
-Usage: configure [options] [host]
-Options: [defaults in brackets after descriptions]
-Configuration:
-  --cache-file=FILE       cache test results in FILE
-  --help                  print this message
-  --no-create             do not create output files
-  --quiet, --silent       do not print \`checking...' messages
-  --version               print the version of autoconf that created configure
-Directory and file names:
-  --prefix=PREFIX         install architecture-independent files in PREFIX
-                          [$ac_default_prefix]
-  --exec-prefix=EPREFIX   install architecture-dependent files in EPREFIX
-                          [same as prefix]
-  --bindir=DIR            user executables in DIR [EPREFIX/bin]
-  --sbindir=DIR           system admin executables in DIR [EPREFIX/sbin]
-  --libexecdir=DIR        program executables in DIR [EPREFIX/libexec]
-  --datadir=DIR           read-only architecture-independent data in DIR
-                          [PREFIX/share]
-  --sysconfdir=DIR        read-only single-machine data in DIR [PREFIX/etc]
-  --sharedstatedir=DIR    modifiable architecture-independent data in DIR
-                          [PREFIX/com]
-  --localstatedir=DIR     modifiable single-machine data in DIR [PREFIX/var]
-  --libdir=DIR            object code libraries in DIR [EPREFIX/lib]
-  --includedir=DIR        C header files in DIR [PREFIX/include]
-  --oldincludedir=DIR     C header files for non-gcc in DIR [/usr/include]
-  --infodir=DIR           info documentation in DIR [PREFIX/info]
-  --mandir=DIR            man documentation in DIR [PREFIX/man]
-  --srcdir=DIR            find the sources in DIR [configure dir or ..]
-  --program-prefix=PREFIX prepend PREFIX to installed program names
-  --program-suffix=SUFFIX append SUFFIX to installed program names
-  --program-transform-name=PROGRAM
-                          run sed PROGRAM on installed program names
-EOF
-    cat << EOF
-Host type:
-  --build=BUILD           configure for building on BUILD [BUILD=HOST]
-  --host=HOST             configure for HOST [guessed]
-  --target=TARGET         configure for TARGET [TARGET=HOST]
-Features and packages:
-  --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
-  --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
-  --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
-  --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
-  --x-includes=DIR        X include files are in DIR
-  --x-libraries=DIR       X library files are in DIR
-EOF
-    if test -n "$ac_help"; then
-      echo "--enable and --with options recognized:$ac_help"
-    fi
-    exit 0 ;;
-
-  -host | --host | --hos | --ho)
-    ac_prev=host ;;
-  -host=* | --host=* | --hos=* | --ho=*)
-    host="$ac_optarg" ;;
-
-  -includedir | --includedir | --includedi | --included | --include \
-  | --includ | --inclu | --incl | --inc)
-    ac_prev=includedir ;;
-  -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
-  | --includ=* | --inclu=* | --incl=* | --inc=*)
-    includedir="$ac_optarg" ;;
-
-  -infodir | --infodir | --infodi | --infod | --info | --inf)
-    ac_prev=infodir ;;
-  -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
-    infodir="$ac_optarg" ;;
-
-  -libdir | --libdir | --libdi | --libd)
-    ac_prev=libdir ;;
-  -libdir=* | --libdir=* | --libdi=* | --libd=*)
-    libdir="$ac_optarg" ;;
-
-  -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
-  | --libexe | --libex | --libe)
-    ac_prev=libexecdir ;;
-  -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
-  | --libexe=* | --libex=* | --libe=*)
-    libexecdir="$ac_optarg" ;;
-
-  -localstatedir | --localstatedir | --localstatedi | --localstated \
-  | --localstate | --localstat | --localsta | --localst \
-  | --locals | --local | --loca | --loc | --lo)
-    ac_prev=localstatedir ;;
-  -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
-  | --localstate=* | --localstat=* | --localsta=* | --localst=* \
-  | --locals=* | --local=* | --loca=* | --loc=* | --lo=*)
-    localstatedir="$ac_optarg" ;;
-
-  -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
-    ac_prev=mandir ;;
-  -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
-    mandir="$ac_optarg" ;;
-
-  -nfp | --nfp | --nf)
-    # Obsolete; use --without-fp.
-    with_fp=no ;;
-
-  -no-create | --no-create | --no-creat | --no-crea | --no-cre \
-  | --no-cr | --no-c)
-    no_create=yes ;;
-
-  -no-recursion | --no-recursion | --no-recursio | --no-recursi \
-  | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
-    no_recursion=yes ;;
-
-  -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
-  | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
-  | --oldin | --oldi | --old | --ol | --o)
-    ac_prev=oldincludedir ;;
-  -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
-  | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
-  | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
-    oldincludedir="$ac_optarg" ;;
-
-  -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
-    ac_prev=prefix ;;
-  -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
-    prefix="$ac_optarg" ;;
-
-  -program-prefix | --program-prefix | --program-prefi | --program-pref \
-  | --program-pre | --program-pr | --program-p)
-    ac_prev=program_prefix ;;
-  -program-prefix=* | --program-prefix=* | --program-prefi=* \
-  | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
-    program_prefix="$ac_optarg" ;;
-
-  -program-suffix | --program-suffix | --program-suffi | --program-suff \
-  | --program-suf | --program-su | --program-s)
-    ac_prev=program_suffix ;;
-  -program-suffix=* | --program-suffix=* | --program-suffi=* \
-  | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
-    program_suffix="$ac_optarg" ;;
-
-  -program-transform-name | --program-transform-name \
-  | --program-transform-nam | --program-transform-na \
-  | --program-transform-n | --program-transform- \
-  | --program-transform | --program-transfor \
-  | --program-transfo | --program-transf \
-  | --program-trans | --program-tran \
-  | --progr-tra | --program-tr | --program-t)
-    ac_prev=program_transform_name ;;
-  -program-transform-name=* | --program-transform-name=* \
-  | --program-transform-nam=* | --program-transform-na=* \
-  | --program-transform-n=* | --program-transform-=* \
-  | --program-transform=* | --program-transfor=* \
-  | --program-transfo=* | --program-transf=* \
-  | --program-trans=* | --program-tran=* \
-  | --progr-tra=* | --program-tr=* | --program-t=*)
-    program_transform_name="$ac_optarg" ;;
-
-  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
-  | -silent | --silent | --silen | --sile | --sil)
-    silent=yes ;;
-
-  -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
-    ac_prev=sbindir ;;
-  -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
-  | --sbi=* | --sb=*)
-    sbindir="$ac_optarg" ;;
-
-  -sharedstatedir | --sharedstatedir | --sharedstatedi \
-  | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
-  | --sharedst | --shareds | --shared | --share | --shar \
-  | --sha | --sh)
-    ac_prev=sharedstatedir ;;
-  -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
-  | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
-  | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
-  | --sha=* | --sh=*)
-    sharedstatedir="$ac_optarg" ;;
-
-  -site | --site | --sit)
-    ac_prev=site ;;
-  -site=* | --site=* | --sit=*)
-    site="$ac_optarg" ;;
-
-  -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
-    ac_prev=srcdir ;;
-  -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
-    srcdir="$ac_optarg" ;;
-
-  -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
-  | --syscon | --sysco | --sysc | --sys | --sy)
-    ac_prev=sysconfdir ;;
-  -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
-  | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
-    sysconfdir="$ac_optarg" ;;
-
-  -target | --target | --targe | --targ | --tar | --ta | --t)
-    ac_prev=target ;;
-  -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
-    target="$ac_optarg" ;;
-
-  -v | -verbose | --verbose | --verbos | --verbo | --verb)
-    verbose=yes ;;
-
-  -version | --version | --versio | --versi | --vers)
-    echo "configure generated by autoconf version 2.12"
-    exit 0 ;;
-
-  -with-* | --with-*)
-    ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'`
-    # Reject names that are not valid shell variable names.
-    if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then
-      { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; }
-    fi
-    ac_package=`echo $ac_package| sed 's/-/_/g'`
-    case "$ac_option" in
-      *=*) ;;
-      *) ac_optarg=yes ;;
-    esac
-    eval "with_${ac_package}='$ac_optarg'" ;;
-
-  -without-* | --without-*)
-    ac_package=`echo $ac_option|sed -e 's/-*without-//'`
-    # Reject names that are not valid shell variable names.
-    if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then
-      { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; }
-    fi
-    ac_package=`echo $ac_package| sed 's/-/_/g'`
-    eval "with_${ac_package}=no" ;;
-
-  --x)
-    # Obsolete; use --with-x.
-    with_x=yes ;;
-
-  -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
-  | --x-incl | --x-inc | --x-in | --x-i)
-    ac_prev=x_includes ;;
-  -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
-  | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
-    x_includes="$ac_optarg" ;;
-
-  -x-libraries | --x-libraries | --x-librarie | --x-librari \
-  | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
-    ac_prev=x_libraries ;;
-  -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
-  | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
-    x_libraries="$ac_optarg" ;;
-
-  -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; }
-    ;;
-
-  *=*)
-    varname=`echo "$ac_option"|sed -e 's/=.*//'`
-    # Reject names that aren't valid shell variable names.
-    if test -n "`echo $varname| sed 's/[a-zA-Z0-9_]//g'`"; then
-      { echo "configure: error: $varname: invalid shell variable name" 1>&2; exit 1; }
-    fi
-    val="`echo "$ac_option"|sed 's/[^=]*=//'`"
-    test -n "$verbose" && echo "	setting shell variable $varname to $val"
-    eval "$varname='$val'"
-    eval "export $varname" ;;
-
-  *)
-    if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then
-      echo "configure: warning: $ac_option: invalid host type" 1>&2
-    fi
-    if test "x$nonopt" != xNONE; then
-      { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; }
-    fi
-    nonopt="$ac_option"
-    ;;
-
-  esac
-done
-
-if test -n "$ac_prev"; then
-  { echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; }
-fi
-
-trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
-
-# File descriptor usage:
-# 0 standard input
-# 1 file creation
-# 2 errors and warnings
-# 3 some systems may open it to /dev/tty
-# 4 used on the Kubota Titan
-# 6 checking for... messages and results
-# 5 compiler messages saved in config.log
-if test "$silent" = yes; then
-  exec 6>/dev/null
-else
-  exec 6>&1
-fi
-exec 5>./config.log
-
-echo "\
-This file contains any messages produced by compilers while
-running configure, to aid debugging if configure makes a mistake.
-" 1>&5
-
-# Strip out --no-create and --no-recursion so they do not pile up.
-# Also quote any args containing shell metacharacters.
-ac_configure_args=
-for ac_arg
-do
-  case "$ac_arg" in
-  -no-create | --no-create | --no-creat | --no-crea | --no-cre \
-  | --no-cr | --no-c) ;;
-  -no-recursion | --no-recursion | --no-recursio | --no-recursi \
-  | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;;
-  *" "*|*"	"*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
-  ac_configure_args="$ac_configure_args '$ac_arg'" ;;
-  *) ac_configure_args="$ac_configure_args $ac_arg" ;;
-  esac
-done
-
-# NLS nuisances.
-# Only set these to C if already set.  These must not be set unconditionally
-# because not all systems understand e.g. LANG=C (notably SCO).
-# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'!
-# Non-C LC_CTYPE values break the ctype check.
-if test "${LANG+set}"   = set; then LANG=C;   export LANG;   fi
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi
-if test "${LC_CTYPE+set}"    = set; then LC_CTYPE=C;    export LC_CTYPE;    fi
-
-# confdefs.h avoids OS command line length limits that DEFS can exceed.
-rm -rf conftest* confdefs.h
-# AIX cpp loses on an empty file, so make sure it contains at least a newline.
-echo > confdefs.h
-
-# A filename unique to this package, relative to the directory that
-# configure is in, which we can look for to find out if srcdir is correct.
-ac_unique_file=jcmaster.c
-
-# Find the source files, if location was not specified.
-if test -z "$srcdir"; then
-  ac_srcdir_defaulted=yes
-  # Try the directory containing this script, then its parent.
-  ac_prog=$0
-  ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'`
-  test "x$ac_confdir" = "x$ac_prog" && ac_confdir=.
-  srcdir=$ac_confdir
-  if test ! -r $srcdir/$ac_unique_file; then
-    srcdir=..
-  fi
-else
-  ac_srcdir_defaulted=no
-fi
-if test ! -r $srcdir/$ac_unique_file; then
-  if test "$ac_srcdir_defaulted" = yes; then
-    { echo "configure: error: can not find sources in $ac_confdir or .." 1>&2; exit 1; }
-  else
-    { echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; }
-  fi
-fi
-srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'`
-
-# Prefer explicitly selected file to automatically selected ones.
-if test -z "$CONFIG_SITE"; then
-  if test "x$prefix" != xNONE; then
-    CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site"
-  else
-    CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
-  fi
-fi
-for ac_site_file in $CONFIG_SITE; do
-  if test -r "$ac_site_file"; then
-    echo "loading site script $ac_site_file"
-    . "$ac_site_file"
-  fi
-done
-
-
-ac_ext=c
-# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
-cross_compiling=$ac_cv_prog_cc_cross
-
-if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
-  # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
-  if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
-    ac_n= ac_c='
-' ac_t='	'
-  else
-    ac_n=-n ac_c= ac_t=
-  fi
-else
-  ac_n= ac_c='\c' ac_t=
-fi
-
-
-
-# Extract the first word of "gcc", so it can be a program name with args.
-set dummy gcc; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:538: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  if test -n "$CC"; then
-  ac_cv_prog_CC="$CC" # Let the user override the test.
-else
-  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-  for ac_dir in $PATH; do
-    test -z "$ac_dir" && ac_dir=.
-    if test -f $ac_dir/$ac_word; then
-      ac_cv_prog_CC="gcc"
-      break
-    fi
-  done
-  IFS="$ac_save_ifs"
-fi
-fi
-CC="$ac_cv_prog_CC"
-if test -n "$CC"; then
-  echo "$ac_t""$CC" 1>&6
-else
-  echo "$ac_t""no" 1>&6
-fi
-
-if test -z "$CC"; then
-  # Extract the first word of "cc", so it can be a program name with args.
-set dummy cc; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:567: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  if test -n "$CC"; then
-  ac_cv_prog_CC="$CC" # Let the user override the test.
-else
-  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-  ac_prog_rejected=no
-  for ac_dir in $PATH; do
-    test -z "$ac_dir" && ac_dir=.
-    if test -f $ac_dir/$ac_word; then
-      if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then
-        ac_prog_rejected=yes
-	continue
-      fi
-      ac_cv_prog_CC="cc"
-      break
-    fi
-  done
-  IFS="$ac_save_ifs"
-if test $ac_prog_rejected = yes; then
-  # We found a bogon in the path, so make sure we never use it.
-  set dummy $ac_cv_prog_CC
-  shift
-  if test $# -gt 0; then
-    # We chose a different compiler from the bogus one.
-    # However, it has the same basename, so the bogon will be chosen
-    # first if we set CC to just the basename; use the full file name.
-    shift
-    set dummy "$ac_dir/$ac_word" "$@"
-    shift
-    ac_cv_prog_CC="$@"
-  fi
-fi
-fi
-fi
-CC="$ac_cv_prog_CC"
-if test -n "$CC"; then
-  echo "$ac_t""$CC" 1>&6
-else
-  echo "$ac_t""no" 1>&6
-fi
-
-  test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; }
-fi
-
-echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:615: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
-
-ac_ext=c
-# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
-cross_compiling=$ac_cv_prog_cc_cross
-
-cat > conftest.$ac_ext <<EOF
-#line 625 "configure"
-#include "confdefs.h"
-main(){return(0);}
-EOF
-if { (eval echo configure:629: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-  ac_cv_prog_cc_works=yes
-  # If we can't run a trivial program, we are probably using a cross compiler.
-  if (./conftest; exit) 2>/dev/null; then
-    ac_cv_prog_cc_cross=no
-  else
-    ac_cv_prog_cc_cross=yes
-  fi
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  ac_cv_prog_cc_works=no
-fi
-rm -fr conftest*
-
-echo "$ac_t""$ac_cv_prog_cc_works" 1>&6
-if test $ac_cv_prog_cc_works = no; then
-  { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; }
-fi
-echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:649: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
-echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6
-cross_compiling=$ac_cv_prog_cc_cross
-
-echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6
-echo "configure:654: checking whether we are using GNU C" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  cat > conftest.c <<EOF
-#ifdef __GNUC__
-  yes;
-#endif
-EOF
-if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:663: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
-  ac_cv_prog_gcc=yes
-else
-  ac_cv_prog_gcc=no
-fi
-fi
-
-echo "$ac_t""$ac_cv_prog_gcc" 1>&6
-
-if test $ac_cv_prog_gcc = yes; then
-  GCC=yes
-  test "${CFLAGS+set}" = set || CFLAGS="-O2"
-else
-  GCC=
-  test "${CFLAGS+set}" = set || CFLAGS="-O"
-fi
-
-echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:681: checking how to run the C preprocessor" >&5
-# On Suns, sometimes $CPP names a directory.
-if test -n "$CPP" && test -d "$CPP"; then
-  CPP=
-fi
-if test -z "$CPP"; then
-if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-    # This must be in double quotes, not single quotes, because CPP may get
-  # substituted into the Makefile and "${CC-cc}" will confuse make.
-  CPP="${CC-cc} -E"
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp.
-  cat > conftest.$ac_ext <<EOF
-#line 696 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:702: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
-  :
-else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  CPP="${CC-cc} -E -traditional-cpp"
-  cat > conftest.$ac_ext <<EOF
-#line 713 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:719: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
-  :
-else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  CPP=/lib/cpp
-fi
-rm -f conftest*
-fi
-rm -f conftest*
-  ac_cv_prog_CPP="$CPP"
-fi
-  CPP="$ac_cv_prog_CPP"
-else
-  ac_cv_prog_CPP="$CPP"
-fi
-echo "$ac_t""$CPP" 1>&6
-
-echo $ac_n "checking for function prototypes""... $ac_c" 1>&6
-echo "configure:742: checking for function prototypes" >&5
-if eval "test \"`echo '$''{'ijg_cv_have_prototypes'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 747 "configure"
-#include "confdefs.h"
-
-int testfunction (int arg1, int * arg2); /* check prototypes */
-struct methods_struct {		/* check method-pointer declarations */
-  int (*error_exit) (char *msgtext);
-  int (*trace_message) (char *msgtext);
-  int (*another_method) (void);
-};
-int testfunction (int arg1, int * arg2) /* check definitions */
-{ return arg2[arg1]; }
-int test2function (void)	/* check void arg list */
-{ return 0; }
-
-int main() {
- 
-; return 0; }
-EOF
-if { (eval echo configure:765: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  ijg_cv_have_prototypes=yes
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  ijg_cv_have_prototypes=no
-fi
-rm -f conftest*
-fi
-
-echo "$ac_t""$ijg_cv_have_prototypes" 1>&6
-if test $ijg_cv_have_prototypes = yes; then
-  cat >> confdefs.h <<\EOF
-#define HAVE_PROTOTYPES 
-EOF
-
-else
-  echo Your compiler does not seem to know about function prototypes.
-  echo Perhaps it needs a special switch to enable ANSI C mode.
-  echo If so, we recommend running configure like this:
-  echo "   ./configure  CC='cc -switch'"
-  echo where -switch is the proper switch.
-fi
-ac_safe=`echo "stddef.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for stddef.h""... $ac_c" 1>&6
-echo "configure:792: checking for stddef.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 797 "configure"
-#include "confdefs.h"
-#include <stddef.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:802: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=yes"
-else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
-  echo "$ac_t""yes" 1>&6
-  cat >> confdefs.h <<\EOF
-#define HAVE_STDDEF_H 
-EOF
-
-else
-  echo "$ac_t""no" 1>&6
-fi
-
-ac_safe=`echo "stdlib.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for stdlib.h""... $ac_c" 1>&6
-echo "configure:828: checking for stdlib.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 833 "configure"
-#include "confdefs.h"
-#include <stdlib.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:838: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=yes"
-else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
-  echo "$ac_t""yes" 1>&6
-  cat >> confdefs.h <<\EOF
-#define HAVE_STDLIB_H 
-EOF
-
-else
-  echo "$ac_t""no" 1>&6
-fi
-
-ac_safe=`echo "string.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for string.h""... $ac_c" 1>&6
-echo "configure:864: checking for string.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 869 "configure"
-#include "confdefs.h"
-#include <string.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:874: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=yes"
-else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
-  echo "$ac_t""yes" 1>&6
-  :
-else
-  echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define NEED_BSD_STRINGS 
-EOF
-
-fi
-
-echo $ac_n "checking for size_t""... $ac_c" 1>&6
-echo "configure:900: checking for size_t" >&5
-cat > conftest.$ac_ext <<EOF
-#line 902 "configure"
-#include "confdefs.h"
-
-#ifdef HAVE_STDDEF_H
-#include <stddef.h>
-#endif
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-#include <stdio.h>
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#else
-#include <string.h>
-#endif
-typedef size_t my_size_t;
-
-int main() {
- my_size_t foovar; 
-; return 0; }
-EOF
-if { (eval echo configure:923: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  ijg_size_t_ok=yes
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  ijg_size_t_ok="not ANSI, perhaps it is in sys/types.h"
-fi
-rm -f conftest*
-echo "$ac_t""$ijg_size_t_ok" 1>&6
-if test "$ijg_size_t_ok" != yes; then
-ac_safe=`echo "sys/types.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for sys/types.h""... $ac_c" 1>&6
-echo "configure:937: checking for sys/types.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 942 "configure"
-#include "confdefs.h"
-#include <sys/types.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:947: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=yes"
-else
-  echo "$ac_err" >&5
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
-  echo "$ac_t""yes" 1>&6
-  cat >> confdefs.h <<\EOF
-#define NEED_SYS_TYPES_H 
-EOF
-
-cat > conftest.$ac_ext <<EOF
-#line 968 "configure"
-#include "confdefs.h"
-#include <sys/types.h>
-EOF
-if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
-  egrep "size_t" >/dev/null 2>&1; then
-  rm -rf conftest*
-  ijg_size_t_ok="size_t is in sys/types.h"
-else
-  rm -rf conftest*
-  ijg_size_t_ok=no
-fi
-rm -f conftest*
-
-else
-  echo "$ac_t""no" 1>&6
-ijg_size_t_ok=no
-fi
-
-echo "$ac_t""$ijg_size_t_ok" 1>&6
-if test "$ijg_size_t_ok" = no; then
-  echo Type size_t is not defined in any of the usual places.
-  echo Try putting '"typedef unsigned int size_t;"' in jconfig.h.
-fi
-fi
-echo $ac_n "checking for type unsigned char""... $ac_c" 1>&6
-echo "configure:994: checking for type unsigned char" >&5
-cat > conftest.$ac_ext <<EOF
-#line 996 "configure"
-#include "confdefs.h"
-
-int main() {
- unsigned char un_char; 
-; return 0; }
-EOF
-if { (eval echo configure:1003: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  echo "$ac_t""yes" 1>&6
-cat >> confdefs.h <<\EOF
-#define HAVE_UNSIGNED_CHAR 
-EOF
-
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  echo "$ac_t""no" 1>&6
-fi
-rm -f conftest*
-echo $ac_n "checking for type unsigned short""... $ac_c" 1>&6
-echo "configure:1018: checking for type unsigned short" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1020 "configure"
-#include "confdefs.h"
-
-int main() {
- unsigned short un_short; 
-; return 0; }
-EOF
-if { (eval echo configure:1027: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  echo "$ac_t""yes" 1>&6
-cat >> confdefs.h <<\EOF
-#define HAVE_UNSIGNED_SHORT 
-EOF
-
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  echo "$ac_t""no" 1>&6
-fi
-rm -f conftest*
-echo $ac_n "checking for type void""... $ac_c" 1>&6
-echo "configure:1042: checking for type void" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1044 "configure"
-#include "confdefs.h"
-
-/* Caution: a C++ compiler will insist on valid prototypes */
-typedef void * void_ptr;	/* check void * */
-#ifdef HAVE_PROTOTYPES		/* check ptr to function returning void */
-typedef void (*void_func) (int a, int b);
-#else
-typedef void (*void_func) ();
-#endif
-
-#ifdef HAVE_PROTOTYPES		/* check void function result */
-void test3function (void_ptr arg1, void_func arg2)
-#else
-void test3function (arg1, arg2)
-     void_ptr arg1;
-     void_func arg2;
-#endif
-{
-  char * locptr = (char *) arg1; /* check casting to and from void * */
-  arg1 = (void *) locptr;
-  (*arg2) (1, 2);		/* check call of fcn returning void */
-}
-
-int main() {
- 
-; return 0; }
-EOF
-if { (eval echo configure:1072: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  echo "$ac_t""yes" 1>&6
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define void char
-EOF
-
-fi
-rm -f conftest*
-
-echo $ac_n "checking for working const""... $ac_c" 1>&6
-echo "configure:1088: checking for working const" >&5
-if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 1093 "configure"
-#include "confdefs.h"
-
-int main() {
-
-/* Ultrix mips cc rejects this.  */
-typedef int charset[2]; const charset x;
-/* SunOS 4.1.1 cc rejects this.  */
-char const *const *ccp;
-char **p;
-/* NEC SVR4.0.2 mips cc rejects this.  */
-struct point {int x, y;};
-static struct point const zero = {0,0};
-/* AIX XL C 1.02.0.0 rejects this.
-   It does not let you subtract one const X* pointer from another in an arm
-   of an if-expression whose if-part is not a constant expression */
-const char *g = "string";
-ccp = &g + (g ? g-g : 0);
-/* HPUX 7.0 cc rejects these. */
-++ccp;
-p = (char**) ccp;
-ccp = (char const *const *) p;
-{ /* SCO 3.2v4 cc rejects this.  */
-  char *t;
-  char const *s = 0 ? (char *) 0 : (char const *) 0;
-
-  *t++ = 0;
-}
-{ /* Someone thinks the Sun supposedly-ANSI compiler will reject this.  */
-  int x[] = {25, 17};
-  const int *foo = &x[0];
-  ++foo;
-}
-{ /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */
-  typedef const int *iptr;
-  iptr p = 0;
-  ++p;
-}
-{ /* AIX XL C 1.02.0.0 rejects this saying
-     "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */
-  struct s { int j; const int *ap[3]; };
-  struct s *b; b->j = 5;
-}
-{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */
-  const int foo = 10;
-}
-
-; return 0; }
-EOF
-if { (eval echo configure:1142: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  ac_cv_c_const=yes
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  ac_cv_c_const=no
-fi
-rm -f conftest*
-fi
-
-echo "$ac_t""$ac_cv_c_const" 1>&6
-if test $ac_cv_c_const = no; then
-  cat >> confdefs.h <<\EOF
-#define const 
-EOF
-
-fi
-
-echo $ac_n "checking for inline""... $ac_c" 1>&6
-echo "configure:1163: checking for inline" >&5
-ijg_cv_inline=""
-cat > conftest.$ac_ext <<EOF
-#line 1166 "configure"
-#include "confdefs.h"
-
-int main() {
-} __inline__ int foo() { return 0; }
-int bar() { return foo();
-; return 0; }
-EOF
-if { (eval echo configure:1174: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  ijg_cv_inline="__inline__"
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  cat > conftest.$ac_ext <<EOF
-#line 1182 "configure"
-#include "confdefs.h"
-
-int main() {
-} __inline int foo() { return 0; }
-int bar() { return foo();
-; return 0; }
-EOF
-if { (eval echo configure:1190: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  ijg_cv_inline="__inline"
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  cat > conftest.$ac_ext <<EOF
-#line 1198 "configure"
-#include "confdefs.h"
-
-int main() {
-} inline int foo() { return 0; }
-int bar() { return foo();
-; return 0; }
-EOF
-if { (eval echo configure:1206: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  ijg_cv_inline="inline"
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-fi
-rm -f conftest*
-fi
-rm -f conftest*
-fi
-rm -f conftest*
-echo "$ac_t""$ijg_cv_inline" 1>&6
-cat >> confdefs.h <<EOF
-#define INLINE $ijg_cv_inline
-EOF
-
-echo $ac_n "checking for broken incomplete types""... $ac_c" 1>&6
-echo "configure:1224: checking for broken incomplete types" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1226 "configure"
-#include "confdefs.h"
- typedef struct undefined_structure * undef_struct_ptr; 
-int main() {
-
-; return 0; }
-EOF
-if { (eval echo configure:1233: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
-  rm -rf conftest*
-  echo "$ac_t""ok" 1>&6
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  echo "$ac_t""broken" 1>&6
-cat >> confdefs.h <<\EOF
-#define INCOMPLETE_TYPES_BROKEN 
-EOF
-
-fi
-rm -f conftest*
-echo $ac_n "checking for short external names""... $ac_c" 1>&6
-echo "configure:1248: checking for short external names" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1250 "configure"
-#include "confdefs.h"
-
-int possibly_duplicate_function () { return 0; }
-int possibly_dupli_function () { return 1; }
-
-int main() {
- 
-; return 0; }
-EOF
-if { (eval echo configure:1260: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-  rm -rf conftest*
-  echo "$ac_t""ok" 1>&6
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  echo "$ac_t""short" 1>&6
-cat >> confdefs.h <<\EOF
-#define NEED_SHORT_EXTERNAL_NAMES 
-EOF
-
-fi
-rm -f conftest*
-echo $ac_n "checking to see if char is signed""... $ac_c" 1>&6
-echo "configure:1275: checking to see if char is signed" >&5
-if test "$cross_compiling" = yes; then
-  echo Assuming that char is signed on target machine.
-echo If it is unsigned, this will be a little bit inefficient.
-
-else
-  cat > conftest.$ac_ext <<EOF
-#line 1282 "configure"
-#include "confdefs.h"
-
-#ifdef HAVE_PROTOTYPES
-int is_char_signed (int arg)
-#else
-int is_char_signed (arg)
-     int arg;
-#endif
-{
-  if (arg == 189) {		/* expected result for unsigned char */
-    return 0;			/* type char is unsigned */
-  }
-  else if (arg != -67) {	/* expected result for signed char */
-    printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
-    printf("I fear the JPEG software will not work at all.\n\n");
-  }
-  return 1;			/* assume char is signed otherwise */
-}
-char signed_char_check = (char) (-67);
-main() {
-  exit(is_char_signed((int) signed_char_check));
-}
-EOF
-if { (eval echo configure:1306: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
-then
-  echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define CHAR_IS_UNSIGNED 
-EOF
-
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -fr conftest*
-  echo "$ac_t""yes" 1>&6
-fi
-rm -fr conftest*
-fi
-
-echo $ac_n "checking to see if right shift is signed""... $ac_c" 1>&6
-echo "configure:1323: checking to see if right shift is signed" >&5
-if test "$cross_compiling" = yes; then
-  echo "$ac_t""Assuming that right shift is signed on target machine." 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 1328 "configure"
-#include "confdefs.h"
-
-#ifdef HAVE_PROTOTYPES
-int is_shifting_signed (long arg)
-#else
-int is_shifting_signed (arg)
-     long arg;
-#endif
-/* See whether right-shift on a long is signed or not. */
-{
-  long res = arg >> 4;
-
-  if (res == -0x7F7E80CL) {	/* expected result for signed shift */
-    return 1;			/* right shift is signed */
-  }
-  /* see if unsigned-shift hack will fix it. */
-  /* we can't just test exact value since it depends on width of long... */
-  res |= (~0L) << (32-4);
-  if (res == -0x7F7E80CL) {	/* expected result now? */
-    return 0;			/* right shift is unsigned */
-  }
-  printf("Right shift isn't acting as I expect it to.\n");
-  printf("I fear the JPEG software will not work at all.\n\n");
-  return 0;			/* try it with unsigned anyway */
-}
-main() {
-  exit(is_shifting_signed(-0x7F7E80B1L));
-}
-EOF
-if { (eval echo configure:1358: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
-then
-  echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define RIGHT_SHIFT_IS_UNSIGNED 
-EOF
-
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -fr conftest*
-  echo "$ac_t""yes" 1>&6
-fi
-rm -fr conftest*
-fi
-
-echo $ac_n "checking to see if fopen accepts b spec""... $ac_c" 1>&6
-echo "configure:1375: checking to see if fopen accepts b spec" >&5
-if test "$cross_compiling" = yes; then
-  echo "$ac_t""Assuming that it does." 1>&6
-else
-  cat > conftest.$ac_ext <<EOF
-#line 1380 "configure"
-#include "confdefs.h"
-
-#include <stdio.h>
-main() {
-  if (fopen("conftestdata", "wb") != NULL)
-    exit(0);
-  exit(1);
-}
-EOF
-if { (eval echo configure:1390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
-then
-  echo "$ac_t""yes" 1>&6
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -fr conftest*
-  echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define DONT_USE_B_MODE 
-EOF
-
-fi
-rm -fr conftest*
-fi
-
-ac_aux_dir=
-for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
-  if test -f $ac_dir/install-sh; then
-    ac_aux_dir=$ac_dir
-    ac_install_sh="$ac_aux_dir/install-sh -c"
-    break
-  elif test -f $ac_dir/install.sh; then
-    ac_aux_dir=$ac_dir
-    ac_install_sh="$ac_aux_dir/install.sh -c"
-    break
-  fi
-done
-if test -z "$ac_aux_dir"; then
-  { echo "configure: error: can not find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; }
-fi
-ac_config_guess=$ac_aux_dir/config.guess
-ac_config_sub=$ac_aux_dir/config.sub
-ac_configure=$ac_aux_dir/configure # This should be Cygnus configure.
-
-# Find a good install program.  We prefer a C program (faster),
-# so one script is as good as another.  But avoid the broken or
-# incompatible versions:
-# SysV /etc/install, /usr/sbin/install
-# SunOS /usr/etc/install
-# IRIX /sbin/install
-# AIX /bin/install
-# AFS /usr/afsws/bin/install, which mishandles nonexistent args
-# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
-# ./install, which can be erroneously created by make from ./install.sh.
-echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6
-echo "configure:1436: checking for a BSD compatible install" >&5
-if test -z "$INSTALL"; then
-if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-    IFS="${IFS= 	}"; ac_save_IFS="$IFS"; IFS="${IFS}:"
-  for ac_dir in $PATH; do
-    # Account for people who put trailing slashes in PATH elements.
-    case "$ac_dir/" in
-    /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
-    *)
-      # OSF1 and SCO ODT 3.0 have their own names for install.
-      for ac_prog in ginstall installbsd scoinst install; do
-        if test -f $ac_dir/$ac_prog; then
-	  if test $ac_prog = install &&
-            grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
-	    # AIX install.  It has an incompatible calling convention.
-	    # OSF/1 installbsd also uses dspmsg, but is usable.
-	    :
-	  else
-	    ac_cv_path_install="$ac_dir/$ac_prog -c"
-	    break 2
-	  fi
-	fi
-      done
-      ;;
-    esac
-  done
-  IFS="$ac_save_IFS"
-
-fi
-  if test "${ac_cv_path_install+set}" = set; then
-    INSTALL="$ac_cv_path_install"
-  else
-    # As a last resort, use the slow shell script.  We don't cache a
-    # path for INSTALL within a source directory, because that will
-    # break other packages using the cache if that directory is
-    # removed, or if the path is relative.
-    INSTALL="$ac_install_sh"
-  fi
-fi
-echo "$ac_t""$INSTALL" 1>&6
-
-# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
-# It thinks the first close brace ends the variable substitution.
-test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
-
-test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
-
-# Extract the first word of "ranlib", so it can be a program name with args.
-set dummy ranlib; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1488: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
-  echo $ac_n "(cached) $ac_c" 1>&6
-else
-  if test -n "$RANLIB"; then
-  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
-else
-  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-  for ac_dir in $PATH; do
-    test -z "$ac_dir" && ac_dir=.
-    if test -f $ac_dir/$ac_word; then
-      ac_cv_prog_RANLIB="ranlib"
-      break
-    fi
-  done
-  IFS="$ac_save_ifs"
-  test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
-fi
-fi
-RANLIB="$ac_cv_prog_RANLIB"
-if test -n "$RANLIB"; then
-  echo "$ac_t""$RANLIB" 1>&6
-else
-  echo "$ac_t""no" 1>&6
-fi
-
-
-# Decide whether to use libtool,
-# and if so whether to build shared, static, or both flavors of library.
-LTSHARED="no"
-# Check whether --enable-shared or --disable-shared was given.
-if test "${enable_shared+set}" = set; then
-  enableval="$enable_shared"
-  LTSHARED="$enableval"
-fi
-
-LTSTATIC="no"
-# Check whether --enable-static or --disable-static was given.
-if test "${enable_static+set}" = set; then
-  enableval="$enable_static"
-  LTSTATIC="$enableval"
-fi
-
-if test "x$LTSHARED" != xno  -o  "x$LTSTATIC" != xno; then
-  USELIBTOOL="yes"
-  LIBTOOL="./libtool"
-  O="lo"
-  A="la"
-  LN='$(LIBTOOL) --mode=link $(CC)'
-  INSTALL_LIB='$(LIBTOOL) --mode=install ${INSTALL}'
-  INSTALL_PROGRAM="\$(LIBTOOL) --mode=install $INSTALL_PROGRAM"
-else
-  USELIBTOOL="no"
-  LIBTOOL=""
-  O="o"
-  A="a"
-  LN='$(CC)'
-  INSTALL_LIB="$INSTALL_DATA"
-fi
-
-
-
-
-
-
-# Configure libtool if needed.
-if test $USELIBTOOL = yes; then
-  disable_shared=
-  disable_static=
-  if test "x$LTSHARED" = xno; then
-    disable_shared="--disable-shared"
-  fi
-  if test "x$LTSTATIC" = xno; then
-    disable_static="--disable-static"
-  fi
-  $srcdir/ltconfig $disable_shared $disable_static $srcdir/ltmain.sh
-fi
-
-# Select memory manager depending on user input.
-# If no "-enable-maxmem", use jmemnobs
-MEMORYMGR='jmemnobs.$(O)'
-MAXMEM="no"
-# Check whether --enable-maxmem or --disable-maxmem was given.
-if test "${enable_maxmem+set}" = set; then
-  enableval="$enable_maxmem"
-  MAXMEM="$enableval"
-fi
-
-# support --with-maxmem for backwards compatibility with IJG V5.
-# Check whether --with-maxmem or --without-maxmem was given.
-if test "${with_maxmem+set}" = set; then
-  withval="$with_maxmem"
-  MAXMEM="$withval"
-fi
-
-if test "x$MAXMEM" = xyes; then
-  MAXMEM=1
-fi
-if test "x$MAXMEM" != xno; then
-  if test -n "`echo $MAXMEM | sed 's/[0-9]//g'`"; then
-    { echo "configure: error: non-numeric argument to --enable-maxmem" 1>&2; exit 1; }
-  fi
-  DEFAULTMAXMEM=`expr $MAXMEM \* 1048576`
-cat >> confdefs.h <<EOF
-#define DEFAULT_MAX_MEM ${DEFAULTMAXMEM}
-EOF
-
-echo $ac_n "checking for 'tmpfile()'""... $ac_c" 1>&6
-echo "configure:1596: checking for 'tmpfile()'" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1598 "configure"
-#include "confdefs.h"
-#include <stdio.h>
-int main() {
- FILE * tfile = tmpfile(); 
-; return 0; }
-EOF
-if { (eval echo configure:1605: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-  rm -rf conftest*
-  echo "$ac_t""yes" 1>&6
-MEMORYMGR='jmemansi.$(O)'
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  echo "$ac_t""no" 1>&6
-MEMORYMGR='jmemname.$(O)'
-cat >> confdefs.h <<\EOF
-#define NEED_SIGNAL_CATCHER 
-EOF
-
-echo $ac_n "checking for 'mktemp()'""... $ac_c" 1>&6
-echo "configure:1620: checking for 'mktemp()'" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1622 "configure"
-#include "confdefs.h"
-
-int main() {
- char fname[80]; mktemp(fname); 
-; return 0; }
-EOF
-if { (eval echo configure:1629: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-  rm -rf conftest*
-  echo "$ac_t""yes" 1>&6
-else
-  echo "configure: failed program was:" >&5
-  cat conftest.$ac_ext >&5
-  rm -rf conftest*
-  echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define NO_MKTEMP 
-EOF
-
-fi
-rm -f conftest*
-fi
-rm -f conftest*
-fi
-
-
-# Extract the library version ID from jpeglib.h.
-echo $ac_n "checking libjpeg version number""... $ac_c" 1>&6
-echo "configure:1650: checking libjpeg version number" >&5
-JPEG_LIB_VERSION=`sed -e '/^#define JPEG_LIB_VERSION/!d' -e 's/^[^0-9]*\([0-9][0-9]*\).*$/\1/' $srcdir/jpeglib.h`
-echo "$ac_t""$JPEG_LIB_VERSION" 1>&6
-
-
-# Prepare to massage makefile.cfg correctly.
-if test $ijg_cv_have_prototypes = yes; then
-  A2K_DEPS=""
-  COM_A2K="# "
-else
-  A2K_DEPS="ansi2knr"
-  COM_A2K=""
-fi
-
-
-# ansi2knr needs -DBSD if string.h is missing
-if test $ac_cv_header_string_h = no; then
-  ANSI2KNRFLAGS="-DBSD"
-else
-  ANSI2KNRFLAGS=""
-fi
-
-# Substitutions to enable or disable libtool-related stuff
-if test $USELIBTOOL = yes -a $ijg_cv_have_prototypes = yes; then
-  COM_LT=""
-else
-  COM_LT="# "
-fi
-
-if test "x$LTSHARED" != xno; then
-  FORCE_INSTALL_LIB="install-lib"
-else
-  FORCE_INSTALL_LIB=""
-fi
-
-# Set up -I directives
-if test "x$srcdir" = x.; then
-  INCLUDEFLAGS='-I$(srcdir)'
-else
-  INCLUDEFLAGS='-I. -I$(srcdir)'
-fi
-
-trap '' 1 2 15
-
-trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
-
-test "x$prefix" = xNONE && prefix=$ac_default_prefix
-# Let make expand exec_prefix.
-test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
-
-# Any assignment to VPATH causes Sun make to only execute
-# the first set of double-colon rules, so remove it if not needed.
-# If there is a colon in the path, we need to keep it.
-if test "x$srcdir" = x.; then
-  ac_vpsub='/^[ 	]*VPATH[ 	]*=[^:]*$/d'
-fi
-
-trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15
-
-DEFS=-DHAVE_CONFIG_H
-
-# Without the "./", some shells look in PATH for config.status.
-: ${CONFIG_STATUS=./config.status}
-
-echo creating $CONFIG_STATUS
-rm -f $CONFIG_STATUS
-cat > $CONFIG_STATUS <<EOF
-#! /bin/sh
-# Generated automatically by configure.
-# Run this file to recreate the current configuration.
-# This directory was configured as follows,
-# on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
-#
-# $0 $ac_configure_args
-#
-# Compiler output produced by configure, useful for debugging
-# configure, is in ./config.log if it exists.
-
-ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]"
-for ac_option
-do
-  case "\$ac_option" in
-  -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
-    echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion"
-    exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;;
-  -version | --version | --versio | --versi | --vers | --ver | --ve | --v)
-    echo "$CONFIG_STATUS generated by autoconf version 2.12"
-    exit 0 ;;
-  -help | --help | --hel | --he | --h)
-    echo "\$ac_cs_usage"; exit 0 ;;
-  *) echo "\$ac_cs_usage"; exit 1 ;;
-  esac
-done
-
-ac_given_srcdir=$srcdir
-ac_given_INSTALL="$INSTALL"
-
-trap 'rm -fr `echo "Makefile:makefile.cfg jconfig.h:jconfig.cfg" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15
-EOF
-cat >> $CONFIG_STATUS <<EOF
-
-# Protect against being on the right side of a sed subst in config.status.
-sed 's/%@/@@/; s/@%/@@/; s/%g\$/@g/; /@g\$/s/[\\\\&%]/\\\\&/g;
- s/@@/%@/; s/@@/@%/; s/@g\$/%g/' > conftest.subs <<\\CEOF
-$ac_vpsub
-$extrasub
-s%@CFLAGS@%$CFLAGS%g
-s%@CPPFLAGS@%$CPPFLAGS%g
-s%@CXXFLAGS@%$CXXFLAGS%g
-s%@DEFS@%$DEFS%g
-s%@LDFLAGS@%$LDFLAGS%g
-s%@LIBS@%$LIBS%g
-s%@exec_prefix@%$exec_prefix%g
-s%@prefix@%$prefix%g
-s%@program_transform_name@%$program_transform_name%g
-s%@bindir@%$bindir%g
-s%@sbindir@%$sbindir%g
-s%@libexecdir@%$libexecdir%g
-s%@datadir@%$datadir%g
-s%@sysconfdir@%$sysconfdir%g
-s%@sharedstatedir@%$sharedstatedir%g
-s%@localstatedir@%$localstatedir%g
-s%@libdir@%$libdir%g
-s%@includedir@%$includedir%g
-s%@oldincludedir@%$oldincludedir%g
-s%@infodir@%$infodir%g
-s%@mandir@%$mandir%g
-s%@CC@%$CC%g
-s%@CPP@%$CPP%g
-s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%g
-s%@INSTALL_DATA@%$INSTALL_DATA%g
-s%@RANLIB@%$RANLIB%g
-s%@LIBTOOL@%$LIBTOOL%g
-s%@O@%$O%g
-s%@A@%$A%g
-s%@LN@%$LN%g
-s%@INSTALL_LIB@%$INSTALL_LIB%g
-s%@MEMORYMGR@%$MEMORYMGR%g
-s%@JPEG_LIB_VERSION@%$JPEG_LIB_VERSION%g
-s%@A2K_DEPS@%$A2K_DEPS%g
-s%@COM_A2K@%$COM_A2K%g
-s%@ANSI2KNRFLAGS@%$ANSI2KNRFLAGS%g
-s%@COM_LT@%$COM_LT%g
-s%@FORCE_INSTALL_LIB@%$FORCE_INSTALL_LIB%g
-s%@INCLUDEFLAGS@%$INCLUDEFLAGS%g
-
-CEOF
-EOF
-
-cat >> $CONFIG_STATUS <<\EOF
-
-# Split the substitutions into bite-sized pieces for seds with
-# small command number limits, like on Digital OSF/1 and HP-UX.
-ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script.
-ac_file=1 # Number of current file.
-ac_beg=1 # First line for current file.
-ac_end=$ac_max_sed_cmds # Line after last line for current file.
-ac_more_lines=:
-ac_sed_cmds=""
-while $ac_more_lines; do
-  if test $ac_beg -gt 1; then
-    sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file
-  else
-    sed "${ac_end}q" conftest.subs > conftest.s$ac_file
-  fi
-  if test ! -s conftest.s$ac_file; then
-    ac_more_lines=false
-    rm -f conftest.s$ac_file
-  else
-    if test -z "$ac_sed_cmds"; then
-      ac_sed_cmds="sed -f conftest.s$ac_file"
-    else
-      ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file"
-    fi
-    ac_file=`expr $ac_file + 1`
-    ac_beg=$ac_end
-    ac_end=`expr $ac_end + $ac_max_sed_cmds`
-  fi
-done
-if test -z "$ac_sed_cmds"; then
-  ac_sed_cmds=cat
-fi
-EOF
-
-cat >> $CONFIG_STATUS <<EOF
-
-CONFIG_FILES=\${CONFIG_FILES-"Makefile:makefile.cfg"}
-EOF
-cat >> $CONFIG_STATUS <<\EOF
-for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then
-  # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
-  case "$ac_file" in
-  *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'`
-       ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;;
-  *) ac_file_in="${ac_file}.in" ;;
-  esac
-
-  # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories.
-
-  # Remove last slash and all that follows it.  Not all systems have dirname.
-  ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'`
-  if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
-    # The file is in a subdirectory.
-    test ! -d "$ac_dir" && mkdir "$ac_dir"
-    ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`"
-    # A "../" for each directory in $ac_dir_suffix.
-    ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'`
-  else
-    ac_dir_suffix= ac_dots=
-  fi
-
-  case "$ac_given_srcdir" in
-  .)  srcdir=.
-      if test -z "$ac_dots"; then top_srcdir=.
-      else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;;
-  /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;;
-  *) # Relative path.
-    srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix"
-    top_srcdir="$ac_dots$ac_given_srcdir" ;;
-  esac
-
-  case "$ac_given_INSTALL" in
-  [/$]*) INSTALL="$ac_given_INSTALL" ;;
-  *) INSTALL="$ac_dots$ac_given_INSTALL" ;;
-  esac
-
-  echo creating "$ac_file"
-  rm -f "$ac_file"
-  configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure."
-  case "$ac_file" in
-  *Makefile*) ac_comsub="1i\\
-# $configure_input" ;;
-  *) ac_comsub= ;;
-  esac
-
-  ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"`
-  sed -e "$ac_comsub
-s%@configure_input@%$configure_input%g
-s%@srcdir@%$srcdir%g
-s%@top_srcdir@%$top_srcdir%g
-s%@INSTALL@%$INSTALL%g
-" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file
-fi; done
-rm -f conftest.s*
-
-# These sed commands are passed to sed as "A NAME B NAME C VALUE D", where
-# NAME is the cpp macro being defined and VALUE is the value it is being given.
-#
-# ac_d sets the value in "#define NAME VALUE" lines.
-ac_dA='s%^\([ 	]*\)#\([ 	]*define[ 	][ 	]*\)'
-ac_dB='\([ 	][ 	]*\)[^ 	]*%\1#\2'
-ac_dC='\3'
-ac_dD='%g'
-# ac_u turns "#undef NAME" with trailing blanks into "#define NAME VALUE".
-ac_uA='s%^\([ 	]*\)#\([ 	]*\)undef\([ 	][ 	]*\)'
-ac_uB='\([ 	]\)%\1#\2define\3'
-ac_uC=' '
-ac_uD='\4%g'
-# ac_e turns "#undef NAME" without trailing blanks into "#define NAME VALUE".
-ac_eA='s%^\([ 	]*\)#\([ 	]*\)undef\([ 	][ 	]*\)'
-ac_eB='$%\1#\2define\3'
-ac_eC=' '
-ac_eD='%g'
-
-if test "${CONFIG_HEADERS+set}" != set; then
-EOF
-cat >> $CONFIG_STATUS <<EOF
-  CONFIG_HEADERS="jconfig.h:jconfig.cfg"
-EOF
-cat >> $CONFIG_STATUS <<\EOF
-fi
-for ac_file in .. $CONFIG_HEADERS; do if test "x$ac_file" != x..; then
-  # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
-  case "$ac_file" in
-  *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'`
-       ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;;
-  *) ac_file_in="${ac_file}.in" ;;
-  esac
-
-  echo creating $ac_file
-
-  rm -f conftest.frag conftest.in conftest.out
-  ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"`
-  cat $ac_file_inputs > conftest.in
-
-EOF
-
-# Transform confdefs.h into a sed script conftest.vals that substitutes
-# the proper values into config.h.in to produce config.h.  And first:
-# Protect against being on the right side of a sed subst in config.status.
-# Protect against being in an unquoted here document in config.status.
-rm -f conftest.vals
-cat > conftest.hdr <<\EOF
-s/[\\&%]/\\&/g
-s%[\\$`]%\\&%g
-s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD}%gp
-s%ac_d%ac_u%gp
-s%ac_u%ac_e%gp
-EOF
-sed -n -f conftest.hdr confdefs.h > conftest.vals
-rm -f conftest.hdr
-
-# This sed command replaces #undef with comments.  This is necessary, for
-# example, in the case of _POSIX_SOURCE, which is predefined and required
-# on some systems where configure will not decide to define it.
-cat >> conftest.vals <<\EOF
-EOF
-
-# Break up conftest.vals because some shells have a limit on
-# the size of here documents, and old seds have small limits too.
-
-rm -f conftest.tail
-while :
-do
-  ac_lines=`grep -c . conftest.vals`
-  # grep -c gives empty output for an empty file on some AIX systems.
-  if test -z "$ac_lines" || test "$ac_lines" -eq 0; then break; fi
-  # Write a limited-size here document to conftest.frag.
-  echo '  cat > conftest.frag <<CEOF' >> $CONFIG_STATUS
-  sed ${ac_max_here_lines}q conftest.vals >> $CONFIG_STATUS
-  echo 'CEOF
-  sed -f conftest.frag conftest.in > conftest.out
-  rm -f conftest.in
-  mv conftest.out conftest.in
-' >> $CONFIG_STATUS
-  sed 1,${ac_max_here_lines}d conftest.vals > conftest.tail
-  rm -f conftest.vals
-  mv conftest.tail conftest.vals
-done
-rm -f conftest.vals
-
-cat >> $CONFIG_STATUS <<\EOF
-  rm -f conftest.frag conftest.h
-  echo "/* $ac_file.  Generated automatically by configure.  */" > conftest.h
-  cat conftest.in >> conftest.h
-  rm -f conftest.in
-  if cmp -s $ac_file conftest.h 2>/dev/null; then
-    echo "$ac_file is unchanged"
-    rm -f conftest.h
-  else
-    # Remove last slash and all that follows it.  Not all systems have dirname.
-      ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'`
-      if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
-      # The file is in a subdirectory.
-      test ! -d "$ac_dir" && mkdir "$ac_dir"
-    fi
-    rm -f $ac_file
-    mv conftest.h $ac_file
-  fi
-fi; done
-
-EOF
-cat >> $CONFIG_STATUS <<EOF
-
-EOF
-cat >> $CONFIG_STATUS <<\EOF
-
-exit 0
-EOF
-chmod +x $CONFIG_STATUS
-rm -fr confdefs* $ac_clean_files
-test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1
-

diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..cd32445
--- /dev/null
+++ b/configure.ac

@@ -0,0 +1,124 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ([2.57])
+AC_INIT([libjpeg], [6.b])
+
+AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
+
+# Always build with prototypes
+AC_DEFINE([HAVE_PROTOTYPES], 1, [Define if your compiler supports prototypes])
+# Don't use undefined types
+AC_DEFINE([INCOMPLETE_TYPES_BROKEN], 1, [Define if you want use complete types])
+
+# Checks for programs.
+AC_PROG_CPP
+AC_PROG_CC
+AC_PROG_CXX
+AC_PROG_INSTALL
+AC_PROG_LIBTOOL
+AC_PROG_LN_S
+
+# Checks for libraries.
+
+# Checks for header files.
+AC_HEADER_STDC
+AC_CHECK_HEADERS([stddef.h stdlib.h string.h])
+AC_CHECK_HEADER([sys/types.h], AC_DEFINE([NEED_SYS_TYPES_H], 1, [Define if you have sys/types.h]))
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_C_CHAR_UNSIGNED
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_CHECK_TYPES([unsigned char, unsigned short])
+
+AC_MSG_CHECKING([if right shift is signed])
+AC_TRY_RUN(
+	[#include <stdio.h>
+	 int is_shifting_signed (long arg) {
+	 long res = arg >> 4;
+
+	 if (res == -0x7F7E80CL)
+		return 1; /* right shift is signed */
+
+	 /* see if unsigned-shift hack will fix it. */
+	 /* we can't just test exact value since it depends on width of long... */
+	 res |= (~0L) << (32-4);
+	 if (res == -0x7F7E80CL)
+		return 0; /* right shift is unsigned */
+
+	 printf("Right shift isn't acting as I expect it to.\n");
+	 printf("I fear the JPEG software will not work at all.\n\n");
+	 return 0; /* try it with unsigned anyway */
+	 }
+	 int main (void) {
+		exit(is_shifting_signed(-0x7F7E80B1L));
+	 }],
+	[AC_MSG_RESULT(no)
+	 AC_DEFINE([RIGHT_SHIFT_IS_UNSIGNED], 1, [Define if shift is unsigned])],
+	[AC_MSG_RESULT(yes)],
+	[AC_MSG_RESULT(Assuming that right shift is signed on target machine.)])
+
+# test whether global names are unique to at least 15 chars
+AC_MSG_CHECKING([for short external names])
+AC_TRY_LINK(
+	[int possibly_duplicate_function () { return 0; }
+	 int possibly_dupli_function () { return 1; }], [ ],
+	[AC_MSG_RESULT(ok)],
+	[AC_MSG_RESULT(short)
+	 AC_DEFINE([NEED_SHORT_EXTERNAL_NAMES], 1, [Define if you need short function names])])
+
+# Checks for library functions.
+AC_CHECK_FUNCS([memset memcpy], [],
+	[AC_DEFINE([NEED_BSD_STRINGS], 1,
+		   [Define if you have BSD-like bzero and bcopy])])
+
+# Set flags to indicate platform
+case "$host_os" in
+  cygwin* | mingw* | pw32* | interix*)
+    is_win32=1
+  ;;
+esac
+AM_CONDITIONAL([IS_WIN32], [test "x$is_win32" = "x1"])
+
+# SIMD is optional
+AC_ARG_WITH([simd],
+    AC_HELP_STRING([--without-simd],[Omit accelerated SIMD routines.]))
+if test "x${with_simd}" != "xno"; then
+  # Check if we're on a supported CPU
+  AC_MSG_CHECKING([if we have SIMD optimisations for cpu type])
+  case "$host_cpu" in
+    x86_64)
+      AC_MSG_RESULT([yes (x86_64)])
+      AC_PROG_NASM
+      simd_arch=x86_64
+    ;;
+    i*86 | x86 | ia32)
+      AC_MSG_RESULT([yes (i386)])
+      AC_PROG_NASM
+      simd_arch=i386
+    ;;
+    *)
+      AC_MSG_RESULT([no ("$host_cpu")])
+      AC_MSG_ERROR([CPU is not supported])
+    ;;
+  esac
+
+  if test "x${with_simd}" != "xno"; then
+    AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
+  fi
+fi
+
+AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
+AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
+AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
+
+# jconfig.h is the file we use, but we have another before that to
+# fool autoheader. the reason is that we include this header in our
+# API headers, which can screw things up for users of the lib.
+# jconfig.h is a minimal version that allows this package to be built
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_HEADERS([jconfig.h])
+AC_CONFIG_FILES([Makefile simd/Makefile])
+AC_OUTPUT

diff --git a/djpeg.1 b/djpeg.1
deleted file mode 100644
index 11beb6a..0000000
--- a/djpeg.1
+++ /dev/null

@@ -1,253 +0,0 @@
-.TH DJPEG 1 "22 August 1997"
-.SH NAME
-djpeg \- decompress a JPEG file to an image file
-.SH SYNOPSIS
-.B djpeg
-[
-.I options
-]
-[
-.I filename
-]
-.LP
-.SH DESCRIPTION
-.LP
-.B djpeg
-decompresses the named JPEG file, or the standard input if no file is named,
-and produces an image file on the standard output.  PBMPLUS (PPM/PGM), BMP,
-GIF, Targa, or RLE (Utah Raster Toolkit) output format can be selected.
-(RLE is supported only if the URT library is available.)
-.SH OPTIONS
-All switch names may be abbreviated; for example,
-.B \-grayscale
-may be written
-.B \-gray
-or
-.BR \-gr .
-Most of the "basic" switches can be abbreviated to as little as one letter.
-Upper and lower case are equivalent (thus
-.B \-BMP
-is the same as
-.BR \-bmp ).
-British spellings are also accepted (e.g.,
-.BR \-greyscale ),
-though for brevity these are not mentioned below.
-.PP
-The basic switches are:
-.TP
-.BI \-colors " N"
-Reduce image to at most N colors.  This reduces the number of colors used in
-the output image, so that it can be displayed on a colormapped display or
-stored in a colormapped file format.  For example, if you have an 8-bit
-display, you'd need to reduce to 256 or fewer colors.
-.TP
-.BI \-quantize " N"
-Same as
-.BR \-colors .
-.B \-colors
-is the recommended name,
-.B \-quantize
-is provided only for backwards compatibility.
-.TP
-.B \-fast
-Select recommended processing options for fast, low quality output.  (The
-default options are chosen for highest quality output.)  Currently, this is
-equivalent to \fB\-dct fast \-nosmooth \-onepass \-dither ordered\fR.
-.TP
-.B \-grayscale
-Force gray-scale output even if JPEG file is color.  Useful for viewing on
-monochrome displays; also,
-.B djpeg
-runs noticeably faster in this mode.
-.TP
-.BI \-scale " M/N"
-Scale the output image by a factor M/N.  Currently the scale factor must be
-1/1, 1/2, 1/4, or 1/8.  Scaling is handy if the image is larger than your
-screen; also,
-.B djpeg
-runs much faster when scaling down the output.
-.TP
-.B \-bmp
-Select BMP output format (Windows flavor).  8-bit colormapped format is
-emitted if
-.B \-colors
-or
-.B \-grayscale
-is specified, or if the JPEG file is gray-scale; otherwise, 24-bit full-color
-format is emitted.
-.TP
-.B \-gif
-Select GIF output format.  Since GIF does not support more than 256 colors,
-.B \-colors 256
-is assumed (unless you specify a smaller number of colors).
-.TP
-.B \-os2
-Select BMP output format (OS/2 1.x flavor).  8-bit colormapped format is
-emitted if
-.B \-colors
-or
-.B \-grayscale
-is specified, or if the JPEG file is gray-scale; otherwise, 24-bit full-color
-format is emitted.
-.TP
-.B \-pnm
-Select PBMPLUS (PPM/PGM) output format (this is the default format).
-PGM is emitted if the JPEG file is gray-scale or if
-.B \-grayscale
-is specified; otherwise PPM is emitted.
-.TP
-.B \-rle
-Select RLE output format.  (Requires URT library.)
-.TP
-.B \-targa
-Select Targa output format.  Gray-scale format is emitted if the JPEG file is
-gray-scale or if
-.B \-grayscale
-is specified; otherwise, colormapped format is emitted if
-.B \-colors
-is specified; otherwise, 24-bit full-color format is emitted.
-.PP
-Switches for advanced users:
-.TP
-.B \-dct int
-Use integer DCT method (default).
-.TP
-.B \-dct fast
-Use fast integer DCT (less accurate).
-.TP
-.B \-dct float
-Use floating-point DCT method.
-The float method is very slightly more accurate than the int method, but is
-much slower unless your machine has very fast floating-point hardware.  Also
-note that results of the floating-point method may vary slightly across
-machines, while the integer methods should give the same results everywhere.
-The fast integer method is much less accurate than the other two.
-.TP
-.B \-dither fs
-Use Floyd-Steinberg dithering in color quantization.
-.TP
-.B \-dither ordered
-Use ordered dithering in color quantization.
-.TP
-.B \-dither none
-Do not use dithering in color quantization.
-By default, Floyd-Steinberg dithering is applied when quantizing colors; this
-is slow but usually produces the best results.  Ordered dither is a compromise
-between speed and quality; no dithering is fast but usually looks awful.  Note
-that these switches have no effect unless color quantization is being done.
-Ordered dither is only available in
-.B \-onepass
-mode.
-.TP
-.BI \-map " file"
-Quantize to the colors used in the specified image file.  This is useful for
-producing multiple files with identical color maps, or for forcing a
-predefined set of colors to be used.  The
-.I file
-must be a GIF or PPM file. This option overrides
-.B \-colors
-and
-.BR \-onepass .
-.TP
-.B \-nosmooth
-Use a faster, lower-quality upsampling routine.
-.TP
-.B \-onepass
-Use one-pass instead of two-pass color quantization.  The one-pass method is
-faster and needs less memory, but it produces a lower-quality image.
-.B \-onepass
-is ignored unless you also say
-.B \-colors
-.IR N .
-Also, the one-pass method is always used for gray-scale output (the two-pass
-method is no improvement then).
-.TP
-.BI \-maxmemory " N"
-Set limit for amount of memory to use in processing large images.  Value is
-in thousands of bytes, or millions of bytes if "M" is attached to the
-number.  For example,
-.B \-max 4m
-selects 4000000 bytes.  If more space is needed, temporary files will be used.
-.TP
-.BI \-outfile " name"
-Send output image to the named file, not to standard output.
-.TP
-.B \-verbose
-Enable debug printout.  More
-.BR \-v 's
-give more output.  Also, version information is printed at startup.
-.TP
-.B \-debug
-Same as
-.BR \-verbose .
-.SH EXAMPLES
-.LP
-This example decompresses the JPEG file foo.jpg, quantizes it to
-256 colors, and saves the output in 8-bit BMP format in foo.bmp:
-.IP
-.B djpeg \-colors 256 \-bmp
-.I foo.jpg
-.B >
-.I foo.bmp
-.SH HINTS
-To get a quick preview of an image, use the
-.B \-grayscale
-and/or
-.B \-scale
-switches.
-.B \-grayscale \-scale 1/8
-is the fastest case.
-.PP
-Several options are available that trade off image quality to gain speed.
-.B \-fast
-turns on the recommended settings.
-.PP
-.B \-dct fast
-and/or
-.B \-nosmooth
-gain speed at a small sacrifice in quality.
-When producing a color-quantized image,
-.B \-onepass \-dither ordered
-is fast but much lower quality than the default behavior.
-.B \-dither none
-may give acceptable results in two-pass mode, but is seldom tolerable in
-one-pass mode.
-.PP
-If you are fortunate enough to have very fast floating point hardware,
-\fB\-dct float\fR may be even faster than \fB\-dct fast\fR.  But on most
-machines \fB\-dct float\fR is slower than \fB\-dct int\fR; in this case it is
-not worth using, because its theoretical accuracy advantage is too small to be
-significant in practice.
-.SH ENVIRONMENT
-.TP
-.B JPEGMEM
-If this environment variable is set, its value is the default memory limit.
-The value is specified as described for the
-.B \-maxmemory
-switch.
-.B JPEGMEM
-overrides the default value specified when the program was compiled, and
-itself is overridden by an explicit
-.BR \-maxmemory .
-.SH SEE ALSO
-.BR cjpeg (1),
-.BR jpegtran (1),
-.BR rdjpgcom (1),
-.BR wrjpgcom (1)
-.br
-.BR ppm (5),
-.BR pgm (5)
-.br
-Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
-Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
-.SH AUTHOR
-Independent JPEG Group
-.SH BUGS
-Arithmetic coding is not supported for legal reasons.
-.PP
-To avoid the Unisys LZW patent,
-.B djpeg
-produces uncompressed GIF files.  These are larger than they should be, but
-are readable by standard GIF decoders.
-.PP
-Still not as fast as we'd like.

diff --git a/install-sh b/install-sh
deleted file mode 100755
index e843669..0000000
--- a/install-sh
+++ /dev/null

@@ -1,250 +0,0 @@
-#!/bin/sh
-#
-# install - install a program, script, or datafile
-# This comes from X11R5 (mit/util/scripts/install.sh).
-#
-# Copyright 1991 by the Massachusetts Institute of Technology
-#
-# Permission to use, copy, modify, distribute, and sell this software and its
-# documentation for any purpose is hereby granted without fee, provided that
-# the above copyright notice appear in all copies and that both that
-# copyright notice and this permission notice appear in supporting
-# documentation, and that the name of M.I.T. not be used in advertising or
-# publicity pertaining to distribution of the software without specific,
-# written prior permission.  M.I.T. makes no representations about the
-# suitability of this software for any purpose.  It is provided "as is"
-# without express or implied warranty.
-#
-# Calling this script install-sh is preferred over install.sh, to prevent
-# `make' implicit rules from creating a file called install from it
-# when there is no Makefile.
-#
-# This script is compatible with the BSD install script, but was written
-# from scratch.  It can only install one file at a time, a restriction
-# shared with many OS's install programs.
-
-
-# set DOITPROG to echo to test this script
-
-# Don't use :- since 4.3BSD and earlier shells don't like it.
-doit="${DOITPROG-}"
-
-
-# put in absolute paths if you don't have them in your path; or use env. vars.
-
-mvprog="${MVPROG-mv}"
-cpprog="${CPPROG-cp}"
-chmodprog="${CHMODPROG-chmod}"
-chownprog="${CHOWNPROG-chown}"
-chgrpprog="${CHGRPPROG-chgrp}"
-stripprog="${STRIPPROG-strip}"
-rmprog="${RMPROG-rm}"
-mkdirprog="${MKDIRPROG-mkdir}"
-
-transformbasename=""
-transform_arg=""
-instcmd="$mvprog"
-chmodcmd="$chmodprog 0755"
-chowncmd=""
-chgrpcmd=""
-stripcmd=""
-rmcmd="$rmprog -f"
-mvcmd="$mvprog"
-src=""
-dst=""
-dir_arg=""
-
-while [ x"$1" != x ]; do
-    case $1 in
-	-c) instcmd="$cpprog"
-	    shift
-	    continue;;
-
-	-d) dir_arg=true
-	    shift
-	    continue;;
-
-	-m) chmodcmd="$chmodprog $2"
-	    shift
-	    shift
-	    continue;;
-
-	-o) chowncmd="$chownprog $2"
-	    shift
-	    shift
-	    continue;;
-
-	-g) chgrpcmd="$chgrpprog $2"
-	    shift
-	    shift
-	    continue;;
-
-	-s) stripcmd="$stripprog"
-	    shift
-	    continue;;
-
-	-t=*) transformarg=`echo $1 | sed 's/-t=//'`
-	    shift
-	    continue;;
-
-	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
-	    shift
-	    continue;;
-
-	*)  if [ x"$src" = x ]
-	    then
-		src=$1
-	    else
-		# this colon is to work around a 386BSD /bin/sh bug
-		:
-		dst=$1
-	    fi
-	    shift
-	    continue;;
-    esac
-done
-
-if [ x"$src" = x ]
-then
-	echo "install:	no input file specified"
-	exit 1
-else
-	true
-fi
-
-if [ x"$dir_arg" != x ]; then
-	dst=$src
-	src=""
-	
-	if [ -d $dst ]; then
-		instcmd=:
-	else
-		instcmd=mkdir
-	fi
-else
-
-# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
-# might cause directories to be created, which would be especially bad 
-# if $src (and thus $dsttmp) contains '*'.
-
-	if [ -f $src -o -d $src ]
-	then
-		true
-	else
-		echo "install:  $src does not exist"
-		exit 1
-	fi
-	
-	if [ x"$dst" = x ]
-	then
-		echo "install:	no destination specified"
-		exit 1
-	else
-		true
-	fi
-
-# If destination is a directory, append the input filename; if your system
-# does not like double slashes in filenames, you may need to add some logic
-
-	if [ -d $dst ]
-	then
-		dst="$dst"/`basename $src`
-	else
-		true
-	fi
-fi
-
-## this sed command emulates the dirname command
-dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
-
-# Make sure that the destination directory exists.
-#  this part is taken from Noah Friedman's mkinstalldirs script
-
-# Skip lots of stat calls in the usual case.
-if [ ! -d "$dstdir" ]; then
-defaultIFS='	
-'
-IFS="${IFS-${defaultIFS}}"
-
-oIFS="${IFS}"
-# Some sh's can't handle IFS=/ for some reason.
-IFS='%'
-set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
-IFS="${oIFS}"
-
-pathcomp=''
-
-while [ $# -ne 0 ] ; do
-	pathcomp="${pathcomp}${1}"
-	shift
-
-	if [ ! -d "${pathcomp}" ] ;
-        then
-		$mkdirprog "${pathcomp}"
-	else
-		true
-	fi
-
-	pathcomp="${pathcomp}/"
-done
-fi
-
-if [ x"$dir_arg" != x ]
-then
-	$doit $instcmd $dst &&
-
-	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
-	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
-	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
-	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
-else
-
-# If we're going to rename the final executable, determine the name now.
-
-	if [ x"$transformarg" = x ] 
-	then
-		dstfile=`basename $dst`
-	else
-		dstfile=`basename $dst $transformbasename | 
-			sed $transformarg`$transformbasename
-	fi
-
-# don't allow the sed command to completely eliminate the filename
-
-	if [ x"$dstfile" = x ] 
-	then
-		dstfile=`basename $dst`
-	else
-		true
-	fi
-
-# Make a temp file name in the proper directory.
-
-	dsttmp=$dstdir/#inst.$$#
-
-# Move or copy the file name to the temp name
-
-	$doit $instcmd $src $dsttmp &&
-
-	trap "rm -f ${dsttmp}" 0 &&
-
-# and set any options; do chmod last to preserve setuid bits
-
-# If any of these fail, we abort the whole thing.  If we want to
-# ignore errors from any of these, just make sure not to ignore
-# errors from the above "$doit $instcmd $src $dsttmp" command.
-
-	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
-	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
-	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
-	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
-
-# Now rename the file to the real destination.
-
-	$doit $rmcmd -f $dstdir/$dstfile &&
-	$doit $mvcmd $dsttmp $dstdir/$dstfile 
-
-fi &&
-
-
-exit 0

diff --git a/jccolor.c b/jccolor.c
index 0a8a4b5..2e2bfd2 100644
--- a/jccolor.c
+++ b/jccolor.c

@@ -2,6 +2,8 @@
  * jccolor.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -11,6 +13,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 
 /* Private subobject */
@@ -78,6 +81,74 @@
 #define TABLE_SIZE	(8*(MAXJSAMPLE+1))
 
 
+#if BITS_IN_JSAMPLE == 8
+
+const unsigned char red_lut[256] = {
+  0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
+  5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 ,
+  10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14,
+  14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
+  19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24,
+  24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28,
+  29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33,
+  33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38,
+  38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43,
+  43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48,
+  48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52,
+  53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57,
+  57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62,
+  62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67,
+  67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71,
+  72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76
+};
+
+const unsigned char green_lut[256] = {
+  0  , 1  , 1  , 2  , 2  , 3  , 4  , 4  , 5  , 5  , 6  , 6  ,
+  7  , 8  , 8  , 9  , 9  , 10 , 11 , 11 , 12 , 12 , 13 , 14 ,
+  14 , 15 , 15 , 16 , 16 , 17 , 18 , 18 , 19 , 19 , 20 , 21 ,
+  21 , 22 , 22 , 23 , 23 , 24 , 25 , 25 , 26 , 26 , 27 , 28 ,
+  28 , 29 , 29 , 30 , 31 , 31 , 32 , 32 , 33 , 33 , 34 , 35 ,
+  35 , 36 , 36 , 37 , 38 , 38 , 39 , 39 , 40 , 41 , 41 , 42 ,
+  42 , 43 , 43 , 44 , 45 , 45 , 46 , 46 , 47 , 48 , 48 , 49 ,
+  49 , 50 , 50 , 51 , 52 , 52 , 53 , 53 , 54 , 55 , 55 , 56 ,
+  56 , 57 , 58 , 58 , 59 , 59 , 60 , 60 , 61 , 62 , 62 , 63 ,
+  63 , 64 , 65 , 65 , 66 , 66 , 67 , 68 , 68 , 69 , 69 , 70 ,
+  70 , 71 , 72 , 72 , 73 , 73 , 74 , 75 , 75 , 76 , 76 , 77 ,
+  77 , 78 , 79 , 79 , 80 , 80 , 81 , 82 , 82 , 83 , 83 , 84 ,
+  85 , 85 , 86 , 86 , 87 , 87 , 88 , 89 , 89 , 90 , 90 , 91 ,
+  92 , 92 , 93 , 93 , 94 , 95 , 95 , 96 , 96 , 97 , 97 , 98 ,
+  99 , 99 , 100, 100, 101, 102, 102, 103, 103, 104, 104, 105,
+  106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112,
+  113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119,
+  120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126,
+  127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133,
+  34, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140,
+  141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147,
+  148, 149, 149, 150
+};
+
+const unsigned char blue_lut[256] = {
+  0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
+  2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 ,
+  4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
+  5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 ,
+  7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 ,
+  9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+  11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+  18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+  20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+  22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24,
+  24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+  26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
+  27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29
+};
+
+#endif
+
+
 /*
  * Initialize for RGB->YCC colorspace conversion.
  */
@@ -146,10 +217,10 @@
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
+      r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]);
+      g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]);
+      b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]);
+      inptr += rgb_pixelsize[cinfo->in_color_space];
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
        * Hence the value being shifted is never negative, and we don't
@@ -188,26 +259,36 @@
 		  JDIMENSION output_row, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
+  #if BITS_IN_JSAMPLE != 8
   register INT32 * ctab = cconvert->rgb_ycc_tab;
+  #endif
   register JSAMPROW inptr;
   register JSAMPROW outptr;
+  JSAMPLE *maxoutptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->image_width;
+  int rindex = rgb_red[cinfo->in_color_space];
+  int gindex = rgb_green[cinfo->in_color_space];
+  int bindex = rgb_blue[cinfo->in_color_space];
+  int rgbstride = rgb_pixelsize[cinfo->in_color_space];
 
   while (--num_rows >= 0) {
     inptr = *input_buf++;
     outptr = output_buf[0][output_row];
+    maxoutptr = &outptr[num_cols];
     output_row++;
-    for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
+    for (; outptr < maxoutptr; outptr++, inptr += rgbstride) {
       /* Y */
-      outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+      #if BITS_IN_JSAMPLE == 8
+      *outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]]
+	    + blue_lut[inptr[bindex]];
+      #else
+      *outptr = (JSAMPLE)
+	    ((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF]
+	     + ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF]
+	     + ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF])
+	     >> SCALEBITS);
+      #endif
     }
   }
 }
@@ -368,11 +449,15 @@
     break;
 
   case JCS_RGB:
-#if RGB_PIXELSIZE != 3
-    if (cinfo->input_components != RGB_PIXELSIZE)
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    if (cinfo->input_components != rgb_pixelsize[cinfo->in_color_space])
       ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     break;
-#endif /* else share code with YCbCr */
 
   case JCS_YCbCr:
     if (cinfo->input_components != 3)
@@ -398,7 +483,13 @@
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     if (cinfo->in_color_space == JCS_GRAYSCALE)
       cconvert->pub.color_convert = grayscale_convert;
-    else if (cinfo->in_color_space == JCS_RGB) {
+    else if (cinfo->in_color_space == JCS_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGBX ||
+             cinfo->in_color_space == JCS_EXT_BGR ||
+             cinfo->in_color_space == JCS_EXT_BGRX ||
+             cinfo->in_color_space == JCS_EXT_XBGR ||
+             cinfo->in_color_space == JCS_EXT_XRGB) {
       cconvert->pub.start_pass = rgb_ycc_start;
       cconvert->pub.color_convert = rgb_gray_convert;
     } else if (cinfo->in_color_space == JCS_YCbCr)
@@ -408,9 +499,16 @@
     break;
 
   case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3)
+    if (cinfo->in_color_space == cinfo->jpeg_color_space &&
+      rgb_pixelsize[cinfo->in_color_space] == 3)
       cconvert->pub.color_convert = null_convert;
     else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
@@ -419,9 +517,19 @@
   case JCS_YCbCr:
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB) {
-      cconvert->pub.start_pass = rgb_ycc_start;
-      cconvert->pub.color_convert = rgb_ycc_convert;
+    if (cinfo->in_color_space == JCS_RGB ||
+        cinfo->in_color_space == JCS_EXT_RGB ||
+        cinfo->in_color_space == JCS_EXT_RGBX ||
+        cinfo->in_color_space == JCS_EXT_BGR ||
+        cinfo->in_color_space == JCS_EXT_BGRX ||
+        cinfo->in_color_space == JCS_EXT_XBGR ||
+        cinfo->in_color_space == JCS_EXT_XRGB) {
+      if (jsimd_can_rgb_ycc())
+        cconvert->pub.color_convert = jsimd_rgb_ycc_convert;
+      else {
+        cconvert->pub.start_pass = rgb_ycc_start;
+        cconvert->pub.color_convert = rgb_ycc_convert;
+      }
     } else if (cinfo->in_color_space == JCS_YCbCr)
       cconvert->pub.color_convert = null_convert;
     else

diff --git a/jcdctmgr.c b/jcdctmgr.c
index 61fa79b..156957a 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c

@@ -2,6 +2,8 @@
  * jcdctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -15,15 +17,35 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jsimddct.h"
 
 
 /* Private subobject for this module */
 
+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+
+typedef JMETHOD(void, convsamp_method_ptr,
+                (JSAMPARRAY sample_data, JDIMENSION start_col,
+                 DCTELEM * workspace));
+typedef JMETHOD(void, float_convsamp_method_ptr,
+                (JSAMPARRAY sample_data, JDIMENSION start_col,
+                 FAST_FLOAT *workspace));
+
+typedef JMETHOD(void, quantize_method_ptr,
+                (JCOEFPTR coef_block, DCTELEM * divisors,
+                 DCTELEM * workspace));
+typedef JMETHOD(void, float_quantize_method_ptr,
+                (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                 FAST_FLOAT * workspace));
+
 typedef struct {
   struct jpeg_forward_dct pub;	/* public fields */
 
   /* Pointer to the DCT routine actually in use */
-  forward_DCT_method_ptr do_dct;
+  forward_DCT_method_ptr dct;
+  convsamp_method_ptr convsamp;
+  quantize_method_ptr quantize;
 
   /* The actual post-DCT divisors --- not identical to the quant table
    * entries, because of scaling (especially for an unnormalized DCT).
@@ -31,10 +53,16 @@
    */
   DCTELEM * divisors[NUM_QUANT_TBLS];
 
+  /* work area for FDCT subroutine */
+  DCTELEM * workspace;
+
 #ifdef DCT_FLOAT_SUPPORTED
   /* Same as above for the floating-point case. */
-  float_DCT_method_ptr do_float_dct;
+  float_DCT_method_ptr float_dct;
+  float_convsamp_method_ptr float_convsamp;
+  float_quantize_method_ptr float_quantize;
   FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
+  FAST_FLOAT * float_workspace;
 #endif
 } my_fdct_controller;
 
@@ -42,6 +70,128 @@
 
 
 /*
+ * Find the highest bit in an integer through binary search.
+ */
+LOCAL(int)
+flss (UINT16 val)
+{
+  int bit;
+
+  bit = 16;
+
+  if (!val)
+    return 0;
+
+  if (!(val & 0xff00)) {
+    bit -= 8;
+    val <<= 8;
+  }
+  if (!(val & 0xf000)) {
+    bit -= 4;
+    val <<= 4;
+  }
+  if (!(val & 0xc000)) {
+    bit -= 2;
+    val <<= 2;
+  }
+  if (!(val & 0x8000)) {
+    bit -= 1;
+    val <<= 1;
+  }
+
+  return bit;
+}
+
+/*
+ * Compute values to do a division using reciprocal.
+ *
+ * This implementation is based on an algorithm described in
+ *   "How to optimize for the Pentium family of microprocessors"
+ *   (http://www.agner.org/assem/).
+ * More information about the basic algorithm can be found in
+ * the paper "Integer Division Using Reciprocals" by Robert Alverson.
+ *
+ * The basic idea is to replace x/d by x * d^-1. In order to store
+ * d^-1 with enough precision we shift it left a few places. It turns
+ * out that this algoright gives just enough precision, and also fits
+ * into DCTELEM:
+ *
+ *   b = (the number of significant bits in divisor) - 1
+ *   r = (word size) + b
+ *   f = 2^r / divisor
+ *
+ * f will not be an integer for most cases, so we need to compensate
+ * for the rounding error introduced:
+ *
+ *   no fractional part:
+ *
+ *       result = input >> r
+ *
+ *   fractional part of f < 0.5:
+ *
+ *       round f down to nearest integer
+ *       result = ((input + 1) * f) >> r
+ *
+ *   fractional part of f > 0.5:
+ *
+ *       round f up to nearest integer
+ *       result = (input * f) >> r
+ *
+ * This is the original algorithm that gives truncated results. But we
+ * want properly rounded results, so we replace "input" with
+ * "input + divisor/2".
+ *
+ * In order to allow SIMD implementations we also tweak the values to
+ * allow the same calculation to be made at all times:
+ * 
+ *   dctbl[0] = f rounded to nearest integer
+ *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
+ *   dctbl[2] = 1 << ((word size) * 2 - r)
+ *   dctbl[3] = r - (word size)
+ *
+ * dctbl[2] is for stupid instruction sets where the shift operation
+ * isn't member wise (e.g. MMX).
+ *
+ * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
+ * is that most SIMD implementations have a "multiply and store top
+ * half" operation.
+ *
+ * Lastly, we store each of the values in their own table instead
+ * of in a consecutive manner, yet again in order to allow SIMD
+ * routines.
+ */
+LOCAL(void)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+{
+  UDCTELEM2 fq, fr;
+  UDCTELEM c;
+  int b, r;
+
+  b = flss(divisor) - 1;
+  r  = sizeof(DCTELEM) * 8 + b;
+
+  fq = ((UDCTELEM2)1 << r) / divisor;
+  fr = ((UDCTELEM2)1 << r) % divisor;
+
+  c = divisor / 2; /* for rounding */
+
+  if (fr == 0) { /* divisor is power of two */
+    /* fq will be one bit too large to fit in DCTELEM, so adjust */
+    fq >>= 1;
+    r--;
+  } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
+    c++;
+  } else { /* fractional part is > 0.5 */
+    fq++;
+  }
+
+  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+}
+
+/*
  * Initialize for a processing pass.
  * Verify that all referenced Q-tables are present, and set up
  * the divisor table for each one.
@@ -78,11 +228,11 @@
       if (fdct->divisors[qtblno] == NULL) {
 	fdct->divisors[qtblno] = (DCTELEM *)
 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      DCTSIZE2 * SIZEOF(DCTELEM));
+				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
       }
       dtbl = fdct->divisors[qtblno];
       for (i = 0; i < DCTSIZE2; i++) {
-	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+	compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
       }
       break;
 #endif
@@ -112,14 +262,14 @@
 	if (fdct->divisors[qtblno] == NULL) {
 	  fdct->divisors[qtblno] = (DCTELEM *)
 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-					DCTSIZE2 * SIZEOF(DCTELEM));
+					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
 	}
 	dtbl = fdct->divisors[qtblno];
 	for (i = 0; i < DCTSIZE2; i++) {
-	  dtbl[i] = (DCTELEM)
+	  compute_reciprocal(
 	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
 				  (INT32) aanscales[i]),
-		    CONST_BITS-3);
+		    CONST_BITS-3), &dtbl[i]);
 	}
       }
       break;
@@ -169,6 +319,77 @@
 
 
 /*
+ * Load data into workspace, applying unsigned->signed conversion.
+ */
+
+METHODDEF(void)
+convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
+{
+  register DCTELEM *workspaceptr;
+  register JSAMPROW elemptr;
+  register int elemr;
+
+  workspaceptr = workspace;
+  for (elemr = 0; elemr < DCTSIZE; elemr++) {
+    elemptr = sample_data[elemr] + start_col;
+
+#if DCTSIZE == 8		/* unroll the inner loop */
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+#else
+    {
+      register int elemc;
+      for (elemc = DCTSIZE; elemc > 0; elemc--)
+        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    }
+#endif
+  }
+}
+
+
+/*
+ * Quantize/descale the coefficients, and store into coef_blocks[].
+ */
+
+METHODDEF(void)
+quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
+{
+  int i;
+  DCTELEM temp;
+  UDCTELEM recip, corr, shift;
+  UDCTELEM2 product;
+  JCOEFPTR output_ptr = coef_block;
+
+  for (i = 0; i < DCTSIZE2; i++) {
+    temp = workspace[i];
+    recip = divisors[i + DCTSIZE2 * 0];
+    corr =  divisors[i + DCTSIZE2 * 1];
+    shift = divisors[i + DCTSIZE2 * 3];
+
+    if (temp < 0) {
+      temp = -temp;
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + sizeof(DCTELEM)*8;
+      temp = product;
+      temp = -temp;
+    } else {
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + sizeof(DCTELEM)*8;
+      temp = product;
+    }
+
+    output_ptr[i] = (JCOEF) temp;
+  }
+}
+
+
+/*
  * Perform forward DCT on one or more blocks of a component.
  *
  * The input samples are taken from the sample_data[] array starting at
@@ -185,87 +406,87 @@
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
   DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
+  DCTELEM * workspace;
   JDIMENSION bi;
 
+  /* Make sure the compiler doesn't look up these every pass */
+  forward_DCT_method_ptr do_dct = fdct->dct;
+  convsamp_method_ptr do_convsamp = fdct->convsamp;
+  quantize_method_ptr do_quantize = fdct->quantize;
+  workspace = fdct->workspace;
+
   sample_data += start_row;	/* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	  }
-	}
-#endif
-      }
-    }
+    (*do_convsamp) (sample_data, start_col, workspace);
 
     /* Perform the DCT */
     (*do_dct) (workspace);
 
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register DCTELEM temp, qval;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	qval = divisors[i];
-	temp = workspace[i];
-	/* Divide the coefficient value by qval, ensuring proper rounding.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 *
-	 * In most files, at least half of the output values will be zero
-	 * (at default quantization settings, more like three-quarters...)
-	 * so we should ensure that this case is fast.  On many machines,
-	 * a comparison is enough cheaper than a divide to make a special test
-	 * a win.  Since both inputs will be nonnegative, we need only test
-	 * for a < b to discover whether a/b is 0.
-	 * If your machine's division is fast enough, define FAST_DIVIDE.
-	 */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
-#else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
-	if (temp < 0) {
-	  temp = -temp;
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	  temp = -temp;
-	} else {
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	}
-	output_ptr[i] = (JCOEF) temp;
-      }
-    }
+    (*do_quantize) (coef_blocks[bi], divisors, workspace);
   }
 }
 
 
 #ifdef DCT_FLOAT_SUPPORTED
 
+
+METHODDEF(void)
+convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
+{
+  register FAST_FLOAT *workspaceptr;
+  register JSAMPROW elemptr;
+  register int elemr;
+
+  workspaceptr = workspace;
+  for (elemr = 0; elemr < DCTSIZE; elemr++) {
+    elemptr = sample_data[elemr] + start_col;
+#if DCTSIZE == 8		/* unroll the inner loop */
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+#else
+    {
+      register int elemc;
+      for (elemc = DCTSIZE; elemc > 0; elemc--)
+        *workspaceptr++ = (FAST_FLOAT)
+                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    }
+#endif
+  }
+}
+
+
+METHODDEF(void)
+quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
+{
+  register FAST_FLOAT temp;
+  register int i;
+  register JCOEFPTR output_ptr = coef_block;
+
+  for (i = 0; i < DCTSIZE2; i++) {
+    /* Apply the quantization and scaling factor */
+    temp = workspace[i] * divisors[i];
+
+    /* Round to nearest integer.
+     * Since C does not specify the direction of rounding for negative
+     * quotients, we have to force the dividend positive for portability.
+     * The maximum coefficient size is +-16K (for 12-bit data), so this
+     * code should work for either 16-bit or 32-bit ints.
+     */
+    output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+  }
+}
+
+
 METHODDEF(void)
 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
@@ -275,62 +496,28 @@
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
   FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+  FAST_FLOAT * workspace;
   JDIMENSION bi;
 
+
+  /* Make sure the compiler doesn't look up these every pass */
+  float_DCT_method_ptr do_dct = fdct->float_dct;
+  float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
+  float_quantize_method_ptr do_quantize = fdct->float_quantize;
+  workspace = fdct->float_workspace;
+
   sample_data += start_row;	/* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = (FAST_FLOAT)
-	      (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	  }
-	}
-#endif
-      }
-    }
+    (*do_convsamp) (sample_data, start_col, workspace);
 
     /* Perform the DCT */
     (*do_dct) (workspace);
 
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register FAST_FLOAT temp;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	/* Apply the quantization and scaling factor */
-	temp = workspace[i] * divisors[i];
-	/* Round to nearest integer.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 * The maximum coefficient size is +-16K (for 12-bit data), so this
-	 * code should work for either 16-bit or 32-bit ints.
-	 */
-	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
-      }
-    }
+    (*do_quantize) (coef_blocks[bi], divisors, workspace);
   }
 }
 
@@ -353,23 +540,33 @@
   cinfo->fdct = (struct jpeg_forward_dct *) fdct;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
+  /* First determine the DCT... */
   switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
   case JDCT_ISLOW:
     fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
+    if (jsimd_can_fdct_islow())
+      fdct->dct = jsimd_fdct_islow;
+    else
+      fdct->dct = jpeg_fdct_islow;
     break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
   case JDCT_IFAST:
     fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
+    if (jsimd_can_fdct_ifast())
+      fdct->dct = jsimd_fdct_ifast;
+    else
+      fdct->dct = jpeg_fdct_ifast;
     break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
   case JDCT_FLOAT:
     fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
+    if (jsimd_can_fdct_float())
+      fdct->float_dct = jsimd_fdct_float;
+    else
+      fdct->float_dct = jpeg_fdct_float;
     break;
 #endif
   default:
@@ -377,6 +574,54 @@
     break;
   }
 
+  /* ...then the supporting stages. */
+  switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#endif
+#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
+    if (jsimd_can_convsamp())
+      fdct->convsamp = jsimd_convsamp;
+    else
+      fdct->convsamp = convsamp;
+    if (jsimd_can_quantize())
+      fdct->quantize = jsimd_quantize;
+    else
+      fdct->quantize = quantize;
+    break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+    if (jsimd_can_convsamp_float())
+      fdct->float_convsamp = jsimd_convsamp_float;
+    else
+      fdct->float_convsamp = convsamp_float;
+    if (jsimd_can_quantize_float())
+      fdct->float_quantize = jsimd_quantize_float;
+    else
+      fdct->float_quantize = quantize_float;
+    break;
+#endif
+  default:
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    break;
+  }
+
+  /* Allocate workspace memory */
+#ifdef DCT_FLOAT_SUPPORTED
+  if (cinfo->dct_method == JDCT_FLOAT)
+    fdct->float_workspace = (FAST_FLOAT *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(FAST_FLOAT) * DCTSIZE2);
+  else
+#endif
+    fdct->workspace = (DCTELEM *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(DCTELEM) * DCTSIZE2);
+
   /* Mark divisor tables unallocated */
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     fdct->divisors[i] = NULL;

diff --git a/jchuff.c b/jchuff.c
index f235250..daf0736 100644
--- a/jchuff.c
+++ b/jchuff.c

@@ -14,11 +14,33 @@
  * permanent JPEG objects only upon successful completion of an MCU.
  */
 
+/* Modifications:
+ * Copyright (C)2007 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jchuff.h"		/* Declarations shared with jcphuff.c */
+#include <limits.h>
 
+static unsigned char jpeg_first_bit_table[65536];
+int jpeg_first_bit_table_init=0;
+
+#ifndef min
+ #define min(a,b) ((a)<(b)?(a):(b))
+#endif
 
 /* Expanded entropy encoder object for Huffman encoding.
  *
@@ -27,7 +49,7 @@
  */
 
 typedef struct {
-  INT32 put_buffer;		/* current bit-accumulation buffer */
+  long put_buffer;		/* current bit-accumulation buffer */
   int put_bits;			/* # of bits now in it */
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 } savable_state;
@@ -159,6 +181,7 @@
   }
 
   /* Initialize bit buffer to empty */
+
   entropy->saved.put_buffer = 0;
   entropy->saved.put_bits = 0;
 
@@ -261,6 +284,15 @@
     dtbl->ehufco[i] = huffcode[p];
     dtbl->ehufsi[i] = huffsize[p];
   }
+
+  if(!jpeg_first_bit_table_init) {
+    for(i = 0; i < 65536; i++) {
+      int bit = 0, val = i;
+      while (val) {val >>= 1;  bit++;}
+      jpeg_first_bit_table[i] = bit;
+    }
+    jpeg_first_bit_table_init = 1;
+  }
 }
 
 
@@ -280,6 +312,8 @@
 {
   struct jpeg_destination_mgr * dest = state->cinfo->dest;
 
+  dest->free_in_buffer = state->free_in_buffer;
+
   if (! (*dest->empty_output_buffer) (state->cinfo))
     return FALSE;
   /* After a successful buffer dump, must reset buffer pointers */
@@ -297,147 +331,250 @@
  * between calls, so 24 bits are sufficient.
  */
 
-INLINE
-LOCAL(boolean)
-emit_bits (working_state * state, unsigned int code, int size)
-/* Emit some bits; return TRUE if successful, FALSE if must suspend */
-{
-  /* This routine is heavily used, so it's worth coding tightly. */
-  register INT32 put_buffer = (INT32) code;
-  register int put_bits = state->cur.put_bits;
+/***************************************************************/
 
-  /* if size is 0, caller used an invalid Huffman table entry */
-  if (size == 0)
-    ERREXIT(state->cinfo, JERR_HUFF_MISSING_CODE);
+#define EMIT_BYTE() {                                           \
+  if (0xFF == (*buffer++ =  (unsigned char)(put_buffer >> (put_bits -= 8))))  \
+    *buffer++ = 0;                                              \
+ }
 
-  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
-  
-  put_bits += size;		/* new number of bits in buffer */
-  
-  put_buffer <<= 24 - put_bits; /* align incoming bits */
+/***************************************************************/
 
-  put_buffer |= state->cur.put_buffer; /* and merge with old buffer contents */
-  
-  while (put_bits >= 8) {
-    int c = (int) ((put_buffer >> 16) & 0xFF);
-    
-    emit_byte(state, c, return FALSE);
-    if (c == 0xFF) {		/* need to stuff a zero byte? */
-      emit_byte(state, 0, return FALSE);
-    }
-    put_buffer <<= 8;
-    put_bits -= 8;
-  }
+#define DUMP_BITS_(code, size) {                                \
+  put_bits += size;                                             \
+  put_buffer = (put_buffer << size) | code;                     \
+  if (put_bits > 7)                                             \
+    while(put_bits > 7)                                         \
+      EMIT_BYTE()                                               \
+ }
 
-  state->cur.put_buffer = put_buffer; /* update state variables */
-  state->cur.put_bits = put_bits;
+/***************************************************************/
 
-  return TRUE;
+#define CHECKBUF15() {                                          \
+  if (put_bits > 15) {                                          \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+  }                                                             \
 }
 
+#define CHECKBUF47() {                                          \
+  if (put_bits > 47) {                                          \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+  }                                                             \
+}
+
+#define CHECKBUF31() {                                          \
+  if (put_bits > 31) {                                          \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+    EMIT_BYTE()                                                 \
+  }                                                             \
+}
+
+/***************************************************************/
+
+#define DUMP_BITS_NOCHECK(code, size) {                         \
+  put_bits += size;                                             \
+  put_buffer = (put_buffer << size) | code;                     \
+ }
+
+#if __WORDSIZE==64
+
+#define DUMP_BITS(code, size) {                                 \
+  CHECKBUF47()                                                  \
+  put_bits += size;                                             \
+  put_buffer = (put_buffer << size) | code;                     \
+ }
+
+#else
+
+#define DUMP_BITS(code, size) {                                 \
+  put_bits += size;                                             \
+  put_buffer = (put_buffer << size) | code;                     \
+  CHECKBUF15()                                                  \
+ }
+
+#endif
+
+/***************************************************************/
+
+#define DUMP_SINGLE_VALUE(ht, codevalue) { \
+  size = ht->ehufsi[codevalue];            \
+  code = ht->ehufco[codevalue];            \
+                                           \
+  DUMP_BITS(code, size)                    \
+ }
+
+/***************************************************************/
+
+#define DUMP_VALUE_SLOW(ht, codevalue, t, nbits) { \
+  size = ht->ehufsi[codevalue];               \
+  code = ht->ehufco[codevalue];               \
+  t &= ~(-1 << nbits);                        \
+  DUMP_BITS_NOCHECK(code, size)               \
+  CHECKBUF15()                                \
+  DUMP_BITS_NOCHECK(t, nbits)                 \
+  CHECKBUF15()                                \
+ }
+
+int _max=0;
+
+#if __WORDSIZE==64
+
+#define DUMP_VALUE(ht, codevalue, t, nbits) { \
+  size = ht->ehufsi[codevalue];               \
+  code = ht->ehufco[codevalue];               \
+  t &= ~(-1 << nbits);                        \
+  CHECKBUF31()                                \
+  DUMP_BITS_NOCHECK(code, size)               \
+  DUMP_BITS_NOCHECK(t, nbits)                 \
+ }
+
+#else
+
+#define DUMP_VALUE(ht, codevalue, t, nbits) { \
+  size = ht->ehufsi[codevalue];               \
+  code = ht->ehufco[codevalue];               \
+  t &= ~(-1 << nbits);                        \
+  DUMP_BITS_NOCHECK(code, size)               \
+  CHECKBUF15()                                \
+  DUMP_BITS_NOCHECK(t, nbits)                 \
+  CHECKBUF15()                                \
+ }
+
+#endif
+
+/***************************************************************/
+
+#define BUFSIZE (DCTSIZE2 * 2)
+
+#define LOAD_BUFFER() {                                           \
+  if (state->free_in_buffer < BUFSIZE) {                          \
+    localbuf = 1;                                                 \
+    buffer = _buffer;                                             \
+  }                                                               \
+  else buffer = state->next_output_byte;                          \
+ }
+
+#define STORE_BUFFER() {                                          \
+  if (localbuf) {                                                 \
+    bytes = buffer - _buffer;                                     \
+    buffer = _buffer;                                             \
+    while (bytes > 0) {                                           \
+      bytestocopy = min(bytes, state->free_in_buffer);            \
+      MEMCOPY(state->next_output_byte, buffer, bytestocopy);      \
+      state->next_output_byte += bytestocopy;                     \
+      buffer += bytestocopy;                                      \
+      state->free_in_buffer -= bytestocopy;                       \
+      if (state->free_in_buffer == 0)                             \
+        if (! dump_buffer(state)) return FALSE;                   \
+      bytes -= bytestocopy;                                       \
+    }                                                             \
+  }                                                               \
+  else {                                                          \
+    state->free_in_buffer -= (buffer - state->next_output_byte);  \
+    state->next_output_byte = buffer;                             \
+  }                                                               \
+ }
+
+/***************************************************************/
 
 LOCAL(boolean)
 flush_bits (working_state * state)
 {
-  if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
-    return FALSE;
+  unsigned char _buffer[BUFSIZE], *buffer;
+  long put_buffer;  int put_bits;
+  int bytes, bytestocopy, localbuf = 0;
+
+  put_buffer = state->cur.put_buffer;
+  put_bits = state->cur.put_bits;
+  LOAD_BUFFER()
+
+  DUMP_BITS_(0x7F, 7)
+
   state->cur.put_buffer = 0;	/* and reset bit-buffer to empty */
   state->cur.put_bits = 0;
+  STORE_BUFFER()
+
   return TRUE;
 }
 
-
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
 encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
 		  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
-  register int temp, temp2;
-  register int nbits;
-  register int k, r, i;
-  
+  int temp, temp2;
+  int nbits;
+  int r, sflag, size, code;
+  unsigned char _buffer[BUFSIZE], *buffer;
+  long put_buffer;  int put_bits;
+  int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
+  int bytes, bytestocopy, localbuf = 0;
+
+  put_buffer = state->cur.put_buffer;
+  put_bits = state->cur.put_bits;
+  LOAD_BUFFER()
+
   /* Encode the DC coefficient difference per section F.1.2.1 */
   
   temp = temp2 = block[0] - last_dc_val;
 
-  if (temp < 0) {
-    temp = -temp;		/* temp is abs value of input */
-    /* For a negative input, want temp2 = bitwise complement of abs(input) */
-    /* This code assumes we are on a two's complement machine */
-    temp2--;
-  }
-  
-  /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = 0;
-  while (temp) {
-    nbits++;
-    temp >>= 1;
-  }
-  /* Check for out-of-range coefficient values.
-   * Since we're encoding a difference, the range limit is twice as much.
-   */
-  if (nbits > MAX_COEF_BITS+1)
-    ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-  
-  /* Emit the Huffman-coded symbol for the number of bits */
-  if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
-    return FALSE;
-
-  /* Emit that number of bits of the value, if positive, */
-  /* or the complement of its magnitude, if negative. */
-  if (nbits)			/* emit_bits rejects calls with size 0 */
-    if (! emit_bits(state, (unsigned int) temp2, nbits))
-      return FALSE;
+  sflag = temp >> 31;
+  temp -= ((temp + temp) & sflag);
+  temp2 += sflag;
+  nbits = jpeg_first_bit_table[temp];
+  DUMP_VALUE_SLOW(dctbl, nbits, temp2, nbits)
 
   /* Encode the AC coefficients per section F.1.2.2 */
   
   r = 0;			/* r = run length of zeros */
-  
-  for (k = 1; k < DCTSIZE2; k++) {
-    if ((temp = block[jpeg_natural_order[k]]) == 0) {
-      r++;
-    } else {
-      /* if run length > 15, must emit special run-length-16 codes (0xF0) */
-      while (r > 15) {
-	if (! emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
-	  return FALSE;
-	r -= 16;
-      }
 
-      temp2 = temp;
-      if (temp < 0) {
-	temp = -temp;		/* temp is abs value of input */
-	/* This code assumes we are on a two's complement machine */
-	temp2--;
-      }
-      
-      /* Find the number of bits needed for the magnitude of the coefficient */
-      nbits = 1;		/* there must be at least one 1 bit */
-      while ((temp >>= 1))
-	nbits++;
-      /* Check for out-of-range coefficient values */
-      if (nbits > MAX_COEF_BITS)
-	ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-      
-      /* Emit Huffman symbol for run length / number of bits */
-      i = (r << 4) + nbits;
-      if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i]))
-	return FALSE;
+#define innerloop(order) {  \
+  temp2  = *(JCOEF*)((unsigned char*)block + order);  \
+  if(temp2 == 0) r++;  \
+  else {  \
+    temp = (JCOEF)temp2;  \
+    sflag = temp >> 31;  \
+    temp = (temp ^ sflag) - sflag;  \
+    temp2 += sflag;  \
+    nbits = jpeg_first_bit_table[temp];  \
+    for(; r > 15; r -= 16) DUMP_BITS(code_0xf0, size_0xf0)  \
+    sflag = (r << 4) + nbits;  \
+    DUMP_VALUE(actbl, sflag, temp2, nbits)  \
+    r = 0;  \
+  }}
 
-      /* Emit that number of bits of the value, if positive, */
-      /* or the complement of its magnitude, if negative. */
-      if (! emit_bits(state, (unsigned int) temp2, nbits))
-	return FALSE;
-      
-      r = 0;
-    }
-  }
+  innerloop(2*1);   innerloop(2*8);   innerloop(2*16);  innerloop(2*9);
+  innerloop(2*2);   innerloop(2*3);   innerloop(2*10);  innerloop(2*17);
+  innerloop(2*24);  innerloop(2*32);  innerloop(2*25);  innerloop(2*18);
+  innerloop(2*11);  innerloop(2*4);   innerloop(2*5);   innerloop(2*12);
+  innerloop(2*19);  innerloop(2*26);  innerloop(2*33);  innerloop(2*40);
+  innerloop(2*48);  innerloop(2*41);  innerloop(2*34);  innerloop(2*27);
+  innerloop(2*20);  innerloop(2*13);  innerloop(2*6);   innerloop(2*7);
+  innerloop(2*14);  innerloop(2*21);  innerloop(2*28);  innerloop(2*35);
+  innerloop(2*42);  innerloop(2*49);  innerloop(2*56);  innerloop(2*57);
+  innerloop(2*50);  innerloop(2*43);  innerloop(2*36);  innerloop(2*29);
+  innerloop(2*22);  innerloop(2*15);  innerloop(2*23);  innerloop(2*30);
+  innerloop(2*37);  innerloop(2*44);  innerloop(2*51);  innerloop(2*58);
+  innerloop(2*59);  innerloop(2*52);  innerloop(2*45);  innerloop(2*38);
+  innerloop(2*31);  innerloop(2*39);  innerloop(2*46);  innerloop(2*53);
+  innerloop(2*60);  innerloop(2*61);  innerloop(2*54);  innerloop(2*47);
+  innerloop(2*55);  innerloop(2*62);  innerloop(2*63);
 
   /* If the last coef(s) were zero, emit an end-of-block code */
-  if (r > 0)
-    if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0]))
-      return FALSE;
+  if (r > 0) DUMP_SINGLE_VALUE(actbl, 0x0)
+
+  state->cur.put_buffer = put_buffer;
+  state->cur.put_bits = put_bits;
+  STORE_BUFFER()
 
   return TRUE;
 }

diff --git a/jconfig.bcc b/jconfig.bcc
deleted file mode 100644
index c6c53ff..0000000
--- a/jconfig.bcc
+++ /dev/null

@@ -1,48 +0,0 @@
-/* jconfig.bcc --- jconfig.h for Borland C (Turbo C) on MS-DOS or OS/2. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#ifdef __MSDOS__
-#define NEED_FAR_POINTERS	/* for small or medium memory model */
-#endif
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN	/* this assumes you have -w-stu in CFLAGS */
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#ifdef __MSDOS__
-#define USE_MSDOS_MEMMGR	/* Define this if you use jmemdos.c */
-#define MAX_ALLOC_CHUNK 65520L	/* Maximum request to malloc() */
-#define USE_FMEM		/* Borland has _fmemcpy() and _fmemset() */
-#endif
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define USE_SETMODE		/* Borland has setmode() */
-#ifdef __MSDOS__
-#define NEED_SIGNAL_CATCHER	/* Define this if you use jmemdos.c */
-#endif
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.cfg b/jconfig.cfg
deleted file mode 100644
index 36a04fa..0000000
--- a/jconfig.cfg
+++ /dev/null

@@ -1,44 +0,0 @@
-/* jconfig.cfg --- source file edited by configure script */
-/* see jconfig.doc for explanations */
-
-#undef HAVE_PROTOTYPES
-#undef HAVE_UNSIGNED_CHAR
-#undef HAVE_UNSIGNED_SHORT
-#undef void
-#undef const
-#undef CHAR_IS_UNSIGNED
-#undef HAVE_STDDEF_H
-#undef HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-/* Define this if you get warnings about undefined structures. */
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-#undef INLINE
-/* These are for configuring the JPEG memory manager. */
-#undef DEFAULT_MAX_MEM
-#undef NO_MKTEMP
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-
-/* Define this if you want percent-done progress reports from cjpeg/djpeg. */
-#undef PROGRESS_REPORT
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.dj b/jconfig.dj
deleted file mode 100644
index f759a9d..0000000
--- a/jconfig.dj
+++ /dev/null

@@ -1,38 +0,0 @@
-/* jconfig.dj --- jconfig.h for DJGPP (Delorie's GNU C port) on MS-DOS. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS	/* DJGPP uses flat 32-bit addressing */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE	/* optional */
-#define USE_SETMODE		/* Needed to make one-file style work in DJGPP */
-#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.h.in b/jconfig.h.in
new file mode 100644
index 0000000..4e5e80e
--- /dev/null
+++ b/jconfig.h.in

@@ -0,0 +1,49 @@
+/* Define if your compiler supports prototypes */
+#undef HAVE_PROTOTYPES
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#undef HAVE_STDDEF_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#undef HAVE_UNSIGNED_CHAR
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#undef HAVE_UNSIGNED_SHORT
+
+/* Define if you want use complete types */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define if you have BSD-like bzero and bcopy */
+#undef NEED_BSD_STRINGS
+
+/* Define if you need short function names */
+#undef NEED_SHORT_EXTERNAL_NAMES
+
+/* Define if you have sys/types.h */
+#undef NEED_SYS_TYPES_H
+
+/* Define if shift is unsigned */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+/* Use accelerated SIMD routines. */
+#undef WITH_SIMD
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+# undef __CHAR_UNSIGNED__
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+#undef inline
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t

diff --git a/jconfig.mac b/jconfig.mac
deleted file mode 100644
index 0de3efe..0000000
--- a/jconfig.mac
+++ /dev/null

@@ -1,43 +0,0 @@
-/* jconfig.mac --- jconfig.h for CodeWarrior on Apple Macintosh */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define USE_MAC_MEMMGR		/* Define this if you use jmemmac.c */
-
-#define ALIGN_TYPE long		/* Needed for 680x0 Macs */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define USE_CCOMMAND		/* Command line reader for Macintosh */
-#define TWO_FILE_COMMANDLINE	/* Binary I/O thru stdin/stdout doesn't work */
-
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.manx b/jconfig.manx
deleted file mode 100644
index 6dd0d00..0000000
--- a/jconfig.manx
+++ /dev/null

@@ -1,43 +0,0 @@
-/* jconfig.manx --- jconfig.h for Amiga systems using Manx Aztec C ver 5.x. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define TEMP_DIRECTORY "JPEGTMP:"	/* recommended setting for Amiga */
-
-#define SHORTxSHORT_32		/* produces better DCT code with Aztec C */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#define signal_catcher _abort	/* hack for Aztec C naming requirements */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.mc6 b/jconfig.mc6
deleted file mode 100644
index c55082d..0000000
--- a/jconfig.mc6
+++ /dev/null

@@ -1,52 +0,0 @@
-/* jconfig.mc6 --- jconfig.h for Microsoft C on MS-DOS, version 6.00A & up. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#define NEED_FAR_POINTERS	/* for small or medium memory model */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define USE_MSDOS_MEMMGR	/* Define this if you use jmemdos.c */
-
-#define MAX_ALLOC_CHUNK 65520L	/* Maximum request to malloc() */
-
-#define USE_FMEM		/* Microsoft has _fmemcpy() and _fmemset() */
-
-#define NEED_FHEAPMIN		/* far heap management routines are broken */
-
-#define SHORTxLCONST_32		/* enable compiler-specific DCT optimization */
-/* Note: the above define is known to improve the code with Microsoft C 6.00A.
- * I do not know whether it is good for later compiler versions.
- * Please report any info on this point to jpeg-info@uunet.uu.net.
- */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define USE_SETMODE		/* Microsoft has setmode() */
-#define NEED_SIGNAL_CATCHER	/* Define this if you use jmemdos.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.sas b/jconfig.sas
deleted file mode 100644
index efdac22..0000000
--- a/jconfig.sas
+++ /dev/null

@@ -1,43 +0,0 @@
-/* jconfig.sas --- jconfig.h for Amiga systems using SAS C 6.0 and up. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define TEMP_DIRECTORY "JPEGTMP:"	/* recommended setting for Amiga */
-
-#define NO_MKTEMP		/* SAS C doesn't have mktemp() */
-
-#define SHORTxSHORT_32		/* produces better DCT code with SAS C */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.st b/jconfig.st
deleted file mode 100644
index 4421b7a..0000000
--- a/jconfig.st
+++ /dev/null

@@ -1,42 +0,0 @@
-/* jconfig.st --- jconfig.h for Atari ST/STE/TT using Pure C or Turbo C. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#define INCOMPLETE_TYPES_BROKEN	/* suppress undefined-structure warnings */
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define ALIGN_TYPE  long	/* apparently double is a weird size? */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE	/* optional -- undef if you like Unix style */
-/* Note: if you undef TWO_FILE_COMMANDLINE, you may need to define
- * USE_SETMODE.  Some Atari compilers require it, some do not.
- */
-#define NEED_SIGNAL_CATCHER	/* needed if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.vms b/jconfig.vms
deleted file mode 100644
index 55a6ffb..0000000
--- a/jconfig.vms
+++ /dev/null

@@ -1,37 +0,0 @@
-/* jconfig.vms --- jconfig.h for use on Digital VMS. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE	/* Needed on VMS */
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jconfig.wat b/jconfig.wat
deleted file mode 100644
index 6cc545b..0000000
--- a/jconfig.wat
+++ /dev/null

@@ -1,38 +0,0 @@
-/* jconfig.wat --- jconfig.h for Watcom C/C++ on MS-DOS or OS/2. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#define CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS	/* Watcom uses flat 32-bit addressing */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE	/* optional */
-#define USE_SETMODE		/* Needed to make one-file style work in Watcom */
-#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */

diff --git a/jcparam.c b/jcparam.c
index 6fc48f5..a1d49d9 100644
--- a/jcparam.c
+++ b/jcparam.c

@@ -2,6 +2,7 @@
  * jcparam.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -363,6 +364,12 @@
     jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
     break;
   case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
     jpeg_set_colorspace(cinfo, JCS_YCbCr);
     break;
   case JCS_YCbCr:

diff --git a/jcphuff.c b/jcphuff.c
index 07f9178..3102871 100644
--- a/jcphuff.c
+++ b/jcphuff.c

@@ -223,7 +223,6 @@
  * between calls, so 24 bits are sufficient.
  */
 
-INLINE
 LOCAL(void)
 emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
 /* Emit some bits, unless we are in gather mode */
@@ -276,7 +275,6 @@
  * Emit (or just count) a Huffman symbol.
  */
 
-INLINE
 LOCAL(void)
 emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
 {

diff --git a/jcsample.c b/jcsample.c
index 212ec87..eea376f 100644
--- a/jcsample.c
+++ b/jcsample.c

@@ -2,6 +2,7 @@
  * jcsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -48,6 +49,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 
 /* Pointer to routine to downsample a single component */
@@ -494,7 +496,10 @@
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
       smoothok = FALSE;
-      downsample->methods[ci] = h2v1_downsample;
+      if (jsimd_can_h2v1_downsample())
+        downsample->methods[ci] = jsimd_h2v1_downsample;
+      else
+        downsample->methods[ci] = h2v1_downsample;
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
@@ -503,7 +508,10 @@
 	downsample->pub.need_context_rows = TRUE;
       } else
 #endif
-	downsample->methods[ci] = h2v2_downsample;
+	if (jsimd_can_h2v2_downsample())
+	  downsample->methods[ci] = jsimd_h2v2_downsample;
+	else
+	  downsample->methods[ci] = h2v2_downsample;
     } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
 	       (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
       smoothok = FALSE;

diff --git a/jdcoefct.c b/jdcoefct.c
index 4938d20..f56af5f 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c

@@ -47,6 +47,9 @@
    */
   JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
 
+  /* Temporary workspace for one MCU */
+  JCOEF * workspace;
+
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* In multi-pass modes, we need a virtual block array for each component. */
   jvirt_barray_ptr whole_image[MAX_COMPONENTS];
@@ -471,13 +474,16 @@
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
   boolean first_row, last_row;
-  JBLOCK workspace;
+  JCOEF * workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
   INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
   int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
   int Al, pred;
 
+  /* Keep a local variable to avoid looking it up more than once */
+  workspace = coef->workspace;
+
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
 	 ! cinfo->inputctl->eoi_reached) {
@@ -733,4 +739,9 @@
     coef->pub.decompress_data = decompress_onepass;
     coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
   }
+
+  /* Allocate the workspace buffer */
+  coef->workspace = (JCOEF *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                SIZEOF(JCOEF) * DCTSIZE2);
 }

diff --git a/jdcolor.c b/jdcolor.c
index 6c04dfe..e02ea4f 100644
--- a/jdcolor.c
+++ b/jdcolor.c

@@ -2,6 +2,8 @@
  * jdcolor.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -11,6 +13,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 
 /* Private subobject */
@@ -146,12 +149,12 @@
       cb = GETJSAMPLE(inptr1[col]);
       cr = GETJSAMPLE(inptr2[col]);
       /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
-      outptr[RGB_GREEN] = range_limit[y +
+      outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + Crrtab[cr]];
+      outptr[rgb_green[cinfo->out_color_space]] = range_limit[y +
 			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
 						 SCALEBITS))];
-      outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
-      outptr += RGB_PIXELSIZE;
+      outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + Cbbtab[cb]];
+      outptr += rgb_pixelsize[cinfo->out_color_space];
     }
   }
 }
@@ -219,16 +222,21 @@
 		  JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW inptr, outptr;
+  JSAMPLE *maxinptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
+  int rindex = rgb_red[cinfo->out_color_space];
+  int gindex = rgb_green[cinfo->out_color_space];
+  int bindex = rgb_blue[cinfo->out_color_space];
+  int rgbstride = rgb_pixelsize[cinfo->out_color_space];
 
   while (--num_rows >= 0) {
     inptr = input_buf[0][input_row++];
+    maxinptr = &inptr[num_cols];
     outptr = *output_buf++;
-    for (col = 0; col < num_cols; col++) {
+    for (; inptr < maxinptr; inptr++, outptr += rgbstride) {
       /* We can dispense with GETJSAMPLE() here */
-      outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
-      outptr += RGB_PIXELSIZE;
+      outptr[rindex] = outptr[gindex] = outptr[bindex] = *inptr;
     }
   }
 }
@@ -356,13 +364,24 @@
     break;
 
   case JCS_RGB:
-    cinfo->out_color_components = RGB_PIXELSIZE;
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
     if (cinfo->jpeg_color_space == JCS_YCbCr) {
-      cconvert->pub.color_convert = ycc_rgb_convert;
-      build_ycc_rgb_table(cinfo);
+      if (jsimd_can_ycc_rgb())
+        cconvert->pub.color_convert = jsimd_ycc_rgb_convert;
+      else {
+        cconvert->pub.color_convert = ycc_rgb_convert;
+        build_ycc_rgb_table(cinfo);
+      }
     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
       cconvert->pub.color_convert = gray_rgb_convert;
-    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
+    } else if (cinfo->jpeg_color_space == cinfo->out_color_space &&
+      rgb_pixelsize[cinfo->out_color_space] == 3) {
       cconvert->pub.color_convert = null_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);

diff --git a/jdct.h b/jdct.h
index 04192a2..7b49a97 100644
--- a/jdct.h
+++ b/jdct.h

@@ -23,18 +23,26 @@
  * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
  * convention improves accuracy in integer implementations and saves some
  * work in floating-point ones.
- * Quantization of the output coefficients is done by jcdctmgr.c.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
  */
 
 #if BITS_IN_JSAMPLE == 8
+#ifndef WITH_SIMD
 typedef int DCTELEM;		/* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM;
+typedef unsigned long long UDCTELEM2;
+#else
+typedef short DCTELEM;  /* prefer 16 bit with SIMD for parellelism */
+typedef unsigned short UDCTELEM;
+typedef unsigned int UDCTELEM2;
+#endif
 #else
 typedef INT32 DCTELEM;		/* must have 32 bits */
+typedef UINT32 UDCTELEM;
+typedef unsigned long long UDCTELEM2;
 #endif
 
-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
-
 
 /*
  * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer

diff --git a/jddctmgr.c b/jddctmgr.c
index bbf8d0e..52f5090 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c

@@ -2,6 +2,7 @@
  * jddctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -19,6 +20,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jsimddct.h"
 
 
 /*
@@ -105,11 +107,17 @@
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 2:
-      method_ptr = jpeg_idct_2x2;
+      if (jsimd_can_idct_2x2())
+        method_ptr = jsimd_idct_2x2;
+      else
+        method_ptr = jpeg_idct_2x2;
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 4:
-      method_ptr = jpeg_idct_4x4;
+      if (jsimd_can_idct_4x4())
+        method_ptr = jsimd_idct_4x4;
+      else
+        method_ptr = jpeg_idct_4x4;
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
 #endif
@@ -117,19 +125,28 @@
       switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
-	method_ptr = jpeg_idct_islow;
+	if (jsimd_can_idct_islow())
+	  method_ptr = jsimd_idct_islow;
+	else
+	  method_ptr = jpeg_idct_islow;
 	method = JDCT_ISLOW;
 	break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
-	method_ptr = jpeg_idct_ifast;
+	if (jsimd_can_idct_ifast())
+	  method_ptr = jsimd_idct_ifast;
+	else
+	  method_ptr = jpeg_idct_ifast;
 	method = JDCT_IFAST;
 	break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
-	method_ptr = jpeg_idct_float;
+	if (jsimd_can_idct_float())
+	  method_ptr = jsimd_idct_float;
+	else
+	  method_ptr = jpeg_idct_float;
 	method = JDCT_FLOAT;
 	break;
 #endif

diff --git a/jdhuff.c b/jdhuff.c
index b5ba39f..18a0c7e 100644
--- a/jdhuff.c
+++ b/jdhuff.c

@@ -14,6 +14,21 @@
  * storage only upon successful completion of an MCU.
  */
 
+/* Modifications:
+ * Copyright (C)2007 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
@@ -234,7 +249,8 @@
    * with that code.
    */
 
-  MEMZERO(dtbl->look_nbits, SIZEOF(dtbl->look_nbits));
+   for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+     dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
 
   p = 0;
   for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
@@ -243,8 +259,7 @@
       /* Generate left-justified code followed by all possible bit sequences */
       lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
       for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
-	dtbl->look_nbits[lookbits] = l;
-	dtbl->look_sym[lookbits] = htbl->huffval[p];
+	dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
 	lookbits++;
       }
     }
@@ -438,9 +453,10 @@
  * On some machines, a shift and add will be faster than a table lookup.
  */
 
+#define AVOID_TABLES
 #ifdef AVOID_TABLES
 
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
+#define HUFF_EXTEND(x,s)  ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((-1)<<(s)) + 1)))
 
 #else
 
@@ -498,6 +514,236 @@
 }
 
 
+LOCAL(boolean)
+decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  BITREAD_STATE_VARS;
+  int blkn;
+  savable_state state;
+  /* Outer loop handles each block in the MCU */
+
+  /* Load up working state */
+  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+  ASSIGN_STATE(state, entropy->saved);
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    JBLOCKROW block = MCU_data[blkn];
+    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    register int s, k, r;
+
+    /* Decode a single block's worth of coefficients */
+
+    /* Section F.2.2.1: decode the DC coefficient difference */
+    HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
+    if (s) {
+      CHECK_BIT_BUFFER(br_state, s, return FALSE);
+      r = GET_BITS(s);
+      s = HUFF_EXTEND(r, s);
+    }
+
+    if (entropy->dc_needed[blkn]) {
+      /* Convert DC difference to actual value, update last_dc_val */
+      int ci = cinfo->MCU_membership[blkn];
+      s += state.last_dc_val[ci];
+      state.last_dc_val[ci] = s;
+      /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+      (*block)[0] = (JCOEF) s;
+    }
+
+    if (entropy->ac_needed[blkn]) {
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* Since zeroes are skipped, output area must be cleared beforehand */
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
+
+        r = s >> 4;
+        s &= 15;
+      
+        if (s) {
+          k += r;
+          CHECK_BIT_BUFFER(br_state, s, return FALSE);
+          r = GET_BITS(s);
+          s = HUFF_EXTEND(r, s);
+          /* Output coefficient in natural (dezigzagged) order.
+           * Note: the extra entries in jpeg_natural_order[] will save us
+           * if k >= DCTSIZE2, which could happen if the data is corrupted.
+           */
+          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+        } else {
+          if (r != 15)
+            break;
+          k += 15;
+        }
+      }
+
+    } else {
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* In this path we just discard the values */
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
+
+        r = s >> 4;
+        s &= 15;
+
+        if (s) {
+          k += r;
+          CHECK_BIT_BUFFER(br_state, s, return FALSE);
+          DROP_BITS(s);
+        } else {
+          if (r != 15)
+            break;
+          k += 15;
+        }
+      }
+    }
+  }
+
+  /* Completed MCU, so update state */
+  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  ASSIGN_STATE(entropy->saved, state);
+  return TRUE;
+}
+
+
+/***************************************************************/
+
+#define ADD_BYTE  {                                     \
+  int val0 = *(buffer++);                               \
+  int val1 = *(buffer);                                 \
+                                                        \
+  bits_left += 8;                                       \
+  get_buffer = (get_buffer << 8) | (val0);              \
+  if (val0 == 0xFF) {                                   \
+    buffer++;                                           \
+    if (val1 != 0) {                                    \
+      buffer   -= 2;                                    \
+      get_buffer      &= ~0xFF;                         \
+    }                                                   \
+  }                                                     \
+}
+
+/***************************************************************/
+
+#if __WORDSIZE == 64
+
+#define ENSURE_SHORT \
+  if (bits_left < 16) { \
+    ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE \
+  }
+
+#else
+
+#define ENSURE_SHORT  if (bits_left < 16) { ADD_BYTE ADD_BYTE }
+
+#endif
+
+/***************************************************************/
+
+#define HUFF_DECODE_FAST(symbol, size, htbl) { \
+  ENSURE_SHORT \
+  symbol = PEEK_BITS(HUFF_LOOKAHEAD); \
+  symbol = htbl->lookup[symbol]; \
+  size = symbol >> 8; \
+  bits_left -= size; \
+  symbol = symbol & ((1 << HUFF_LOOKAHEAD) - 1); \
+  if (size == HUFF_LOOKAHEAD + 1) { \
+    symbol = (get_buffer >> bits_left) & ((1 << (size)) - 1); \
+    while (symbol > htbl->maxcode[size]) { \
+      symbol <<= 1; \
+      symbol |= GET_BITS(1); \
+      size++; \
+    } \
+    symbol = htbl->pub->huffval[ (int) (symbol + htbl->valoffset[size]) ]; \
+  } \
+}
+
+/***************************************************************/
+
+LOCAL(boolean)
+decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  BITREAD_STATE_VARS;
+  JOCTET *buffer;
+  int blkn;
+  savable_state state;
+  /* Outer loop handles each block in the MCU */
+
+  /* Load up working state */
+  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+  buffer = (JOCTET *) br_state.next_input_byte;
+  ASSIGN_STATE(state, entropy->saved);
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    JBLOCKROW block = MCU_data[blkn];
+    d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
+    d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+    register int s, k, r, l;
+
+    HUFF_DECODE_FAST(s, l, dctbl);
+    if (s) {
+      ENSURE_SHORT
+      r = GET_BITS(s);
+      s = HUFF_EXTEND(r, s);
+    }
+
+    if (entropy->dc_needed[blkn]) {
+      int ci = cinfo->MCU_membership[blkn];
+      s += state.last_dc_val[ci];
+      state.last_dc_val[ci] = s;
+      (*block)[0] = (JCOEF) s;
+    }
+
+    if (entropy->ac_needed[blkn]) {
+
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE_FAST(s, l, actbl);
+        r = s >> 4;
+        s &= 15;
+      
+        if (s) {
+          k += r;
+          ENSURE_SHORT
+          r = GET_BITS(s);
+          s = HUFF_EXTEND(r, s);
+          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+        } else {
+          if (r != 15) break;
+          k += 15;
+        }
+      }
+
+    } else {
+
+      for (k = 1; k < DCTSIZE2; k++) {
+        HUFF_DECODE_FAST(s, l, actbl);
+        r = s >> 4;
+        s &= 15;
+
+        if (s) {
+          k += r;
+          ENSURE_SHORT
+          DROP_BITS(s);
+        } else {
+          if (r != 15) break;
+          k += 15;
+        }
+      }
+    }
+  }
+
+  br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
+  br_state.next_input_byte = buffer;
+  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  ASSIGN_STATE(entropy->saved, state);
+  return TRUE;
+}
+
+
 /*
  * Decode and return one MCU's worth of Huffman-compressed coefficients.
  * The coefficients are reordered from zigzag order into natural array order,
@@ -513,13 +759,12 @@
  * this module, since we'll just re-assign them on the next call.)
  */
 
+#define BUFSIZE (DCTSIZE2 * 2)
+
 METHODDEF(boolean)
 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int blkn;
-  BITREAD_STATE_VARS;
-  savable_state state;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -533,91 +778,13 @@
    */
   if (! entropy->pub.insufficient_data) {
 
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
-
-    /* Outer loop handles each block in the MCU */
-
-    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      JBLOCKROW block = MCU_data[blkn];
-      d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-      d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
-      register int s, k, r;
-
-      /* Decode a single block's worth of coefficients */
-
-      /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
-      }
-
-      if (entropy->dc_needed[blkn]) {
-	/* Convert DC difference to actual value, update last_dc_val */
-	int ci = cinfo->MCU_membership[blkn];
-	s += state.last_dc_val[ci];
-	state.last_dc_val[ci] = s;
-	/* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
-	(*block)[0] = (JCOEF) s;
-      }
-
-      if (entropy->ac_needed[blkn]) {
-
-	/* Section F.2.2.2: decode the AC coefficients */
-	/* Since zeroes are skipped, output area must be cleared beforehand */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
-	  if (s) {
-	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    r = GET_BITS(s);
-	    s = HUFF_EXTEND(r, s);
-	    /* Output coefficient in natural (dezigzagged) order.
-	     * Note: the extra entries in jpeg_natural_order[] will save us
-	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
-	     */
-	    (*block)[jpeg_natural_order[k]] = (JCOEF) s;
-	  } else {
-	    if (r != 15)
-	      break;
-	    k += 15;
-	  }
-	}
-
-      } else {
-
-	/* Section F.2.2.2: decode the AC coefficients */
-	/* In this path we just discard the values */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
-	  if (s) {
-	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    DROP_BITS(s);
-	  } else {
-	    if (r != 15)
-	      break;
-	    k += 15;
-	  }
-	}
-
-      }
+    if (cinfo->src->bytes_in_buffer >= BUFSIZE) {
+      if (!decode_mcu_fast(cinfo, MCU_data)) return FALSE;
+    }
+    else {
+      if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
     }
 
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
   }
 
   /* Account for restart interval (no-op if not using restarts) */

diff --git a/jdhuff.h b/jdhuff.h
index ae19b6c..a7c8188 100644
--- a/jdhuff.h
+++ b/jdhuff.h

@@ -36,13 +36,17 @@
   /* Link to public Huffman table (needed only in jpeg_huff_decode) */
   JHUFF_TBL *pub;
 
-  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+  /* Lookahead table: indexed by the next HUFF_LOOKAHEAD bits of
    * the input data stream.  If the next Huffman code is no more
    * than HUFF_LOOKAHEAD bits long, we can obtain its length and
-   * the corresponding symbol directly from these tables.
+   * the corresponding symbol directly from this tables.
+   *
+   * The lower 8 bits of each table entry contain the number of
+   * bits in the corresponding Huffman code, or HUFF_LOOKAHEAD + 1
+   * if too long.  The next 8 bits of each entry contain the
+   * symbol.
    */
-  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
-  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+  int lookup[1<<HUFF_LOOKAHEAD];
 } d_derived_tbl;
 
 /* Expand a Huffman table definition into the derived format */
@@ -69,8 +73,17 @@
  * necessary.
  */
 
+#if __WORDSIZE == 64
+
+typedef long bit_buf_type;	/* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64		/* size of buffer in bits */
+
+#else
+
 typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
-#define BIT_BUF_SIZE  32	/* size of buffer in bits */
+#define BIT_BUF_SIZE  32		/* size of buffer in bits */
+
+#endif
 
 /* If long is > 32 bits on your machine, and shifting/masking longs is
  * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
@@ -183,11 +196,10 @@
     } \
   } \
   look = PEEK_BITS(HUFF_LOOKAHEAD); \
-  if ((nb = htbl->look_nbits[look]) != 0) { \
+  if ((nb = (htbl->lookup[look] >> HUFF_LOOKAHEAD)) <= HUFF_LOOKAHEAD) { \
     DROP_BITS(nb); \
-    result = htbl->look_sym[look]; \
+    result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
   } else { \
-    nb = HUFF_LOOKAHEAD+1; \
 slowlabel: \
     if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
 	{ failaction; } \

diff --git a/jdmaster.c b/jdmaster.c
index 2802c5b..8314b67 100644
--- a/jdmaster.c
+++ b/jdmaster.c

@@ -2,6 +2,7 @@
  * jdmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -49,8 +50,14 @@
     return FALSE;
   /* jdmerge.c only supports YCC=>RGB color conversion */
   if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
-      cinfo->out_color_space != JCS_RGB ||
-      cinfo->out_color_components != RGB_PIXELSIZE)
+      (cinfo->out_color_space != JCS_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGBX &&
+      cinfo->out_color_space != JCS_EXT_BGR &&
+      cinfo->out_color_space != JCS_EXT_BGRX &&
+      cinfo->out_color_space != JCS_EXT_XBGR &&
+      cinfo->out_color_space != JCS_EXT_XRGB) ||
+      cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space])
     return FALSE;
   /* and it only handles 2h1v or 2h2v sampling ratios */
   if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -175,10 +182,14 @@
     cinfo->out_color_components = 1;
     break;
   case JCS_RGB:
-#if RGB_PIXELSIZE != 3
-    cinfo->out_color_components = RGB_PIXELSIZE;
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
     break;
-#endif /* else share code with YCbCr */
   case JCS_YCbCr:
     cinfo->out_color_components = 3;
     break;

diff --git a/jdmerge.c b/jdmerge.c
index 3744446..edf061a 100644
--- a/jdmerge.c
+++ b/jdmerge.c

@@ -2,6 +2,8 @@
  * jdmerge.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -35,6 +37,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
@@ -255,15 +258,15 @@
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
     y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
-    outptr += RGB_PIXELSIZE;
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr += rgb_pixelsize[cinfo->out_color_space];
     y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
-    outptr += RGB_PIXELSIZE;
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr += rgb_pixelsize[cinfo->out_color_space];
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
@@ -273,9 +276,9 @@
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr0);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
   }
 }
 
@@ -319,24 +322,24 @@
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
     y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
     outptr1 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
     outptr1 += RGB_PIXELSIZE;
   }
   /* If image width is odd, do the last output column separately */
@@ -347,13 +350,13 @@
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
     y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
   }
 }
 
@@ -382,14 +385,20 @@
 
   if (cinfo->max_v_samp_factor == 2) {
     upsample->pub.upsample = merged_2v_upsample;
-    upsample->upmethod = h2v2_merged_upsample;
+    if (jsimd_can_h2v2_merged_upsample())
+      upsample->upmethod = jsimd_h2v2_merged_upsample;
+    else
+      upsample->upmethod = h2v2_merged_upsample;
     /* Allocate a spare row buffer */
     upsample->spare_row = (JSAMPROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 		(size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
   } else {
     upsample->pub.upsample = merged_1v_upsample;
-    upsample->upmethod = h2v1_merged_upsample;
+    if (jsimd_can_h2v1_merged_upsample())
+      upsample->upmethod = jsimd_h2v1_merged_upsample;
+    else
+      upsample->upmethod = h2v1_merged_upsample;
     /* No spare row needed */
     upsample->spare_row = NULL;
   }

diff --git a/jdsample.c b/jdsample.c
index 80ffefb..4e0b8b4 100644
--- a/jdsample.c
+++ b/jdsample.c

@@ -2,6 +2,7 @@
  * jdsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -21,6 +22,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
 
 
 /* Pointer to routine to upsample a single component */
@@ -447,18 +449,32 @@
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group == v_out_group) {
       /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-	upsample->methods[ci] = h2v1_fancy_upsample;
-      else
-	upsample->methods[ci] = h2v1_upsample;
+      if (do_fancy && compptr->downsampled_width > 2) {
+	if (jsimd_can_h2v1_fancy_upsample())
+	  upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
+	else
+	  upsample->methods[ci] = h2v1_fancy_upsample;
+      } else {
+	if (jsimd_can_h2v1_upsample())
+	  upsample->methods[ci] = jsimd_h2v1_upsample;
+	else
+	  upsample->methods[ci] = h2v1_upsample;
+      }
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group * 2 == v_out_group) {
       /* Special cases for 2h2v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
-	upsample->methods[ci] = h2v2_fancy_upsample;
+	if (jsimd_can_h2v2_fancy_upsample())
+	  upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
+	else
+	  upsample->methods[ci] = h2v2_fancy_upsample;
 	upsample->pub.need_context_rows = TRUE;
-      } else
-	upsample->methods[ci] = h2v2_upsample;
+      } else {
+	if (jsimd_can_h2v2_upsample())
+	  upsample->methods[ci] = jsimd_h2v2_upsample;
+	else
+	  upsample->methods[ci] = h2v2_upsample;
+      }
     } else if ((h_out_group % h_in_group) == 0 &&
 	       (v_out_group % v_in_group) == 0) {
       /* Generic integral-factors upsampling method */

diff --git a/jmemansi.c b/jmemansi.c
deleted file mode 100644
index 2d93e49..0000000
--- a/jmemansi.c
+++ /dev/null

@@ -1,167 +0,0 @@
-/*
- * jmemansi.c
- *
- * Copyright (C) 1992-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a simple generic implementation of the system-
- * dependent portion of the JPEG memory manager.  This implementation
- * assumes that you have the ANSI-standard library routine tmpfile().
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET		/* pre-ANSI systems may not define this; */
-#define SEEK_SET  0		/* if not, assume 0 is correct */
-#endif
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		    void FAR * buffer_address,
-		    long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFREAD(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		     void FAR * buffer_address,
-		     long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFWRITE(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  fclose(info->temp_file);
-  /* Since this implementation uses tmpfile() to create the file,
-   * no explicit file deletion is needed.
-   */
-}
-
-
-/*
- * Initial opening of a backing-store object.
- *
- * This version uses tmpfile(), which constructs a suitable file name
- * behind the scenes.  We don't have to use info->temp_name[] at all;
- * indeed, we can't even find out the actual name of the temp file.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  if ((info->temp_file = tmpfile()) == NULL)
-    ERREXITS(cinfo, JERR_TFILE_CREATE, "");
-  info->read_backing_store = read_backing_store;
-  info->write_backing_store = write_backing_store;
-  info->close_backing_store = close_backing_store;
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* no work */
-}

diff --git a/jmemdos.c b/jmemdos.c
deleted file mode 100644
index 60b45c6..0000000
--- a/jmemdos.c
+++ /dev/null

@@ -1,638 +0,0 @@
-/*
- * jmemdos.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides an MS-DOS-compatible implementation of the system-
- * dependent portion of the JPEG memory manager.  Temporary data can be
- * stored in extended or expanded memory as well as in regular DOS files.
- *
- * If you use this file, you must be sure that NEED_FAR_POINTERS is defined
- * if you compile in a small-data memory model; it should NOT be defined if
- * you use a large-data memory model.  This file is not recommended if you
- * are using a flat-memory-space 386 environment such as DJGCC or Watcom C.
- * Also, this code will NOT work if struct fields are aligned on greater than
- * 2-byte boundaries.
- *
- * Based on code contributed by Ge' Weijers.
- */
-
-/*
- * If you have both extended and expanded memory, you may want to change the
- * order in which they are tried in jopen_backing_store.  On a 286 machine
- * expanded memory is usually faster, since extended memory access involves
- * an expensive protected-mode-and-back switch.  On 386 and better, extended
- * memory is usually faster.  As distributed, the code tries extended memory
- * first (what? not everyone has a 386? :-).
- *
- * You can disable use of extended/expanded memory entirely by altering these
- * definitions or overriding them from the Makefile (eg, -DEMS_SUPPORTED=0).
- */
-
-#ifndef XMS_SUPPORTED
-#define XMS_SUPPORTED  1
-#endif
-#ifndef EMS_SUPPORTED
-#define EMS_SUPPORTED  1
-#endif
-
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare these */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-extern char * getenv JPP((const char * name));
-#endif
-
-#ifdef NEED_FAR_POINTERS
-
-#ifdef __TURBOC__
-/* These definitions work for Borland C (Turbo C) */
-#include <alloc.h>		/* need farmalloc(), farfree() */
-#define far_malloc(x)	farmalloc(x)
-#define far_free(x)	farfree(x)
-#else
-/* These definitions work for Microsoft C and compatible compilers */
-#include <malloc.h>		/* need _fmalloc(), _ffree() */
-#define far_malloc(x)	_fmalloc(x)
-#define far_free(x)	_ffree(x)
-#endif
-
-#else /* not NEED_FAR_POINTERS */
-
-#define far_malloc(x)	malloc(x)
-#define far_free(x)	free(x)
-
-#endif /* NEED_FAR_POINTERS */
-
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#else
-#define READ_BINARY	"rb"
-#endif
-
-#ifndef USE_MSDOS_MEMMGR	/* make sure user got configuration right */
-  You forgot to define USE_MSDOS_MEMMGR in jconfig.h. /* deliberate syntax error */
-#endif
-
-#if MAX_ALLOC_CHUNK >= 65535L	/* make sure jconfig.h got this right */
-  MAX_ALLOC_CHUNK should be less than 64K. /* deliberate syntax error */
-#endif
-
-
-/*
- * Declarations for assembly-language support routines (see jmemdosa.asm).
- *
- * The functions are declared "far" as are all their pointer arguments;
- * this ensures the assembly source code will work regardless of the
- * compiler memory model.  We assume "short" is 16 bits, "long" is 32.
- */
-
-typedef void far * XMSDRIVER;	/* actually a pointer to code */
-typedef struct {		/* registers for calling XMS driver */
-	unsigned short ax, dx, bx;
-	void far * ds_si;
-      } XMScontext;
-typedef struct {		/* registers for calling EMS driver */
-	unsigned short ax, dx, bx;
-	void far * ds_si;
-      } EMScontext;
-
-extern short far jdos_open JPP((short far * handle, char far * filename));
-extern short far jdos_close JPP((short handle));
-extern short far jdos_seek JPP((short handle, long offset));
-extern short far jdos_read JPP((short handle, void far * buffer,
-				unsigned short count));
-extern short far jdos_write JPP((short handle, void far * buffer,
-				 unsigned short count));
-extern void far jxms_getdriver JPP((XMSDRIVER far *));
-extern void far jxms_calldriver JPP((XMSDRIVER, XMScontext far *));
-extern short far jems_available JPP((void));
-extern void far jems_calldriver JPP((EMScontext far *));
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is highly system-dependent, and you may want to customize it.
- */
-
-static int next_file_num;	/* to distinguish among several temp files */
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  const char * env;
-  char * ptr;
-  FILE * tfile;
-
-  /* Keep generating file names till we find one that's not in use */
-  for (;;) {
-    /* Get temp directory name from environment TMP or TEMP variable;
-     * if none, use "."
-     */
-    if ((env = (const char *) getenv("TMP")) == NULL)
-      if ((env = (const char *) getenv("TEMP")) == NULL)
-	env = ".";
-    if (*env == '\0')		/* null string means "." */
-      env = ".";
-    ptr = fname;		/* copy name to fname */
-    while (*env != '\0')
-      *ptr++ = *env++;
-    if (ptr[-1] != '\\' && ptr[-1] != '/')
-      *ptr++ = '\\';		/* append backslash if not in env variable */
-    /* Append a suitable file name */
-    next_file_num++;		/* advance counter */
-    sprintf(ptr, "JPG%03d.TMP", next_file_num);
-    /* Probe to see if file name is already in use */
-    if ((tfile = fopen(fname, READ_BINARY)) == NULL)
-      break;
-    fclose(tfile);		/* oops, it's there; close tfile & try again */
-  }
-}
-
-
-/*
- * Near-memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are allocated in far memory, if possible
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) far_malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  far_free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		300000L /* for total usage about 450K */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-/*
- * For MS-DOS we support three types of backing storage:
- *   1. Conventional DOS files.  We access these by direct DOS calls rather
- *      than via the stdio package.  This provides a bit better performance,
- *      but the real reason is that the buffers to be read or written are FAR.
- *      The stdio library for small-data memory models can't cope with that.
- *   2. Extended memory, accessed per the XMS V2.0 specification.
- *   3. Expanded memory, accessed per the LIM/EMS 4.0 specification.
- * You'll need copies of those specs to make sense of the related code.
- * The specs are available by Internet FTP from the SIMTEL archives 
- * (oak.oakland.edu and its various mirror sites).  See files
- * pub/msdos/microsoft/xms20.arc and pub/msdos/info/limems41.zip.
- */
-
-
-/*
- * Access methods for a DOS file.
- */
-
-
-METHODDEF(void)
-read_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  if (jdos_seek(info->handle.file_handle, file_offset))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
-  if (byte_count > 65535L)	/* safety check */
-    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
-  if (jdos_read(info->handle.file_handle, buffer_address,
-		(unsigned short) byte_count))
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		  void FAR * buffer_address,
-		  long file_offset, long byte_count)
-{
-  if (jdos_seek(info->handle.file_handle, file_offset))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
-  if (byte_count > 65535L)	/* safety check */
-    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
-  if (jdos_write(info->handle.file_handle, buffer_address,
-		 (unsigned short) byte_count))
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_file_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  jdos_close(info->handle.file_handle);	/* close the file */
-  remove(info->temp_name);	/* delete the file */
-/* If your system doesn't have remove(), try unlink() instead.
- * remove() is the ANSI-standard name for this function, but
- * unlink() was more common in pre-ANSI systems.
- */
-  TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-LOCAL(boolean)
-open_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		 long total_bytes_needed)
-{
-  short handle;
-
-  select_file_name(info->temp_name);
-  if (jdos_open((short far *) & handle, (char far *) info->temp_name)) {
-    /* might as well exit since jpeg_open_backing_store will fail anyway */
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-    return FALSE;
-  }
-  info->handle.file_handle = handle;
-  info->read_backing_store = read_file_store;
-  info->write_backing_store = write_file_store;
-  info->close_backing_store = close_file_store;
-  TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-  return TRUE;			/* succeeded */
-}
-
-
-/*
- * Access methods for extended memory.
- */
-
-#if XMS_SUPPORTED
-
-static XMSDRIVER xms_driver;	/* saved address of XMS driver */
-
-typedef union {			/* either long offset or real-mode pointer */
-	long offset;
-	void far * ptr;
-      } XMSPTR;
-
-typedef struct {		/* XMS move specification structure */
-	long length;
-	XMSH src_handle;
-	XMSPTR src;
-	XMSH dst_handle;
-	XMSPTR dst;
-      } XMSspec;
-
-#define ODD(X)	(((X) & 1L) != 0)
-
-
-METHODDEF(void)
-read_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		void FAR * buffer_address,
-		long file_offset, long byte_count)
-{
-  XMScontext ctx;
-  XMSspec spec;
-  char endbuffer[2];
-
-  /* The XMS driver can't cope with an odd length, so handle the last byte
-   * specially if byte_count is odd.  We don't expect this to be common.
-   */
-
-  spec.length = byte_count & (~ 1L);
-  spec.src_handle = info->handle.xms_handle;
-  spec.src.offset = file_offset;
-  spec.dst_handle = 0;
-  spec.dst.ptr = buffer_address;
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x0b00;		/* EMB move */
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    ERREXIT(cinfo, JERR_XMS_READ);
-
-  if (ODD(byte_count)) {
-    read_xms_store(cinfo, info, (void FAR *) endbuffer,
-		   file_offset + byte_count - 1L, 2L);
-    ((char FAR *) buffer_address)[byte_count - 1L] = endbuffer[0];
-  }
-}
-
-
-METHODDEF(void)
-write_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  XMScontext ctx;
-  XMSspec spec;
-  char endbuffer[2];
-
-  /* The XMS driver can't cope with an odd length, so handle the last byte
-   * specially if byte_count is odd.  We don't expect this to be common.
-   */
-
-  spec.length = byte_count & (~ 1L);
-  spec.src_handle = 0;
-  spec.src.ptr = buffer_address;
-  spec.dst_handle = info->handle.xms_handle;
-  spec.dst.offset = file_offset;
-
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x0b00;		/* EMB move */
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    ERREXIT(cinfo, JERR_XMS_WRITE);
-
-  if (ODD(byte_count)) {
-    read_xms_store(cinfo, info, (void FAR *) endbuffer,
-		   file_offset + byte_count - 1L, 2L);
-    endbuffer[0] = ((char FAR *) buffer_address)[byte_count - 1L];
-    write_xms_store(cinfo, info, (void FAR *) endbuffer,
-		    file_offset + byte_count - 1L, 2L);
-  }
-}
-
-
-METHODDEF(void)
-close_xms_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  XMScontext ctx;
-
-  ctx.dx = info->handle.xms_handle;
-  ctx.ax = 0x0a00;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  TRACEMS1(cinfo, 1, JTRC_XMS_CLOSE, info->handle.xms_handle);
-  /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		long total_bytes_needed)
-{
-  XMScontext ctx;
-
-  /* Get address of XMS driver */
-  jxms_getdriver((XMSDRIVER far *) & xms_driver);
-  if (xms_driver == NULL)
-    return FALSE;		/* no driver to be had */
-
-  /* Get version number, must be >= 2.00 */
-  ctx.ax = 0x0000;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax < (unsigned short) 0x0200)
-    return FALSE;
-
-  /* Try to get space (expressed in kilobytes) */
-  ctx.dx = (unsigned short) ((total_bytes_needed + 1023L) >> 10);
-  ctx.ax = 0x0900;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    return FALSE;
-
-  /* Succeeded, save the handle and away we go */
-  info->handle.xms_handle = ctx.dx;
-  info->read_backing_store = read_xms_store;
-  info->write_backing_store = write_xms_store;
-  info->close_backing_store = close_xms_store;
-  TRACEMS1(cinfo, 1, JTRC_XMS_OPEN, ctx.dx);
-  return TRUE;			/* succeeded */
-}
-
-#endif /* XMS_SUPPORTED */
-
-
-/*
- * Access methods for expanded memory.
- */
-
-#if EMS_SUPPORTED
-
-/* The EMS move specification structure requires word and long fields aligned
- * at odd byte boundaries.  Some compilers will align struct fields at even
- * byte boundaries.  While it's usually possible to force byte alignment,
- * that causes an overall performance penalty and may pose problems in merging
- * JPEG into a larger application.  Instead we accept some rather dirty code
- * here.  Note this code would fail if the hardware did not allow odd-byte
- * word & long accesses, but all 80x86 CPUs do.
- */
-
-typedef void far * EMSPTR;
-
-typedef union {			/* EMS move specification structure */
-	long length;		/* It's easy to access first 4 bytes */
-	char bytes[18];		/* Misaligned fields in here! */
-      } EMSspec;
-
-/* Macros for accessing misaligned fields */
-#define FIELD_AT(spec,offset,type)  (*((type *) &(spec.bytes[offset])))
-#define SRC_TYPE(spec)		FIELD_AT(spec,4,char)
-#define SRC_HANDLE(spec)	FIELD_AT(spec,5,EMSH)
-#define SRC_OFFSET(spec)	FIELD_AT(spec,7,unsigned short)
-#define SRC_PAGE(spec)		FIELD_AT(spec,9,unsigned short)
-#define SRC_PTR(spec)		FIELD_AT(spec,7,EMSPTR)
-#define DST_TYPE(spec)		FIELD_AT(spec,11,char)
-#define DST_HANDLE(spec)	FIELD_AT(spec,12,EMSH)
-#define DST_OFFSET(spec)	FIELD_AT(spec,14,unsigned short)
-#define DST_PAGE(spec)		FIELD_AT(spec,16,unsigned short)
-#define DST_PTR(spec)		FIELD_AT(spec,14,EMSPTR)
-
-#define EMSPAGESIZE	16384L	/* gospel, see the EMS specs */
-
-#define HIBYTE(W)  (((W) >> 8) & 0xFF)
-#define LOBYTE(W)  ((W) & 0xFF)
-
-
-METHODDEF(void)
-read_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		void FAR * buffer_address,
-		long file_offset, long byte_count)
-{
-  EMScontext ctx;
-  EMSspec spec;
-
-  spec.length = byte_count;
-  SRC_TYPE(spec) = 1;
-  SRC_HANDLE(spec) = info->handle.ems_handle;
-  SRC_PAGE(spec)   = (unsigned short) (file_offset / EMSPAGESIZE);
-  SRC_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
-  DST_TYPE(spec) = 0;
-  DST_HANDLE(spec) = 0;
-  DST_PTR(spec)    = buffer_address;
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x5700;		/* move memory region */
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    ERREXIT(cinfo, JERR_EMS_READ);
-}
-
-
-METHODDEF(void)
-write_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  EMScontext ctx;
-  EMSspec spec;
-
-  spec.length = byte_count;
-  SRC_TYPE(spec) = 0;
-  SRC_HANDLE(spec) = 0;
-  SRC_PTR(spec)    = buffer_address;
-  DST_TYPE(spec) = 1;
-  DST_HANDLE(spec) = info->handle.ems_handle;
-  DST_PAGE(spec)   = (unsigned short) (file_offset / EMSPAGESIZE);
-  DST_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x5700;		/* move memory region */
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    ERREXIT(cinfo, JERR_EMS_WRITE);
-}
-
-
-METHODDEF(void)
-close_ems_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  EMScontext ctx;
-
-  ctx.ax = 0x4500;
-  ctx.dx = info->handle.ems_handle;
-  jems_calldriver((EMScontext far *) & ctx);
-  TRACEMS1(cinfo, 1, JTRC_EMS_CLOSE, info->handle.ems_handle);
-  /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		long total_bytes_needed)
-{
-  EMScontext ctx;
-
-  /* Is EMS driver there? */
-  if (! jems_available())
-    return FALSE;
-
-  /* Get status, make sure EMS is OK */
-  ctx.ax = 0x4000;
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    return FALSE;
-
-  /* Get version, must be >= 4.0 */
-  ctx.ax = 0x4600;
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0 || LOBYTE(ctx.ax) < 0x40)
-    return FALSE;
-
-  /* Try to allocate requested space */
-  ctx.ax = 0x4300;
-  ctx.bx = (unsigned short) ((total_bytes_needed + EMSPAGESIZE-1L) / EMSPAGESIZE);
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    return FALSE;
-
-  /* Succeeded, save the handle and away we go */
-  info->handle.ems_handle = ctx.dx;
-  info->read_backing_store = read_ems_store;
-  info->write_backing_store = write_ems_store;
-  info->close_backing_store = close_ems_store;
-  TRACEMS1(cinfo, 1, JTRC_EMS_OPEN, ctx.dx);
-  return TRUE;			/* succeeded */
-}
-
-#endif /* EMS_SUPPORTED */
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  /* Try extended memory, then expanded memory, then regular file. */
-#if XMS_SUPPORTED
-  if (open_xms_store(cinfo, info, total_bytes_needed))
-    return;
-#endif
-#if EMS_SUPPORTED
-  if (open_ems_store(cinfo, info, total_bytes_needed))
-    return;
-#endif
-  if (open_file_store(cinfo, info, total_bytes_needed))
-    return;
-  ERREXITS(cinfo, JERR_TFILE_CREATE, "");
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  next_file_num = 0;		/* initialize temp file name generator */
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* Microsoft C, at least in v6.00A, will not successfully reclaim freed
-   * blocks of size > 32Kbytes unless we give it a kick in the rear, like so:
-   */
-#ifdef NEED_FHEAPMIN
-  _fheapmin();
-#endif
-}

diff --git a/jmemdosa.asm b/jmemdosa.asm
deleted file mode 100644
index ecd4372..0000000
--- a/jmemdosa.asm
+++ /dev/null

@@ -1,379 +0,0 @@
-;
-; jmemdosa.asm
-;
-; Copyright (C) 1992, Thomas G. Lane.
-; This file is part of the Independent JPEG Group's software.
-; For conditions of distribution and use, see the accompanying README file.
-;
-; This file contains low-level interface routines to support the MS-DOS
-; backing store manager (jmemdos.c).  Routines are provided to access disk
-; files through direct DOS calls, and to access XMS and EMS drivers.
-;
-; This file should assemble with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).  If you haven't got
-; a compatible assembler, better fall back to jmemansi.c or jmemname.c.
-;
-; To minimize dependence on the C compiler's register usage conventions,
-; we save and restore all 8086 registers, even though most compilers only
-; require SI,DI,DS to be preserved.  Also, we use only 16-bit-wide return
-; values, which everybody returns in AX.
-;
-; Based on code contributed by Ge' Weijers.
-;
-
-JMEMDOSA_TXT	segment byte public 'CODE'
-
-		assume	cs:JMEMDOSA_TXT
-
-		public	_jdos_open
-		public	_jdos_close
-		public	_jdos_seek
-		public	_jdos_read
-		public	_jdos_write
-		public	_jxms_getdriver
-		public	_jxms_calldriver
-		public	_jems_available
-		public	_jems_calldriver
-
-;
-; short far jdos_open (short far * handle, char far * filename)
-;
-; Create and open a temporary file
-;
-_jdos_open	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	cx,0			; normal file attributes
-		lds	dx,dword ptr [bp+10]	; get filename pointer
-		mov	ah,3ch			; create file
-		int	21h
-		jc	open_err		; if failed, return error code
-		lds	bx,dword ptr [bp+6]	; get handle pointer
-		mov	word ptr [bx],ax	; save the handle
-		xor	ax,ax			; return zero for OK
-open_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_open	endp
-
-
-;
-; short far jdos_close (short handle)
-;
-; Close the file handle
-;
-_jdos_close	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		mov	ah,3eh			; close file
-		int	21h
-		jc	close_err		; if failed, return error code
-		xor	ax,ax			; return zero for OK
-close_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_close	endp
-
-
-;
-; short far jdos_seek (short handle, long offset)
-;
-; Set file position
-;
-_jdos_seek	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		mov	dx,word ptr [bp+8]	; LS offset
-		mov	cx,word ptr [bp+10]	; MS offset
-		mov	ax,4200h		; absolute seek
-		int	21h
-		jc	seek_err		; if failed, return error code
-		xor	ax,ax			; return zero for OK
-seek_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_seek	endp
-
-
-;
-; short far jdos_read (short handle, void far * buffer, unsigned short count)
-;
-; Read from file
-;
-_jdos_read	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		lds	dx,dword ptr [bp+8]	; buffer address
-		mov	cx,word ptr [bp+12]	; number of bytes
-		mov	ah,3fh			; read file
-		int	21h
-		jc	read_err		; if failed, return error code
-		cmp	ax,word ptr [bp+12]	; make sure all bytes were read
-		je	read_ok
-		mov	ax,1			; else return 1 for not OK
-		jmp	short read_err
-read_ok:	xor	ax,ax			; return zero for OK
-read_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_read	endp
-
-
-;
-; short far jdos_write (short handle, void far * buffer, unsigned short count)
-;
-; Write to file
-;
-_jdos_write	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		lds	dx,dword ptr [bp+8]	; buffer address
-		mov	cx,word ptr [bp+12]	; number of bytes
-		mov	ah,40h			; write file
-		int	21h
-		jc	write_err		; if failed, return error code
-		cmp	ax,word ptr [bp+12]	; make sure all bytes written
-		je	write_ok
-		mov	ax,1			; else return 1 for not OK
-		jmp	short write_err
-write_ok:	xor	ax,ax			; return zero for OK
-write_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_write	endp
-
-
-;
-; void far jxms_getdriver (XMSDRIVER far *)
-;
-; Get the address of the XMS driver, or NULL if not available
-;
-_jxms_getdriver	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov 	ax,4300h		; call multiplex interrupt with
-		int	2fh			; a magic cookie, hex 4300
-		cmp 	al,80h			; AL should contain hex 80
-		je	xmsavail
-		xor 	dx,dx			; no XMS driver available
-		xor 	ax,ax			; return a nil pointer
-		jmp	short xmsavail_done
-xmsavail:	mov 	ax,4310h		; fetch driver address with
-		int	2fh			; another magic cookie
-		mov 	dx,es			; copy address to dx:ax
-		mov 	ax,bx
-xmsavail_done:	les 	bx,dword ptr [bp+6]	; get pointer to return value
-		mov	word ptr es:[bx],ax
-		mov	word ptr es:[bx+2],dx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop	bp
-		ret
-_jxms_getdriver	endp
-
-
-;
-; void far jxms_calldriver (XMSDRIVER, XMScontext far *)
-;
-; The XMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the XMS call is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jxms_calldriver 	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		les 	bx,dword ptr [bp+10]	; get XMScontext pointer
-		mov 	ax,word ptr es:[bx]	; load registers
-		mov 	dx,word ptr es:[bx+2]
-		mov 	si,word ptr es:[bx+6]
-		mov 	ds,word ptr es:[bx+8]
-		mov 	bx,word ptr es:[bx+4]
-		call	dword ptr [bp+6]	; call the driver
-		mov	cx,bx			; save returned BX for a sec
-		les 	bx,dword ptr [bp+10]	; get XMScontext pointer
-		mov 	word ptr es:[bx],ax	; put back ax,dx,bx
-		mov 	word ptr es:[bx+2],dx
-		mov 	word ptr es:[bx+4],cx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jxms_calldriver 	endp
-
-
-;
-; short far jems_available (void)
-;
-; Have we got an EMS driver? (this comes straight from the EMS 4.0 specs)
-;
-_jems_available	proc	far
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	ax,3567h		; get interrupt vector 67h
-		int	21h
-		push	cs
-		pop	ds
-		mov	di,000ah		; check offs 10 in returned seg
-		lea	si,ASCII_device_name	; against literal string
-		mov	cx,8
-		cld
-		repe cmpsb
-		jne	no_ems
-		mov	ax,1			; match, it's there
-		jmp	short avail_done
-no_ems:		xor	ax,ax			; it's not there
-avail_done:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		ret
-
-ASCII_device_name	db	"EMMXXXX0"
-
-_jems_available	endp
-
-
-;
-; void far jems_calldriver (EMScontext far *)
-;
-; The EMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the EMS trap is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jems_calldriver	proc far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		les 	bx,dword ptr [bp+6]	; get EMScontext pointer
-		mov 	ax,word ptr es:[bx]	; load registers
-		mov 	dx,word ptr es:[bx+2]
-		mov 	si,word ptr es:[bx+6]
-		mov 	ds,word ptr es:[bx+8]
-		mov 	bx,word ptr es:[bx+4]
-		int	67h			; call the EMS driver
-		mov	cx,bx			; save returned BX for a sec
-		les 	bx,dword ptr [bp+6]	; get EMScontext pointer
-		mov 	word ptr es:[bx],ax	; put back ax,dx,bx
-		mov 	word ptr es:[bx+2],dx
-		mov 	word ptr es:[bx+4],cx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jems_calldriver	endp
-
-JMEMDOSA_TXT	ends
-
-		end

diff --git a/jmemmac.c b/jmemmac.c
deleted file mode 100644
index 106f9be..0000000
--- a/jmemmac.c
+++ /dev/null

@@ -1,289 +0,0 @@
-/*
- * jmemmac.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * jmemmac.c provides an Apple Macintosh implementation of the system-
- * dependent portion of the JPEG memory manager.
- *
- * If you use jmemmac.c, then you must define USE_MAC_MEMMGR in the
- * JPEG_INTERNALS part of jconfig.h.
- *
- * jmemmac.c uses the Macintosh toolbox routines NewPtr and DisposePtr
- * instead of malloc and free.  It accurately determines the amount of
- * memory available by using CompactMem.  Notice that if left to its
- * own devices, this code can chew up all available space in the
- * application's zone, with the exception of the rather small "slop"
- * factor computed in jpeg_mem_available().  The application can ensure
- * that more space is left over by reducing max_memory_to_use.
- *
- * Large images are swapped to disk using temporary files and System 7.0+'s
- * temporary folder functionality.
- *
- * Note that jmemmac.c depends on two features of MacOS that were first
- * introduced in System 7: FindFolder and the FSSpec-based calls.
- * If your application uses jmemmac.c and is run under System 6 or earlier,
- * and the jpeg library decides it needs a temporary file, it will abort,
- * printing error messages about requiring System 7.  (If no temporary files
- * are created, it will run fine.)
- *
- * If you want to use jmemmac.c in an application that might be used with
- * System 6 or earlier, then you should remove dependencies on FindFolder
- * and the FSSpec calls.  You will need to replace FindFolder with some
- * other mechanism for finding a place to put temporary files, and you
- * should replace the FSSpec calls with their HFS equivalents:
- *
- *     FSpDelete     ->  HDelete
- *     FSpGetFInfo   ->  HGetFInfo
- *     FSpCreate     ->  HCreate
- *     FSpOpenDF     ->  HOpen      *** Note: not HOpenDF ***
- *     FSMakeFSSpec  ->  (fill in spec by hand.)
- *
- * (Use HOpen instead of HOpenDF.  HOpen is just a glue-interface to PBHOpen,
- * which is on all HFS macs.  HOpenDF is a System 7 addition which avoids the
- * ages-old problem of names starting with a period.)
- *
- * Contributed by Sam Bushell (jsam@iagu.on.net) and
- * Dan Gildor (gyld@in-touch.com).
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"    /* import the system-dependent declarations */
-
-#ifndef USE_MAC_MEMMGR	/* make sure user got configuration right */
-  You forgot to define USE_MAC_MEMMGR in jconfig.h. /* deliberate syntax error */
-#endif
-
-#include <Memory.h>     /* we use the MacOS memory manager */
-#include <Files.h>      /* we use the MacOS File stuff */
-#include <Folders.h>    /* we use the MacOS HFS stuff */
-#include <Script.h>     /* for smSystemScript */
-#include <Gestalt.h>    /* we use Gestalt to test for specific functionality */
-
-#ifndef TEMP_FILE_NAME		/* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME  "JPG%03d.TMP"
-#endif
-
-static int next_file_num;	/* to distinguish among several temp files */
-
-
-/*
- * Memory allocation and freeing are controlled by the MacOS library
- * routines NewPtr() and DisposePtr(), which allocate fixed-address
- * storage.  Unfortunately, the IJG library isn't smart enough to cope
- * with relocatable storage.
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) NewPtr(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  DisposePtr((Ptr) object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: we include FAR keywords in the routine declarations simply for
- * consistency with the rest of the IJG code; FAR should expand to empty
- * on rational architectures like the Mac.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) NewPtr(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  DisposePtr((Ptr) object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- */
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  long limit = cinfo->mem->max_memory_to_use - already_allocated;
-  long slop, mem;
-
-  /* Don't ask for more than what application has told us we may use */
-  if (max_bytes_needed > limit && limit > 0)
-    max_bytes_needed = limit;
-  /* Find whether there's a big enough free block in the heap.
-   * CompactMem tries to create a contiguous block of the requested size,
-   * and then returns the size of the largest free block (which could be
-   * much more or much less than we asked for).
-   * We add some slop to ensure we don't use up all available memory.
-   */
-  slop = max_bytes_needed / 16 + 32768L;
-  mem = CompactMem(max_bytes_needed + slop) - slop;
-  if (mem < 0)
-    mem = 0;			/* sigh, couldn't even get the slop */
-  /* Don't take more than the application says we can have */
-  if (mem > limit && limit > 0)
-    mem = limit;
-  return mem;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		    void FAR * buffer_address,
-		    long file_offset, long byte_count)
-{
-  long bytes = byte_count;
-  long retVal;
-
-  if ( SetFPos ( info->temp_file, fsFromStart, file_offset ) != noErr )
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-
-  retVal = FSRead ( info->temp_file, &bytes,
-		    (unsigned char *) buffer_address );
-  if ( retVal != noErr || bytes != byte_count )
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		     void FAR * buffer_address,
-		     long file_offset, long byte_count)
-{
-  long bytes = byte_count;
-  long retVal;
-
-  if ( SetFPos ( info->temp_file, fsFromStart, file_offset ) != noErr )
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-
-  retVal = FSWrite ( info->temp_file, &bytes,
-		     (unsigned char *) buffer_address );
-  if ( retVal != noErr || bytes != byte_count )
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  FSClose ( info->temp_file );
-  FSpDelete ( &(info->tempSpec) );
-}
-
-
-/*
- * Initial opening of a backing-store object.
- *
- * This version uses FindFolder to find the Temporary Items folder,
- * and puts the temporary file in there.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  short         tmpRef, vRefNum;
-  long          dirID;
-  FInfo         finderInfo;
-  FSSpec        theSpec;
-  Str255        fName;
-  OSErr         osErr;
-  long          gestaltResponse = 0;
-
-  /* Check that FSSpec calls are available. */
-  osErr = Gestalt( gestaltFSAttr, &gestaltResponse );
-  if ( ( osErr != noErr )
-       || !( gestaltResponse & (1<<gestaltHasFSSpecCalls) ) )
-    ERREXITS(cinfo, JERR_TFILE_CREATE, "- System 7.0 or later required");
-  /* TO DO: add a proper error message to jerror.h. */
-
-  /* Check that FindFolder is available. */
-  osErr = Gestalt( gestaltFindFolderAttr, &gestaltResponse );
-  if ( ( osErr != noErr )
-       || !( gestaltResponse & (1<<gestaltFindFolderPresent) ) )
-    ERREXITS(cinfo, JERR_TFILE_CREATE, "- System 7.0 or later required.");
-  /* TO DO: add a proper error message to jerror.h. */
-
-  osErr = FindFolder ( kOnSystemDisk, kTemporaryFolderType, kCreateFolder,
-                       &vRefNum, &dirID );
-  if ( osErr != noErr )
-    ERREXITS(cinfo, JERR_TFILE_CREATE, "- temporary items folder unavailable");
-  /* TO DO: Try putting the temp files somewhere else. */
-
-  /* Keep generating file names till we find one that's not in use */
-  for (;;) {
-    next_file_num++;		/* advance counter */
-
-    sprintf(info->temp_name, TEMP_FILE_NAME, next_file_num);
-    strcpy ( (Ptr)fName+1, info->temp_name );
-    *fName = strlen (info->temp_name);
-    osErr = FSMakeFSSpec ( vRefNum, dirID, fName, &theSpec );
-
-    if ( (osErr = FSpGetFInfo ( &theSpec, &finderInfo ) ) != noErr )
-      break;
-  }
-
-  osErr = FSpCreate ( &theSpec, '????', '????', smSystemScript );
-  if ( osErr != noErr )
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-
-  osErr = FSpOpenDF ( &theSpec, fsRdWrPerm, &(info->temp_file) );
-  if ( osErr != noErr )
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-
-  info->tempSpec = theSpec;
-
-  info->read_backing_store = read_backing_store;
-  info->write_backing_store = write_backing_store;
-  info->close_backing_store = close_backing_store;
-  TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  next_file_num = 0;
-
-  /* max_memory_to_use will be initialized to FreeMem()'s result;
-   * the calling application might later reduce it, for example
-   * to leave room to invoke multiple JPEG objects.
-   * Note that FreeMem returns the total number of free bytes;
-   * it may not be possible to allocate a single block of this size.
-   */
-  return FreeMem();
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* no work */
-}

diff --git a/jmemmgr.c b/jmemmgr.c
index d801b32..058a115 100644
--- a/jmemmgr.c
+++ b/jmemmgr.c

@@ -57,22 +57,25 @@
  * requirement, and we had better do so too.
  * There isn't any really portable way to determine the worst-case alignment
  * requirement.  This module assumes that the alignment requirement is
- * multiples of sizeof(ALIGN_TYPE).
- * By default, we define ALIGN_TYPE as double.  This is necessary on some
+ * multiples of ALIGN_SIZE.
+ * By default, we define ALIGN_SIZE as sizeof(double).  This is necessary on some
  * workstations (where doubles really do need 8-byte alignment) and will work
  * fine on nearly everything.  If your machine has lesser alignment needs,
- * you can save a few bytes by making ALIGN_TYPE smaller.
+ * you can save a few bytes by making ALIGN_SIZE smaller.
  * The only place I know of where this will NOT work is certain Macintosh
  * 680x0 compilers that define double as a 10-byte IEEE extended float.
  * Doing 10-byte alignment is counterproductive because longwords won't be
- * aligned well.  Put "#define ALIGN_TYPE long" in jconfig.h if you have
+ * aligned well.  Put "#define ALIGN_SIZE 4" in jconfig.h if you have
  * such a compiler.
  */
 
-#ifndef ALIGN_TYPE		/* so can override from jconfig.h */
-#define ALIGN_TYPE  double
+#ifndef ALIGN_SIZE		/* so can override from jconfig.h */
+#ifndef WITH_SIMD
+#define ALIGN_SIZE  SIZEOF(double)
+#else
+#define ALIGN_SIZE  16 /* Most SIMD implementations require this */
 #endif
-
+#endif
 
 /*
  * We allocate objects from "pools", where each pool is gotten with a single
@@ -81,34 +84,24 @@
  * header with a link to the next pool of the same class.
  * Small and large pool headers are identical except that the latter's
  * link pointer must be FAR on 80x86 machines.
- * Notice that the "real" header fields are union'ed with a dummy ALIGN_TYPE
- * field.  This forces the compiler to make SIZEOF(small_pool_hdr) a multiple
- * of the alignment requirement of ALIGN_TYPE.
  */
 
-typedef union small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct * small_pool_ptr;
 
-typedef union small_pool_struct {
-  struct {
-    small_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct small_pool_struct {
+  small_pool_ptr next;	/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
 } small_pool_hdr;
 
-typedef union large_pool_struct FAR * large_pool_ptr;
+typedef struct large_pool_struct FAR * large_pool_ptr;
 
-typedef union large_pool_struct {
-  struct {
-    large_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct large_pool_struct {
+  large_pool_ptr next;	/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
 } large_pool_hdr;
 
-
 /*
  * Here is the full definition of a memory manager object.
  */
@@ -197,16 +190,16 @@
 	  pool_id, mem->total_space_allocated);
 
   for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
-       lhdr_ptr = lhdr_ptr->hdr.next) {
+       lhdr_ptr = lhdr_ptr->next) {
     fprintf(stderr, "  Large chunk used %ld\n",
-	    (long) lhdr_ptr->hdr.bytes_used);
+	    (long) lhdr_ptr->bytes_used);
   }
 
   for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
-       shdr_ptr = shdr_ptr->hdr.next) {
+       shdr_ptr = shdr_ptr->next) {
     fprintf(stderr, "  Small chunk used %ld free %ld\n",
-	    (long) shdr_ptr->hdr.bytes_used,
-	    (long) shdr_ptr->hdr.bytes_left);
+	    (long) shdr_ptr->bytes_used,
+	    (long) shdr_ptr->bytes_left);
   }
 }
 
@@ -236,6 +229,10 @@
  * and we also distinguish the first pool of a class from later ones.
  * NOTE: the values given work fairly well on both 16- and 32-bit-int
  * machines, but may be too small if longs are 64 bits or more.
+ *
+ * Since we do not know what alignment malloc() gives us, we have to
+ * allocate ALIGN_SIZE-1 extra space per pool to have room for alignment
+ * adjustment.
  */
 
 static const size_t first_pool_slop[JPOOL_NUMPOOLS] = 
@@ -260,33 +257,36 @@
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   small_pool_ptr hdr_ptr, prev_hdr_ptr;
   char * data_ptr;
-  size_t odd_bytes, min_request, slop;
+  size_t min_request, slop;
+
+  /*
+   * Round up the requested size to a multiple of ALIGN_SIZE in order
+   * to assure alignment for the next object allocated in the same pool
+   * and so that algorithms can straddle outside the proper area up
+   * to the next alignment.
+   */
+  sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);
 
   /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(small_pool_hdr)))
+  if ((SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
     out_of_memory(cinfo, 1);	/* request exceeds malloc's ability */
 
-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
-  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
   /* See if space is available in any existing pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
   prev_hdr_ptr = NULL;
   hdr_ptr = mem->small_list[pool_id];
   while (hdr_ptr != NULL) {
-    if (hdr_ptr->hdr.bytes_left >= sizeofobject)
+    if (hdr_ptr->bytes_left >= sizeofobject)
       break;			/* found pool with enough space */
     prev_hdr_ptr = hdr_ptr;
-    hdr_ptr = hdr_ptr->hdr.next;
+    hdr_ptr = hdr_ptr->next;
   }
 
   /* Time to make a new pool? */
   if (hdr_ptr == NULL) {
     /* min_request is what we need now, slop is what will be leftover */
-    min_request = sizeofobject + SIZEOF(small_pool_hdr);
+    min_request = SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1;
     if (prev_hdr_ptr == NULL)	/* first pool in class? */
       slop = first_pool_slop[pool_id];
     else
@@ -305,20 +305,23 @@
     }
     mem->total_space_allocated += min_request + slop;
     /* Success, initialize the new pool header and add to end of list */
-    hdr_ptr->hdr.next = NULL;
-    hdr_ptr->hdr.bytes_used = 0;
-    hdr_ptr->hdr.bytes_left = sizeofobject + slop;
+    hdr_ptr->next = NULL;
+    hdr_ptr->bytes_used = 0;
+    hdr_ptr->bytes_left = sizeofobject + slop;
     if (prev_hdr_ptr == NULL)	/* first pool in class? */
       mem->small_list[pool_id] = hdr_ptr;
     else
-      prev_hdr_ptr->hdr.next = hdr_ptr;
+      prev_hdr_ptr->next = hdr_ptr;
   }
 
   /* OK, allocate the object from the current pool */
-  data_ptr = (char *) (hdr_ptr + 1); /* point to first data byte in pool */
-  data_ptr += hdr_ptr->hdr.bytes_used; /* point to place for object */
-  hdr_ptr->hdr.bytes_used += sizeofobject;
-  hdr_ptr->hdr.bytes_left -= sizeofobject;
+  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+  if ((unsigned long)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
+    data_ptr += ALIGN_SIZE - (unsigned long)data_ptr % ALIGN_SIZE;
+  data_ptr += hdr_ptr->bytes_used; /* point to place for object */
+  hdr_ptr->bytes_used += sizeofobject;
+  hdr_ptr->bytes_left -= sizeofobject;
 
   return (void *) data_ptr;
 }
@@ -344,37 +347,45 @@
 {
   my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
   large_pool_ptr hdr_ptr;
-  size_t odd_bytes;
+  char FAR * data_ptr;
+
+  /*
+   * Round up the requested size to a multiple of ALIGN_SIZE so that
+   * algorithms can straddle outside the proper area up to the next
+   * alignment.
+   */
+  sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);
 
   /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)))
+  if ((SIZEOF(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
     out_of_memory(cinfo, 3);	/* request exceeds malloc's ability */
 
-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
-  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
   /* Always make a new pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
 
   hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
-					    SIZEOF(large_pool_hdr));
+					    SIZEOF(large_pool_hdr) +
+					    ALIGN_SIZE - 1);
   if (hdr_ptr == NULL)
     out_of_memory(cinfo, 4);	/* jpeg_get_large failed */
-  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr);
+  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr) + ALIGN_SIZE - 1;
 
   /* Success, initialize the new pool header and add to list */
-  hdr_ptr->hdr.next = mem->large_list[pool_id];
+  hdr_ptr->next = mem->large_list[pool_id];
   /* We maintain space counts in each pool header for statistical purposes,
    * even though they are not needed for allocation.
    */
-  hdr_ptr->hdr.bytes_used = sizeofobject;
-  hdr_ptr->hdr.bytes_left = 0;
+  hdr_ptr->bytes_used = sizeofobject;
+  hdr_ptr->bytes_left = 0;
   mem->large_list[pool_id] = hdr_ptr;
 
-  return (void FAR *) (hdr_ptr + 1); /* point to first data byte in pool */
+  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+  if ((unsigned long)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
+    data_ptr += ALIGN_SIZE - (unsigned long)data_ptr % ALIGN_SIZE;
+
+  return (void FAR *) data_ptr;
 }
 
 
@@ -389,6 +400,10 @@
  * this chunking of rows.  The rowsperchunk value is left in the mem manager
  * object so that it can be saved away if this sarray is the workspace for
  * a virtual array.
+ *
+ * Since we are often upsampling with a factor 2, we align the size (not
+ * the start) to 2 * ALIGN_SIZE so that the upsampling routines don't have
+ * to be as careful about size.
  */
 
 METHODDEF(JSAMPARRAY)
@@ -402,6 +417,11 @@
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
 
+  /* Make sure each row is properly aligned */
+  if ((ALIGN_SIZE % SIZEOF(JSAMPLE)) != 0)
+    out_of_memory(cinfo, 5);	/* safety check */
+  samplesperrow = jround_up(samplesperrow, (2 * ALIGN_SIZE) / SIZEOF(JSAMPLE));
+
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) samplesperrow * SIZEOF(JSAMPLE));
@@ -450,6 +470,10 @@
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
 
+  /* Make sure each row is properly aligned */
+  if ((SIZEOF(JBLOCK) % ALIGN_SIZE) != 0)
+    out_of_memory(cinfo, 6);	/* safety check */
+
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) blocksperrow * SIZEOF(JBLOCK));
@@ -968,9 +992,9 @@
   mem->large_list[pool_id] = NULL;
 
   while (lhdr_ptr != NULL) {
-    large_pool_ptr next_lhdr_ptr = lhdr_ptr->hdr.next;
-    space_freed = lhdr_ptr->hdr.bytes_used +
-		  lhdr_ptr->hdr.bytes_left +
+    large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
+    space_freed = lhdr_ptr->bytes_used +
+		  lhdr_ptr->bytes_left +
 		  SIZEOF(large_pool_hdr);
     jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
@@ -982,9 +1006,9 @@
   mem->small_list[pool_id] = NULL;
 
   while (shdr_ptr != NULL) {
-    small_pool_ptr next_shdr_ptr = shdr_ptr->hdr.next;
-    space_freed = shdr_ptr->hdr.bytes_used +
-		  shdr_ptr->hdr.bytes_left +
+    small_pool_ptr next_shdr_ptr = shdr_ptr->next;
+    space_freed = shdr_ptr->bytes_used +
+		  shdr_ptr->bytes_left +
 		  SIZEOF(small_pool_hdr);
     jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
@@ -1041,16 +1065,16 @@
    * in common if and only if X is a power of 2, ie has only one one-bit.
    * Some compilers may give an "unreachable code" warning here; ignore it.
    */
-  if ((SIZEOF(ALIGN_TYPE) & (SIZEOF(ALIGN_TYPE)-1)) != 0)
+  if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
     ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
   /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
-   * a multiple of SIZEOF(ALIGN_TYPE).
+   * a multiple of ALIGN_SIZE.
    * Again, an "unreachable code" warning may be ignored here.
    * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
    */
   test_mac = (size_t) MAX_ALLOC_CHUNK;
   if ((long) test_mac != MAX_ALLOC_CHUNK ||
-      (MAX_ALLOC_CHUNK % SIZEOF(ALIGN_TYPE)) != 0)
+      (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
     ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
 
   max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */

diff --git a/jmemname.c b/jmemname.c
deleted file mode 100644
index ed96dee..0000000
--- a/jmemname.c
+++ /dev/null

@@ -1,276 +0,0 @@
-/*
- * jmemname.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a generic implementation of the system-dependent
- * portion of the JPEG memory manager.  This implementation assumes that
- * you must explicitly construct a name for each temp file.
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET		/* pre-ANSI systems may not define this; */
-#define SEEK_SET  0		/* if not, assume 0 is correct */
-#endif
-
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#define RW_BINARY	"w+"
-#else
-#ifdef VMS			/* VMS is very nonstandard */
-#define READ_BINARY	"rb", "ctx=stm"
-#define RW_BINARY	"w+b", "ctx=stm"
-#else				/* standard ANSI-compliant case */
-#define READ_BINARY	"rb"
-#define RW_BINARY	"w+b"
-#endif
-#endif
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is system-dependent!
- *
- * The code as given is suitable for most Unix systems, and it is easily
- * modified for most non-Unix systems.  Some notes:
- *  1.  The temp file is created in the directory named by TEMP_DIRECTORY.
- *      The default value is /usr/tmp, which is the conventional place for
- *      creating large temp files on Unix.  On other systems you'll probably
- *      want to change the file location.  You can do this by editing the
- *      #define, or (preferred) by defining TEMP_DIRECTORY in jconfig.h.
- *
- *  2.  If you need to change the file name as well as its location,
- *      you can override the TEMP_FILE_NAME macro.  (Note that this is
- *      actually a printf format string; it must contain %s and %d.)
- *      Few people should need to do this.
- *
- *  3.  mktemp() is used to ensure that multiple processes running
- *      simultaneously won't select the same file names.  If your system
- *      doesn't have mktemp(), define NO_MKTEMP to do it the hard way.
- *      (If you don't have <errno.h>, also define NO_ERRNO_H.)
- *
- *  4.  You probably want to define NEED_SIGNAL_CATCHER so that cjpeg.c/djpeg.c
- *      will cause the temp files to be removed if you stop the program early.
- */
-
-#ifndef TEMP_DIRECTORY		/* can override from jconfig.h or Makefile */
-#define TEMP_DIRECTORY  "/usr/tmp/" /* recommended setting for Unix */
-#endif
-
-static int next_file_num;	/* to distinguish among several temp files */
-
-#ifdef NO_MKTEMP
-
-#ifndef TEMP_FILE_NAME		/* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME  "%sJPG%03d.TMP"
-#endif
-
-#ifndef NO_ERRNO_H
-#include <errno.h>		/* to define ENOENT */
-#endif
-
-/* ANSI C specifies that errno is a macro, but on older systems it's more
- * likely to be a plain int variable.  And not all versions of errno.h
- * bother to declare it, so we have to in order to be most portable.  Thus:
- */
-#ifndef errno
-extern int errno;
-#endif
-
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  FILE * tfile;
-
-  /* Keep generating file names till we find one that's not in use */
-  for (;;) {
-    next_file_num++;		/* advance counter */
-    sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
-    if ((tfile = fopen(fname, READ_BINARY)) == NULL) {
-      /* fopen could have failed for a reason other than the file not
-       * being there; for example, file there but unreadable.
-       * If <errno.h> isn't available, then we cannot test the cause.
-       */
-#ifdef ENOENT
-      if (errno != ENOENT)
-	continue;
-#endif
-      break;
-    }
-    fclose(tfile);		/* oops, it's there; close tfile & try again */
-  }
-}
-
-#else /* ! NO_MKTEMP */
-
-/* Note that mktemp() requires the initial filename to end in six X's */
-#ifndef TEMP_FILE_NAME		/* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME  "%sJPG%dXXXXXX"
-#endif
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  next_file_num++;		/* advance counter */
-  sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
-  mktemp(fname);		/* make sure file name is unique */
-  /* mktemp replaces the trailing XXXXXX with a unique string of characters */
-}
-
-#endif /* NO_MKTEMP */
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		    void FAR * buffer_address,
-		    long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFREAD(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		     void FAR * buffer_address,
-		     long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFWRITE(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  fclose(info->temp_file);	/* close the file */
-  unlink(info->temp_name);	/* delete the file */
-/* If your system doesn't have unlink(), use remove() instead.
- * remove() is the ANSI-standard name for this function, but if
- * your system was ANSI you'd be using jmemansi.c, right?
- */
-  TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  select_file_name(info->temp_name);
-  if ((info->temp_file = fopen(info->temp_name, RW_BINARY)) == NULL)
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-  info->read_backing_store = read_backing_store;
-  info->write_backing_store = write_backing_store;
-  info->close_backing_store = close_backing_store;
-  TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  next_file_num = 0;		/* initialize temp file name generator */
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* no work */
-}

diff --git a/jmorecfg.h b/jmorecfg.h
index 54a7d1c..0e7fb72 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h

@@ -2,6 +2,7 @@
  * jmorecfg.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -62,11 +63,11 @@
 #else /* not HAVE_UNSIGNED_CHAR */
 
 typedef char JSAMPLE;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 #define GETJSAMPLE(value)  ((int) (value))
 #else
 #define GETJSAMPLE(value)  ((int) (value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
 
 #endif /* HAVE_UNSIGNED_CHAR */
 
@@ -113,11 +114,11 @@
 #else /* not HAVE_UNSIGNED_CHAR */
 
 typedef char JOCTET;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 #define GETJOCTET(value)  (value)
 #else
 #define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
 
 #endif /* HAVE_UNSIGNED_CHAR */
 
@@ -134,11 +135,11 @@
 #ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char UINT8;
 #else /* not HAVE_UNSIGNED_CHAR */
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 typedef char UINT8;
-#else /* not CHAR_IS_UNSIGNED */
+#else /* not __CHAR_UNSIGNED__ */
 typedef short UINT8;
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
 #endif /* HAVE_UNSIGNED_CHAR */
 
 /* UINT16 must hold at least the values 0..65535. */
@@ -316,31 +317,37 @@
 #define RGB_BLUE	2	/* Offset of Blue */
 #define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */
 
+#define JPEG_NUMCS 12
+
+static const int rgb_red[JPEG_NUMCS] = {
+	-1, -1, RGB_RED, -1, -1, -1, 0, 0, 2, 2, 3, 1
+};
+
+static const int rgb_green[JPEG_NUMCS] = {
+	-1, -1, RGB_GREEN, -1, -1, -1, 1, 1, 1, 1, 2, 2
+};
+
+static const int rgb_blue[JPEG_NUMCS] = {
+	-1, -1, RGB_BLUE, -1, -1, -1, 2, 2, 0, 0, 1, 3
+};
+
+static const int rgb_pixelsize[JPEG_NUMCS] = {
+	-1, -1, RGB_PIXELSIZE, -1, -1, -1, 3, 4, 3, 4, 4, 4
+};
 
 /* Definitions for speed-related optimizations. */
 
-
-/* If your compiler supports inline functions, define INLINE
- * as the inline keyword; otherwise define it as empty.
- */
-
-#ifndef INLINE
-#ifdef __GNUC__			/* for instance, GNU C knows about inline */
-#define INLINE __inline__
-#endif
-#ifndef INLINE
-#define INLINE			/* default is to define it as empty */
-#endif
-#endif
-
-
 /* On some machines (notably 68000 series) "int" is 32 bits, but multiplying
  * two 16-bit shorts is faster than multiplying two ints.  Define MULTIPLIER
  * as short on such a machine.  MULTIPLIER must be at least 16 bits wide.
  */
 
 #ifndef MULTIPLIER
+#ifndef WITH_SIMD
 #define MULTIPLIER  int		/* type for fastest integer multiply */
+#else
+#define MULTIPLIER short  /* prefer 16-bit with SIMD for parellelism */
+#endif
 #endif
 
 

diff --git a/jpeg.dsp b/jpeg.dsp
new file mode 100644
index 0000000..5f97af7
--- /dev/null
+++ b/jpeg.dsp

@@ -0,0 +1,332 @@
+# Microsoft Developer Studio Project File - Name="jpeg" - Package Owner=<4>

+# Microsoft Developer Studio Generated Build File, Format Version 6.00

+# ** DO NOT EDIT **

+

+# TARGTYPE "Win32 (x86) Static Library" 0x0104

+

+CFG=jpeg - Win32 Debug

+!MESSAGE This is not a valid makefile. To build this project using NMAKE,

+!MESSAGE use the Export Makefile command and run

+!MESSAGE 

+!MESSAGE NMAKE /f "jpeg.mak".

+!MESSAGE 

+!MESSAGE You can specify a configuration when running NMAKE

+!MESSAGE by defining the macro CFG on the command line. For example:

+!MESSAGE 

+!MESSAGE NMAKE /f "jpeg.mak" CFG="jpeg - Win32 Debug"

+!MESSAGE 

+!MESSAGE Possible choices for configuration are:

+!MESSAGE 

+!MESSAGE "jpeg - Win32 Release" (based on "Win32 (x86) Static Library")

+!MESSAGE "jpeg - Win32 Debug" (based on "Win32 (x86) Static Library")

+!MESSAGE 

+

+# Begin Project

+# PROP AllowPerConfigDependencies 0

+# PROP Scc_ProjName ""

+# PROP Scc_LocalPath ""

+CPP=cl.exe

+RSC=rc.exe

+

+!IF  "$(CFG)" == "jpeg - Win32 Release"

+

+# PROP BASE Use_MFC 0

+# PROP BASE Use_Debug_Libraries 0

+# PROP BASE Output_Dir "Release"

+# PROP BASE Intermediate_Dir "Release"

+# PROP BASE Target_Dir ""

+# PROP Use_MFC 0

+# PROP Use_Debug_Libraries 0

+# PROP Output_Dir "..\Release"

+# PROP Intermediate_Dir "..\Release\jpeg"

+# PROP Target_Dir ""

+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c

+# ADD CPP /nologo /MT /W3 /GX /O2 /D "NDEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /YX /FD /c /I "win"

+# ADD BASE RSC /l 0x419 /d "NDEBUG"

+# ADD RSC /l 0x809 /d "NDEBUG"

+BSC32=bscmake.exe

+# ADD BASE BSC32 /nologo

+# ADD BSC32 /nologo

+LIB32=link.exe -lib

+# ADD BASE LIB32 /nologo

+# ADD LIB32 /nologo

+

+!ELSEIF  "$(CFG)" == "jpeg - Win32 Debug"

+

+# PROP BASE Use_MFC 0

+# PROP BASE Use_Debug_Libraries 1

+# PROP BASE Output_Dir "..\Debug"

+# PROP BASE Intermediate_Dir "..\Debug\jpeg"

+# PROP BASE Target_Dir ""

+# PROP Use_MFC 0

+# PROP Use_Debug_Libraries 1

+# PROP Output_Dir "Debug"

+# PROP Intermediate_Dir "Debug"

+# PROP Target_Dir ""

+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ  /c

+# ADD CPP /nologo /MTd /W3 /Gm /GX /Zi /Od /D "_DEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /YX /FD /GZ /c /I "win"

+# ADD BASE RSC /l 0x419 /d "_DEBUG"

+# ADD RSC /l 0x809 /d "_DEBUG"

+BSC32=bscmake.exe

+# ADD BASE BSC32 /nologo

+# ADD BSC32 /nologo

+LIB32=link.exe -lib

+# ADD BASE LIB32 /nologo

+# ADD LIB32 /nologo

+

+!ENDIF 

+

+# Begin Target

+

+# Name "jpeg - Win32 Release"

+# Name "jpeg - Win32 Debug"

+# Begin Group "Source Files"

+

+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"

+# Begin Source File

+

+SOURCE=.\jcapimin.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcapistd.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jccoefct.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jccolor.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcdctmgr.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jchuff.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcinit.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcmainct.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcmarker.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcmaster.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcomapi.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcparam.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcphuff.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcprepct.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jcsample.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jctrans.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdapimin.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdapistd.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdatadst.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdatasrc.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdcoefct.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdcolor.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jddctmgr.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdhuff.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdinput.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdmainct.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdmarker.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdmaster.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdmerge.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdphuff.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdpostct.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdsample.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdtrans.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jerror.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jfdctflt.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jfdctfst.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jfdctint.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jidctflt.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jidctfst.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jidctint.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jidctred.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jmemmgr.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jmemnobs.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jquant1.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jquant2.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jsimd_none.c

+# End Source File

+# Begin Source File

+

+SOURCE=.\jutils.c

+# End Source File

+# End Group

+# Begin Group "Header Files"

+

+# PROP Default_Filter "h;hpp;hxx;hm;inl"

+# Begin Source File

+

+SOURCE=.\jchuff.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jconfig.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdct.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jdhuff.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jerror.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jinclude.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jmemsys.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jmorecfg.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jpegint.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jpeglib.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jsimd.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jsimddct.h

+# End Source File

+# Begin Source File

+

+SOURCE=.\jversion.h

+# End Source File

+# End Group

+# End Target

+# End Project


diff --git a/jpeglib.h b/jpeglib.h
index d1be8dd..31f0dd6 100644
--- a/jpeglib.h
+++ b/jpeglib.h

@@ -2,6 +2,7 @@
  * jpeglib.h
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -203,13 +204,22 @@
 
 /* Known color spaces. */
 
+#define JCS_EXTENSIONS 1
+
 typedef enum {
 	JCS_UNKNOWN,		/* error/unspecified */
 	JCS_GRAYSCALE,		/* monochrome */
-	JCS_RGB,		/* red/green/blue */
+	JCS_RGB,		/* red/green/blue as specified by the RGB_RED, RGB_GREEN,
+                 RGB_BLUE, and RGB_PIXELSIZE macros */
 	JCS_YCbCr,		/* Y/Cb/Cr (also known as YUV) */
 	JCS_CMYK,		/* C/M/Y/K */
-	JCS_YCCK		/* Y/Cb/Cr/K */
+	JCS_YCCK,		/* Y/Cb/Cr/K */
+	JCS_EXT_RGB,		/* red/green/blue */
+	JCS_EXT_RGBX,		/* red/green/blue/x */
+	JCS_EXT_BGR,		/* blue/green/red */
+	JCS_EXT_BGRX,		/* blue/green/red/x */
+	JCS_EXT_XBGR,		/* x/blue/green/red */
+	JCS_EXT_XRGB,		/* x/red/green/blue */
 } J_COLOR_SPACE;
 
 /* DCT/IDCT algorithm options. */

diff --git a/jpegut.c b/jpegut.c
new file mode 100644
index 0000000..3b02fba
--- /dev/null
+++ b/jpegut.c

@@ -0,0 +1,384 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./rrtimer.h"
+#include "./turbojpeg.h"
+
+#define _catch(f) {if((f)==-1) {printf("TJPEG: %s\n", tjGetErrorStr());  goto finally;}}
+
+const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
+const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
+
+int pixels[9][3]=
+{
+	{0, 255, 0},
+	{255, 0, 255},
+	{255, 255, 0},
+	{0, 0, 255},
+	{0, 255, 255},
+	{255, 0, 0},
+	{255, 255, 255},
+	{0, 0, 0},
+	{255, 0, 0}
+};
+
+void initbuf(unsigned char *buf, int w, int h, int ps, int flags)
+{
+	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
+		_i, j;
+	if(flags&TJ_ALPHAFIRST) {roffset++;  goffset++;  boffset++;}
+	memset(buf, 0, w*h*ps);
+	for(_i=0; _i<16; _i++)
+	{
+		if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
+		for(j=0; j<w; j++)
+		{
+			buf[(w*i+j)*ps+roffset]=255;
+			if(((_i/8)+(j/8))%2==0)
+			{
+				buf[(w*i+j)*ps+goffset]=255;
+				buf[(w*i+j)*ps+boffset]=255;
+			}
+		}
+	}
+	for(_i=16; _i<h; _i++)
+	{
+		if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
+		for(j=0; j<w; j++)
+		{
+			if(((_i/8)+(j/8))%2!=0)
+			{
+				buf[(w*i+j)*ps+roffset]=255;
+				buf[(w*i+j)*ps+goffset]=255;
+			}
+		}
+	}
+}
+
+int dumpbuf(unsigned char *buf, int w, int h, int ps, int flags)
+{
+	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
+		j;
+	for(i=0; i<h; i++)
+	{
+		for(j=0; j<w; j++)
+		{
+			printf("%.3d/%.3d/%.3d ", buf[(w*i+j)*ps+roffset],
+				buf[(w*i+j)*ps+roffset], buf[(w*i+j)*ps+roffset]);
+		}
+		printf("\n");
+	}
+}
+
+int checkbuf(unsigned char *buf, int w, int h, int ps, int subsamp, int flags)
+{
+	int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
+		_i, j;
+	if(flags&TJ_ALPHAFIRST) {roffset++;  goffset++;  boffset++;}
+	if(subsamp==TJ_GRAYSCALE)
+	{
+		for(_i=0; _i<16; _i++)
+		{
+			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
+			for(j=0; j<w; j++)
+			{
+				unsigned char r=buf[(w*i+j)*ps+roffset],
+					g=buf[(w*i+j)*ps+goffset],
+					b=buf[(w*i+j)*ps+boffset];
+				if(((_i/8)+(j/8))%2==0)
+				{
+					if(r<253 || g<253 || b<253) return 0;
+				}
+				else
+				{
+					if(r<74 || r>78 || g<74 || g>78 || b<74 || b>78) return 0;
+				}
+			}
+		}
+		for(_i=16; _i<h; _i++)
+		{
+			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
+			for(j=0; j<w; j++)
+			{
+				unsigned char r=buf[(w*i+j)*ps+roffset],
+					g=buf[(w*i+j)*ps+goffset],
+					b=buf[(w*i+j)*ps+boffset];
+				if(((_i/8)+(j/8))%2==0)
+				{
+					if(r>2 || g>2 || b>2) return 0;
+				}
+				else
+				{
+					if(r<224 || r>228 || g<224 || g>228 || b<224 || b>228) return 0;
+				}
+			}
+		}
+	}
+	else
+	{
+		for(_i=0; _i<16; _i++)
+		{
+			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
+			for(j=0; j<w; j++)
+			{
+				if(buf[(w*i+j)*ps+roffset]<253) return 0;
+				if(((_i/8)+(j/8))%2==0)
+				{
+					if(buf[(w*i+j)*ps+goffset]<253) return 0;
+					if(buf[(w*i+j)*ps+boffset]<253) return 0;
+				}
+				else
+				{
+					if(buf[(w*i+j)*ps+goffset]>2) return 0;
+					if(buf[(w*i+j)*ps+boffset]>2) return 0;
+				}
+			}
+		}
+		for(_i=16; _i<h; _i++)
+		{
+			if(flags&TJ_BOTTOMUP) i=h-_i-1;  else i=_i;
+			for(j=0; j<w; j++)
+			{
+				if(buf[(w*i+j)*ps+boffset]>2) return 0;
+				if(((_i/8)+(j/8))%2==0)
+				{
+					if(buf[(w*i+j)*ps+roffset]>2) return 0;
+					if(buf[(w*i+j)*ps+goffset]>2) return 0;
+				}
+				else
+				{
+					if(buf[(w*i+j)*ps+roffset]<253) return 0;
+					if(buf[(w*i+j)*ps+goffset]<253) return 0;
+				}
+			}
+		}
+	}
+	return 1;
+}
+
+void writejpeg(unsigned char *jpegbuf, unsigned long jpgbufsize, char *filename)
+{
+	FILE *outfile=NULL;
+	if((outfile=fopen(filename, "wb"))==NULL)
+	{
+		printf("ERROR: Could not open %s for writing.\n", filename);
+		goto finally;
+	}
+	if(fwrite(jpegbuf, jpgbufsize, 1, outfile)!=1)
+	{
+		printf("ERROR: Could not write to %s.\n", filename);
+		goto finally;
+	}
+
+	finally:
+	if(outfile) fclose(outfile);
+}
+
+void gentestjpeg(tjhandle hnd, unsigned char *jpegbuf, unsigned long *size,
+	int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
+{
+	char tempstr[1024];  unsigned char *bmpbuf=NULL;
+	const char *pixformat;  double t;
+
+	if(flags&TJ_BGR)
+	{
+		if(ps==3) pixformat="BGR";
+		else {if(flags&TJ_ALPHAFIRST) pixformat="ABGR";  else pixformat="BGRA";}
+	}
+	else
+	{
+		if(ps==3) pixformat="RGB";
+		else {if(flags&TJ_ALPHAFIRST) pixformat="ARGB";  else pixformat="RGBA";}
+	}
+	printf("%s %s -> %s Q%d ... ", pixformat,
+		(flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ", _subnamel[subsamp], qual);
+
+	if((bmpbuf=(unsigned char *)malloc(w*h*ps+1))==NULL)
+	{
+		printf("ERROR: Could not allocate buffer\n");  goto finally;
+	}
+	initbuf(bmpbuf, w, h, ps, flags);
+	memset(jpegbuf, 0, TJBUFSIZE(w, h));
+
+	t=rrtime();
+	_catch(tjCompress(hnd, bmpbuf, w, 0, h, ps, jpegbuf, size, subsamp, qual, flags));
+	t=rrtime()-t;
+
+	sprintf(tempstr, "%s_enc_%s_%s_%sQ%d.jpg", basefilename, pixformat,
+		(flags&TJ_BOTTOMUP)? "BU":"TD", _subnames[subsamp], qual);
+	writejpeg(jpegbuf, *size, tempstr);
+	printf("Done.  %f ms\n  Result in %s\n", t*1000., tempstr);
+
+	finally:
+	if(bmpbuf) free(bmpbuf);
+}
+
+void gentestbmp(tjhandle hnd, unsigned char *jpegbuf, unsigned long jpegsize,
+	int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
+{
+	unsigned char *bmpbuf=NULL;
+	const char *pixformat;  int _w=0, _h=0;  double t;
+
+	if(flags&TJ_BGR)
+	{
+		if(ps==3) pixformat="BGR";
+		else {if(flags&TJ_ALPHAFIRST) pixformat="ABGR";  else pixformat="BGRA";}
+	}
+	else
+	{
+		if(ps==3) pixformat="RGB";
+		else {if(flags&TJ_ALPHAFIRST) pixformat="ARGB";  else pixformat="RGBA";}
+	}
+	printf("JPEG -> %s %s ... ", pixformat, (flags&TJ_BOTTOMUP)?"Bottom-Up":"Top-Down ");
+
+	_catch(tjDecompressHeader(hnd, jpegbuf, jpegsize, &_w, &_h));
+	if(_w!=w || _h!=h)
+	{
+		printf("Incorrect JPEG header\n");  goto finally;
+	}
+
+	if((bmpbuf=(unsigned char *)malloc(w*h*ps+1))==NULL)
+	{
+		printf("ERROR: Could not allocate buffer\n");  goto finally;
+	}
+	memset(bmpbuf, 0, w*ps*h);
+
+	t=rrtime();
+	_catch(tjDecompress(hnd, jpegbuf, jpegsize, bmpbuf, w, w*ps, h, ps, flags));
+	t=rrtime()-t;
+
+	if(checkbuf(bmpbuf, w, h, ps, subsamp, flags)) printf("Passed.");
+	else {printf("FAILED!");  dumpbuf(bmpbuf, w, h, ps, flags);}
+
+	printf("  %f ms\n\n", t*1000.);
+
+	finally:
+	if(bmpbuf) free(bmpbuf);
+}
+
+void dotest(int w, int h, int ps, int subsamp, char *basefilename)
+{
+	tjhandle hnd=NULL, dhnd=NULL;  unsigned char *jpegbuf=NULL;
+	unsigned long size;
+
+	if((jpegbuf=(unsigned char *)malloc(TJBUFSIZE(w, h))) == NULL)
+	{
+		puts("ERROR: Could not allocate buffer.");  goto finally;
+	}
+
+	if((hnd=tjInitCompress())==NULL)
+		{printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());  goto finally;}
+	if((dhnd=tjInitDecompress())==NULL)
+		{printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr());  goto finally;}
+
+	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, 0);
+	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, 0);
+
+	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
+	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
+
+	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
+	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
+
+	gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
+	gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
+
+	if(ps==4)
+	{
+		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
+		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
+
+		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
+		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
+
+		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
+		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
+
+		gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
+		gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
+	}
+
+	finally:
+	if(hnd) tjDestroy(hnd);
+	if(dhnd) tjDestroy(dhnd);
+
+	if(jpegbuf) free(jpegbuf);
+}
+
+#define MAXLENGTH 2048
+
+void dotest1(void)
+{
+	int i, j, i2;  unsigned char *bmpbuf=NULL, *jpgbuf=NULL;
+	tjhandle hnd=NULL;  unsigned long size;
+	if((hnd=tjInitCompress())==NULL)
+		{printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());  goto finally;}
+	printf("Buffer size regression test\n");
+	for(j=1; j<48; j++)
+	{
+		for(i=1; i<(j==1?MAXLENGTH:48); i++)
+		{
+			if(i%100==0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", i, j);
+			if((bmpbuf=(unsigned char *)malloc(i*j*4))==NULL
+			|| (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(i, j)))==NULL)
+			{
+				printf("Memory allocation failure\n");  goto finally;
+			}
+			memset(bmpbuf, 0, i*j*4);
+			for(i2=0; i2<i*j; i2++)
+			{
+				bmpbuf[i2*4]=pixels[i2%9][2];
+				bmpbuf[i2*4+1]=pixels[i2%9][1];
+				bmpbuf[i2*2+2]=pixels[i2%9][0];
+			}
+			_catch(tjCompress(hnd, bmpbuf, i, i*4, j, 4,
+				jpgbuf, &size, TJ_444, 100, TJ_BGR));
+			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
+
+			if((bmpbuf=(unsigned char *)malloc(j*i*4))==NULL
+			|| (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(j, i)))==NULL)
+			{
+				printf("Memory allocation failure\n");  goto finally;
+			}
+			for(i2=0; i2<j*i*4; i2++)
+			{
+				if(i2%2==0) bmpbuf[i2]=0xFF;
+				else bmpbuf[i2]=0;
+			}
+			_catch(tjCompress(hnd, bmpbuf, j, j*4, i, 4,
+				jpgbuf, &size, TJ_444, 100, TJ_BGR));
+			free(bmpbuf);  bmpbuf=NULL;  free(jpgbuf);  jpgbuf=NULL;
+		}
+	}
+	printf("Done.      \n");
+
+	finally:
+	if(bmpbuf) free(bmpbuf);  if(jpgbuf) free(jpgbuf);
+	if(hnd) tjDestroy(hnd);
+}
+
+int main(int argc, char *argv[])
+{
+	dotest(35, 41, 3, TJ_444, "test");
+	dotest(35, 41, 4, TJ_444, "test");
+	dotest(35, 41, 3, TJ_GRAYSCALE, "test");
+	dotest(35, 41, 4, TJ_GRAYSCALE, "test");
+	dotest1();
+
+	return 0;
+}

diff --git a/jpgtest.cxx b/jpgtest.cxx
new file mode 100644
index 0000000..da0ef93
--- /dev/null
+++ b/jpgtest.cxx

@@ -0,0 +1,392 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005, 2006 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "./bmp.h"
+#include "./rrutil.h"
+#include "./rrtimer.h"
+#include "./turbojpeg.h"
+
+#define _catch(f) {if((f)==-1) {printf("Error in %s:\n%s\n", #f, tjGetErrorStr());  goto bailout;}}
+
+int forcemmx=0, forcesse=0, forcesse2=0, forcesse3=0, fastupsample=0;
+const int _ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4};
+const int _flags[BMPPIXELFORMATS]={0, 0, TJ_BGR, TJ_BGR,
+	TJ_BGR|TJ_ALPHAFIRST, TJ_ALPHAFIRST};
+const int _rindex[BMPPIXELFORMATS]={0, 0, 2, 2, 3, 1};
+const int _gindex[BMPPIXELFORMATS]={1, 1, 1, 1, 2, 2};
+const int _bindex[BMPPIXELFORMATS]={2, 2, 0, 0, 1, 3};
+const char *_pfname[]={"RGB", "RGBA", "BGR", "BGRA", "ABGR", "ARGB"};
+const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
+const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
+
+void printsigfig(double val, int figs)
+{
+	char format[80];
+	double _l=log10(val);  int l;
+	if(_l<0.)
+	{
+		l=(int)fabs(_l);
+		sprintf(format, "%%%d.%df", figs+l+2, figs+l);
+	}
+	else
+	{
+		l=(int)_l+1;
+		if(figs<=l) sprintf(format, "%%.0f");
+		else sprintf(format, "%%%d.%df", figs+1, figs-l);
+	}	
+	printf(format, val);
+}
+
+void dotest(unsigned char *srcbuf, int w, int h, BMPPIXELFORMAT pf, int bu,
+	int jpegsub, int qual, char *filename, int dotile, int useppm, int quiet)
+{
+	char tempstr[1024];
+	FILE *outfile;  tjhandle hnd;
+	unsigned char **jpegbuf=NULL, *rgbbuf=NULL;
+	rrtimer timer; double elapsed;
+	int jpgbufsize=0, i, j, tilesizex, tilesizey, numtilesx, numtilesy, ITER;
+	unsigned long *comptilesize=NULL;
+	int flags=(forcemmx?TJ_FORCEMMX:0)|(forcesse?TJ_FORCESSE:0)
+		|(forcesse2?TJ_FORCESSE2:0)|(forcesse3?TJ_FORCESSE3:0)
+		|(fastupsample?TJ_FASTUPSAMPLE:0);
+	int ps=_ps[pf];
+	int pitch=w*ps;
+
+	flags |= _flags[pf];
+	if(bu) flags |= TJ_BOTTOMUP;
+
+	if((rgbbuf=(unsigned char *)malloc(pitch*h)) == NULL)
+	{
+		puts("ERROR: Could not allocate image buffer.");
+		exit(1);
+	}
+
+	if(!quiet) printf("\n>>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", _pfname[pf],
+		bu?"Bottom-up":"Top-down", _subnamel[jpegsub], qual);
+	if(dotile) {tilesizex=tilesizey=4;}  else {tilesizex=w;  tilesizey=h;}
+
+	do
+	{
+		tilesizex*=2;  if(tilesizex>w) tilesizex=w;
+		tilesizey*=2;  if(tilesizey>h) tilesizey=h;
+		numtilesx=(w+tilesizex-1)/tilesizex;
+		numtilesy=(h+tilesizey-1)/tilesizey;
+		if((comptilesize=(unsigned long *)malloc(sizeof(unsigned long)*numtilesx*numtilesy)) == NULL
+		|| (jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)*numtilesx*numtilesy)) == NULL)
+		{
+			puts("ERROR: Could not allocate image buffers.");
+			goto bailout;
+		}
+		memset(jpegbuf, 0, sizeof(unsigned char *)*numtilesx*numtilesy);
+		for(i=0; i<numtilesx*numtilesy; i++)
+		{
+			if((jpegbuf[i]=(unsigned char *)malloc(TJBUFSIZE(tilesizex, tilesizey))) == NULL)
+			{
+				puts("ERROR: Could not allocate image buffers.");
+				goto bailout;
+			}
+		}
+
+		// Compression test
+		if(quiet) printf("%s\t%s\t%s\t%d\t",  _pfname[pf], bu?"BU":"TD",
+			_subnamel[jpegsub], qual);
+		for(i=0; i<h; i++) memcpy(&rgbbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
+		if((hnd=tjInitCompress())==NULL)
+		{
+			printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());
+			goto bailout;
+		}
+		_catch(tjCompress(hnd, rgbbuf, tilesizex, pitch, tilesizey, ps,
+			jpegbuf[0], &comptilesize[0], jpegsub, qual, flags));
+		ITER=0;
+		timer.start();
+		do
+		{
+			jpgbufsize=0;  int tilen=0;
+			for(i=0; i<h; i+=tilesizey)
+			{
+				for(j=0; j<w; j+=tilesizex)
+				{
+					int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
+					_catch(tjCompress(hnd, &rgbbuf[pitch*i+j*ps], tempw, pitch,
+						temph, ps, jpegbuf[tilen], &comptilesize[tilen], jpegsub, qual,
+						flags));
+					jpgbufsize+=comptilesize[tilen];
+					tilen++;
+				}
+			}
+			ITER++;
+		} while((elapsed=timer.elapsed())<5.);
+		_catch(tjDestroy(hnd));
+		if(quiet)
+		{
+			if(tilesizex==w && tilesizey==h) printf("Full     \t");
+			else printf("%-4d %-4d\t", tilesizex, tilesizey);
+			printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
+			printf("\t");
+			printsigfig((double)(w*h*ps)/(double)jpgbufsize, 4);
+			printf("\t");
+		}
+		else
+		{
+			if(tilesizex==w && tilesizey==h) printf("\nFull image\n");
+			else printf("\nTile size: %d x %d\n", tilesizex, tilesizey);
+			printf("C--> Frame rate:           %f fps\n", (double)ITER/elapsed);
+			printf("     Output image size:    %d bytes\n", jpgbufsize);
+			printf("     Compression ratio:    %f:1\n",
+				(double)(w*h*ps)/(double)jpgbufsize);
+			printf("     Source throughput:    %f Megapixels/sec\n",
+				(double)(w*h)/1000000.*(double)ITER/elapsed);
+			printf("     Output bit stream:    %f Megabits/sec\n",
+				(double)jpgbufsize*8./1000000.*(double)ITER/elapsed);
+		}
+		if(tilesizex==w && tilesizey==h)
+		{
+			sprintf(tempstr, "%s_%sQ%d.jpg", filename, _subnames[jpegsub], qual);
+			if((outfile=fopen(tempstr, "wb"))==NULL)
+			{
+				puts("ERROR: Could not open reference image");
+				exit(1);
+			}
+			if(fwrite(jpegbuf[0], jpgbufsize, 1, outfile)!=1)
+			{
+				puts("ERROR: Could not write reference image");
+				exit(1);
+			}
+			fclose(outfile);
+			if(!quiet) printf("Reference image written to %s\n", tempstr);
+		}
+
+		// Decompression test
+		memset(rgbbuf, 127, pitch*h);  // Grey image means decompressor did nothing
+		if((hnd=tjInitDecompress())==NULL)
+		{
+			printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr());
+			goto bailout;
+		}
+		_catch(tjDecompress(hnd, jpegbuf[0], jpgbufsize, rgbbuf, tilesizex, pitch,
+			tilesizey, ps, flags));
+		ITER=0;
+		timer.start();
+		do
+		{
+			int tilen=0;
+			for(i=0; i<h; i+=tilesizey)
+			{
+				for(j=0; j<w; j+=tilesizex)
+				{
+					int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
+					_catch(tjDecompress(hnd, jpegbuf[tilen], comptilesize[tilen],
+						&rgbbuf[pitch*i+ps*j], tempw, pitch, temph, ps, flags));
+					tilen++;
+				}
+			}
+			ITER++;
+		}	while((elapsed=timer.elapsed())<5.);
+		_catch(tjDestroy(hnd));
+		if(quiet)
+		{
+			printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
+			printf("\n");
+		}
+		else
+		{
+			printf("D--> Frame rate:           %f fps\n", (double)ITER/elapsed);
+			printf("     Dest. throughput:     %f Megapixels/sec\n",
+				(double)(w*h)/1000000.*(double)ITER/elapsed);
+		}
+		if(tilesizex==w && tilesizey==h)
+			sprintf(tempstr, "%s_%sQ%d_full.%s", filename, _subnames[jpegsub], qual,
+				useppm?"ppm":"bmp");
+		else sprintf(tempstr, "%s_%sQ%d_%dx%d.%s", filename, _subnames[jpegsub],
+			qual, tilesizex, tilesizey, useppm?"ppm":"bmp");
+		if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
+		{
+			printf("ERROR saving bitmap: %s\n", bmpgeterr());
+			goto bailout;
+		}
+		sprintf(strrchr(tempstr, '.'), "-err.%s", useppm?"ppm":"bmp");
+		if(!quiet)
+			printf("Computing compression error and saving to %s.\n", tempstr);
+		if(jpegsub==TJ_GRAYSCALE)
+		{
+			for(j=0; j<h; j++)
+			{
+				for(i=0; i<w*ps; i+=ps)
+				{
+					int y=(int)((double)srcbuf[w*ps*j+i+_rindex[pf]]*0.299
+						+ (double)srcbuf[w*ps*j+i+_gindex[pf]]*0.587
+						+ (double)srcbuf[w*ps*j+i+_bindex[pf]]*0.114 + 0.5);
+					if(y>255) y=255;  if(y<0) y=0;
+					rgbbuf[pitch*j+i+_rindex[pf]]=abs(rgbbuf[pitch*j+i+_rindex[pf]]-y);
+					rgbbuf[pitch*j+i+_gindex[pf]]=abs(rgbbuf[pitch*j+i+_gindex[pf]]-y);
+					rgbbuf[pitch*j+i+_bindex[pf]]=abs(rgbbuf[pitch*j+i+_bindex[pf]]-y);
+				}
+			}
+		}		
+		else
+		{
+			for(j=0; j<h; j++) for(i=0; i<w*ps; i++)
+				rgbbuf[pitch*j+i]=abs(rgbbuf[pitch*j+i]-srcbuf[w*ps*j+i]);
+		}
+		if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
+		{
+			printf("ERROR saving bitmap: %s\n", bmpgeterr());
+			goto bailout;
+		}
+
+		// Cleanup
+		if(jpegbuf)
+		{
+			for(i=0; i<numtilesx*numtilesy; i++)
+				{if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;}
+			free(jpegbuf);  jpegbuf=NULL;
+		}
+		if(comptilesize) {free(comptilesize);  comptilesize=NULL;}
+	} while(tilesizex<w || tilesizey<h);
+
+	if(rgbbuf) {free(rgbbuf);  rgbbuf=NULL;}
+	return;
+
+	bailout:
+	if(jpegbuf)
+	{
+		for(i=0; i<numtilesx*numtilesy; i++)
+			{if(jpegbuf[i]) free(jpegbuf[i]);  jpegbuf[i]=NULL;}
+		free(jpegbuf);  jpegbuf=NULL;
+	}
+	if(comptilesize) {free(comptilesize);  comptilesize=NULL;}
+	if(rgbbuf) {free(rgbbuf);  rgbbuf=NULL;}
+	return;
+}
+
+
+int main(int argc, char *argv[])
+{
+	unsigned char *bmpbuf=NULL;  int w, h, i, useppm=0;
+	int qual, dotile=0, quiet=0, hiqual=-1;  char *temp;
+	BMPPIXELFORMAT pf=BMP_BGR;
+	int bu=0;
+
+	printf("\n");
+
+	if(argc<3)
+	{
+		printf("USAGE: %s <Inputfile (BMP|PPM)> <%% Quality>\n\n", argv[0]);
+		printf("       [-tile]\n");
+		printf("       Test performance of the codec when the image is encoded\n");
+		printf("       as separate tiles of varying sizes.\n\n");
+		printf("       [-forcemmx] [-forcesse] [-forcesse2] [-forcesse3]\n");
+		printf("       Force MMX, SSE, or SSE2 code paths in Intel codec\n\n");
+		printf("       [-rgb | -bgr | -rgba | -bgra | -abgr | -argb]\n");
+		printf("       Test the specified color conversion path in the codec (default: BGR)\n\n");
+		printf("       [-fastupsample]\n");
+		printf("       Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
+		printf("       YUV decoding in libjpeg decompressor\n\n");
+		printf("       [-quiet]\n");
+		printf("       Output in tabular rather than verbose format\n\n");
+		printf("       NOTE: If the quality is specified as a range, i.e. 90-100, a separate\n");
+		printf("       test will be performed for all quality values in the range.\n");
+		exit(1);
+	}
+	if((qual=atoi(argv[2]))<1 || qual>100)
+	{
+		puts("ERROR: Quality must be between 1 and 100.");
+		exit(1);
+	}
+	if((temp=strchr(argv[2], '-'))!=NULL && strlen(temp)>1
+		&& sscanf(&temp[1], "%d", &hiqual)==1 && hiqual>qual && hiqual>=1
+		&& hiqual<=100) {}
+	else hiqual=qual;
+
+	if(argc>3)
+	{
+		for(i=3; i<argc; i++)
+		{
+			if(!stricmp(argv[i], "-tile")) dotile=1;
+			if(!stricmp(argv[i], "-forcesse3"))
+			{
+				printf("Using SSE3 code in Intel compressor\n");
+				forcesse3=1;
+			}
+			if(!stricmp(argv[i], "-forcesse2"))
+			{
+				printf("Using SSE2 code in Intel compressor\n");
+				forcesse2=1;
+			}
+			if(!stricmp(argv[i], "-forcesse"))
+			{
+				printf("Using SSE code in Intel compressor\n");
+				forcesse=1;
+			}
+			if(!stricmp(argv[i], "-forcemmx"))
+			{
+				printf("Using MMX code in Intel compressor\n");
+				forcemmx=1;
+			}
+			if(!stricmp(argv[i], "-fastupsample"))
+			{
+				printf("Using fast upsampling code\n");
+				fastupsample=1;
+			}
+			if(!stricmp(argv[i], "-rgb")) pf=BMP_RGB;
+			if(!stricmp(argv[i], "-rgba")) pf=BMP_RGBA;
+			if(!stricmp(argv[i], "-bgr")) pf=BMP_BGR;
+			if(!stricmp(argv[i], "-bgra")) pf=BMP_BGRA;
+			if(!stricmp(argv[i], "-abgr")) pf=BMP_ABGR;
+			if(!stricmp(argv[i], "-argb")) pf=BMP_ARGB;
+			if(!stricmp(argv[i], "-bottomup")) bu=1;
+			if(!stricmp(argv[i], "-quiet")) quiet=1;
+		}
+	}
+
+	if(loadbmp(argv[1], &bmpbuf, &w, &h, pf, 1, bu)==-1)
+	{
+		printf("ERROR loading bitmap: %s\n", bmpgeterr());  exit(1);
+	}
+
+	temp=strrchr(argv[1], '.');
+	if(temp!=NULL)
+	{
+		if(!stricmp(temp, ".ppm")) useppm=1;
+		*temp='\0';
+	}
+
+	if(quiet)
+	{
+		printf("All performance values in Mpixels/sec\n\n");
+		printf("Bitmap\tBitmap\tJPEG\tJPEG\tTile Size\tCompr\tCompr\tDecomp\n");
+		printf("Format\tOrder\tFormat\tQual\t X    Y  \tPerf \tRatio\tPerf\n\n");
+	}
+
+	for(i=hiqual; i>=qual; i--)
+		dotest(bmpbuf, w, h, pf, bu, TJ_GRAYSCALE, i, argv[1], dotile, useppm, quiet);
+	if(quiet) printf("\n");
+	for(i=hiqual; i>=qual; i--)
+		dotest(bmpbuf, w, h, pf, bu, TJ_420, i, argv[1], dotile, useppm, quiet);
+	if(quiet) printf("\n");
+	for(i=hiqual; i>=qual; i--)
+		dotest(bmpbuf, w, h, pf, bu, TJ_422, i, argv[1], dotile, useppm, quiet);
+	if(quiet) printf("\n");
+	for(i=hiqual; i>=qual; i--)
+		dotest(bmpbuf, w, h, pf, bu, TJ_444, i, argv[1], dotile, useppm, quiet);
+
+	if(bmpbuf) free(bmpbuf);
+	return 0;
+}

diff --git a/jquant1.c b/jquant1.c
index b2f96aa..362bb1e 100644
--- a/jquant1.c
+++ b/jquant1.c

@@ -2,6 +2,7 @@
  * jquant1.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -193,7 +194,10 @@
   int total_colors, iroot, i, j;
   boolean changed;
   long temp;
-  static const int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+  int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+  RGB_order[0] = rgb_green[cinfo->out_color_space];
+  RGB_order[1] = rgb_red[cinfo->out_color_space];
+  RGB_order[2] = rgb_blue[cinfo->out_color_space];
 
   /* We can allocate at least the nc'th root of max_colors per component. */
   /* Compute floor(nc'th root of max_colors). */

diff --git a/jquant2.c b/jquant2.c
index af601e3..da964f7 100644
--- a/jquant2.c
+++ b/jquant2.c

@@ -2,6 +2,7 @@
  * jquant2.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -74,29 +75,10 @@
 #define G_SCALE 3		/* scale G distances by this much */
 #define B_SCALE 1		/* and B by this much */
 
-/* Relabel R/G/B as components 0/1/2, respecting the RGB ordering defined
- * in jmorecfg.h.  As the code stands, it will do the right thing for R,G,B
- * and B,G,R orders.  If you define some other weird order in jmorecfg.h,
- * you'll get compile errors until you extend this logic.  In that case
- * you'll probably want to tweak the histogram sizes too.
- */
-
-#if RGB_RED == 0
-#define C0_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 0
-#define C0_SCALE B_SCALE
-#endif
-#if RGB_GREEN == 1
-#define C1_SCALE G_SCALE
-#endif
-#if RGB_RED == 2
-#define C2_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 2
-#define C2_SCALE B_SCALE
-#endif
-
+static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
+#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
+#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
+#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
 
 /*
  * First we have the histogram data structure and routines for creating it.
@@ -454,15 +436,16 @@
     /* We want to break any ties in favor of green, then red, blue last.
      * This code does the right thing for R,G,B or B,G,R color orders only.
      */
-#if RGB_RED == 0
-    cmax = c1; n = 1;
-    if (c0 > cmax) { cmax = c0; n = 0; }
-    if (c2 > cmax) { n = 2; }
-#else
-    cmax = c1; n = 1;
-    if (c2 > cmax) { cmax = c2; n = 2; }
-    if (c0 > cmax) { n = 0; }
-#endif
+    if (rgb_red[cinfo->out_color_space] == 0) {
+      cmax = c1; n = 1;
+      if (c0 > cmax) { cmax = c0; n = 0; }
+      if (c2 > cmax) { n = 2; }
+    }
+    else {
+      cmax = c1; n = 1;
+      if (c2 > cmax) { cmax = c2; n = 2; }
+      if (c0 > cmax) { n = 0; }
+    }
     /* Choose split point along selected axis, and update box bounds.
      * Current algorithm: split at halfway point.
      * (Since the box has been shrunk to minimum volume,

diff --git a/jsimd.h b/jsimd.h
new file mode 100644
index 0000000..90be19e
--- /dev/null
+++ b/jsimd.h

@@ -0,0 +1,89 @@
+/*
+ * jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jsimd_can_rgb_ycc                 jSCanRgbYcc
+#define jsimd_can_ycc_rgb                 jSCanYccRgb
+#define jsimd_rgb_ycc_convert             jSRgbYccConv
+#define jsimd_ycc_rgb_convert             jSYccRgbConv
+#define jsimd_can_h2v2_downsample         jSCanH2V2Down
+#define jsimd_can_h2v1_downsample         jSCanH2V1Down
+#define jsimd_h2v2_downsample             jSH2V2Down
+#define jsimd_h2v1_downsample             jSH2V1Down
+#define jsimd_can_h2v2_upsample           jSCanH2V2Up
+#define jsimd_can_h2v1_upsample           jSCanH2V1Up
+#define jsimd_h2v2_upsample               jSH2V2Up
+#define jsimd_h2v1_upsample               jSH2V1Up
+#define jsimd_can_h2v2_fancy_upsample     jSCanH2V2FUp
+#define jsimd_can_h2v1_fancy_upsample     jSCanH2V1FUp
+#define jsimd_h2v2_fancy_upsample         jSH2V2FUp
+#define jsimd_h2v1_fancy_upsample         jSH2V1FUp
+#define jsimd_can_h2v2_merged_upsample    jSCanH2V2MUp
+#define jsimd_can_h2v1_merged_upsample    jSCanH2V1MUp
+#define jsimd_h2v2_merged_upsample        jSH2V2MUp
+#define jsimd_h2v1_merged_upsample        jSH2V1MUp
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_rgb_ycc JPP((void));
+EXTERN(int) jsimd_can_ycc_rgb JPP((void));
+
+EXTERN(void) jsimd_rgb_ycc_convert
+        JPP((j_compress_ptr cinfo,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_ycc_rgb_convert
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(int) jsimd_can_h2v2_downsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_downsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_downsample
+        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample
+        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(int) jsimd_can_h2v2_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_upsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_fancy_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_fancy_upsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_fancy_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_merged_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_merged_upsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_merged_upsample
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+             JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+             JSAMPARRAY output_buf));
+

diff --git a/jsimd_none.c b/jsimd_none.c
new file mode 100644
index 0000000..8960802
--- /dev/null
+++ b/jsimd_none.c

@@ -0,0 +1,299 @@
+/*
+ * jsimd_none.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This file contains stubs for when there is no SIMD support available.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+#include "jdct.h"
+#include "jsimddct.h"
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+

diff --git a/jsimddct.h b/jsimddct.h
new file mode 100644
index 0000000..d73d0c4
--- /dev/null
+++ b/jsimddct.h

@@ -0,0 +1,101 @@
+/*
+ * jsimddct.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jsimd_can_convsamp                jSCanConv
+#define jsimd_can_convsamp_float          jSCanConvF
+#define jsimd_convsamp                    jSConv
+#define jsimd_convsamp_float              jSConvF
+#define jsimd_can_fdct_islow              jSCanFDCTIS
+#define jsimd_can_fdct_ifast              jSCanFDCTIF
+#define jsimd_can_fdct_float              jSCanFDCTFl
+#define jsimd_fdct_islow                  jSFDCTIS
+#define jsimd_fdct_ifast                  jSFDCTIF
+#define jsimd_fdct_float                  jSFDCTFl
+#define jsimd_can_quantize                jSCanQuant
+#define jsimd_can_quantize_float          jSCanQuantF
+#define jsimd_quantize                    jSQuant
+#define jsimd_quantize_float              jSQuantF
+#define jsimd_can_idct_2x2                jSCanIDCT22
+#define jsimd_can_idct_4x4                jSCanIDCT44
+#define jsimd_idct_2x2                    jSIDCT22
+#define jsimd_idct_4x4                    jSIDCT44
+#define jsimd_can_idct_islow              jSCanIDCTIS
+#define jsimd_can_idct_ifast              jSCanIDCTIF
+#define jsimd_can_idct_float              jSCanIDCTFl
+#define jsimd_idct_islow                  jSIDCTIS
+#define jsimd_idct_ifast                  jSIDCTIF
+#define jsimd_idct_float                  jSIDCTFl
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_convsamp JPP((void));
+EXTERN(int) jsimd_can_convsamp_float JPP((void));
+
+EXTERN(void) jsimd_convsamp JPP((JSAMPARRAY sample_data,
+                                 JDIMENSION start_col,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_convsamp_float JPP((JSAMPARRAY sample_data,
+                                       JDIMENSION start_col,
+                                       FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_fdct_islow JPP((void));
+EXTERN(int) jsimd_can_fdct_ifast JPP((void));
+EXTERN(int) jsimd_can_fdct_float JPP((void));
+
+EXTERN(void) jsimd_fdct_islow JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_float JPP((FAST_FLOAT * data));
+
+EXTERN(int) jsimd_can_quantize JPP((void));
+EXTERN(int) jsimd_can_quantize_float JPP((void));
+
+EXTERN(void) jsimd_quantize JPP((JCOEFPTR coef_block,
+                                 DCTELEM * divisors,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_float JPP((JCOEFPTR coef_block,
+                                       FAST_FLOAT * divisors,
+                                       FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_idct_2x2 JPP((void));
+EXTERN(int) jsimd_can_idct_4x4 JPP((void));
+
+EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+
+EXTERN(int) jsimd_can_idct_islow JPP((void));
+EXTERN(int) jsimd_can_idct_ifast JPP((void));
+EXTERN(int) jsimd_can_idct_float JPP((void));
+
+EXTERN(void) jsimd_idct_islow JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_float JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+

diff --git a/ltconfig b/ltconfig
deleted file mode 100755
index 2347e69..0000000
--- a/ltconfig
+++ /dev/null

@@ -1,1512 +0,0 @@
-#! /bin/sh
-
-# ltconfig - Create a system-specific libtool.
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# A lot of this script is taken from autoconf-2.10.
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test "${CDPATH+set}" = set; then CDPATH=; export CDPATH; fi
-
-echo=echo
-if test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then :
-else
-  # The Solaris and AIX default echo program unquotes backslashes.
-  # This makes it impossible to quote backslashes using
-  #   echo "$something" | sed 's/\\/\\\\/g'
-  # So, we emulate echo with printf '%s\n'
-  echo="printf %s\\n"
-  if test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then :
-  else
-    # Oops.  We have no working printf.  Try to find a not-so-buggy echo.
-    echo=echo
-    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-    for dir in $PATH /usr/ucb; do
-      if test -f $dir/echo && test "X`$dir/echo '\t'`" = 'X\t'; then
-        echo="$dir/echo"
-        break
-      fi
-    done
-    IFS="$save_ifs"
-  fi
-fi
-
-# Sed substitution that helps us do robust quoting.  It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
-sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
-
-# Same as above, but do not quote variable references.
-double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
-
-# The name of this program.
-progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'`
-
-# Constants:
-PROGRAM=ltconfig
-PACKAGE=libtool
-VERSION=1.2
-ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.c 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.c $LIBS 1>&5'
-rm="rm -f"
-
-help="Try \`$progname --help' for more information."
-
-# Global variables:
-can_build_shared=yes
-enable_shared=yes
-# All known linkers require a `.a' archive for static linking.
-enable_static=yes
-ltmain=
-silent=
-srcdir=
-ac_config_guess=
-ac_config_sub=
-host=
-nonopt=
-verify_host=yes
-with_gcc=no
-with_gnu_ld=no
-
-old_AR="$AR"
-old_CC="$CC"
-old_CFLAGS="$CFLAGS"
-old_CPPFLAGS="$CPPFLAGS"
-old_LD="$LD"
-old_LN_S="$LN_S"
-old_NM="$NM"
-old_RANLIB="$RANLIB"
-
-# Parse the command line options.
-args=
-prev=
-for option
-do
-  case "$option" in
-  -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
-  *) optarg= ;;
-  esac
-
-  # If the previous option needs an argument, assign it.
-  if test -n "$prev"; then
-    eval "$prev=\$option"
-    prev=
-    continue
-  fi
-
-  case "$option" in
-  --help) cat <<EOM
-Usage: $progname [OPTION]... LTMAIN [HOST]
-
-Generate a system-specific libtool script.
-
-    --disable-shared       do not build shared libraries
-    --disable-static       do not build static libraries
-    --help                 display this help and exit
-    --no-verify            do not verify that HOST is a valid host type
-    --quiet                same as \`--silent'
-    --silent               do not print informational messages
-    --srcdir=DIR           find \`config.guess' in DIR
-    --version              output version information and exit
-    --with-gcc             assume that the GNU C compiler will be used
-    --with-gnu-ld          assume that the C compiler uses the GNU linker
-
-LTMAIN is the \`ltmain.sh' shell script fragment that provides basic libtool
-functionality.
-
-HOST is the canonical host system name [default=guessed].
-EOM
-  exit 0
-  ;;
-
-  --disable-shared) enable_shared=no ;;
-
-  --disable-static) enable_static=no ;;
-
-  --quiet | --silent) silent=yes ;;
-
-  --srcdir) prev=srcdir ;;
-  --srcdir=*) srcdir="$optarg" ;;
-
-  --no-verify) verify_host=no ;;
-
-  --version) echo "$PROGRAM (GNU $PACKAGE) $VERSION"; exit 0 ;;
-
-  --with-gcc) with_gcc=yes ;;
-  --with-gnu-ld) with_gnu_ld=yes ;;
-
-  -*)
-    echo "$progname: unrecognized option \`$option'" 1>&2
-    echo "$help" 1>&2
-    exit 1
-    ;;
-
-  *)
-    if test -z "$ltmain"; then
-      ltmain="$option"
-    elif test -z "$host"; then
-# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1
-#      if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then
-#        echo "$progname: warning \`$option' is not a valid host type" 1>&2
-#      fi
-      host="$option"
-    else
-      echo "$progname: too many arguments" 1>&2
-      echo "$help" 1>&2
-      exit 1
-    fi ;;
-  esac
-done
-
-if test -z "$ltmain"; then
-  echo "$progname: you must specify a LTMAIN file" 1>&2
-  echo "$help" 1>&2
-  exit 1
-fi
-
-if test -f "$ltmain"; then :
-else
-  echo "$progname: \`$ltmain' does not exist" 1>&2
-  echo "$help" 1>&2
-  exit 1
-fi
-
-# Quote any args containing shell metacharacters.
-ltconfig_args=
-for arg
-do
-  case "$arg" in
-  *" "*|*"	"*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
-  ltconfig_args="$ltconfig_args '$arg'" ;;
-  *) ltconfig_args="$ltconfig_args $arg" ;;
-  esac
-done
-
-# A relevant subset of AC_INIT.
-
-# File descriptor usage:
-# 0 standard input
-# 1 file creation
-# 2 errors and warnings
-# 3 some systems may open it to /dev/tty
-# 4 used on the Kubota Titan
-# 5 compiler messages saved in config.log
-# 6 checking for... messages and results
-if test "$silent" = yes; then
-  exec 6>/dev/null
-else
-  exec 6>&1
-fi
-exec 5>>./config.log
-
-# NLS nuisances.
-# Only set LANG and LC_ALL to C if already set.
-# These must not be set unconditionally because not all systems understand
-# e.g. LANG=C (notably SCO).
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LANG+set}"   = set; then LANG=C;   export LANG;   fi
-
-if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
-  # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
-  if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
-    ac_n= ac_c='
-' ac_t='	'
-  else
-    ac_n=-n ac_c= ac_t=
-  fi
-else
-  ac_n= ac_c='\c' ac_t=
-fi
-
-if test -z "$srcdir"; then
-  # Assume the source directory is the same one as the path to ltmain.sh.
-  srcdir=`$echo "$ltmain" | $Xsed -e 's%/[^/]*$%%'`
-  test "$srcdir" = "$ltmain" && srcdir=.
-fi
-
-trap "$rm conftest*; exit 1" 1 2 15
-if test "$verify_host" = yes; then
-  # Check for config.guess and config.sub.
-  ac_aux_dir=
-  for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
-    if test -f $ac_dir/config.guess; then
-      ac_aux_dir=$ac_dir
-      break
-    fi
-  done
-  if test -z "$ac_aux_dir"; then
-    echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2
-    echo "$help" 1>&2
-    exit 1
-  fi
-  ac_config_guess=$ac_aux_dir/config.guess
-  ac_config_sub=$ac_aux_dir/config.sub
-
-  # Make sure we can run config.sub.
-  if $ac_config_sub sun4 >/dev/null 2>&1; then :
-  else
-    echo "$progname: cannot run $ac_config_sub" 1>&2
-    echo "$help" 1>&2
-    exit 1
-  fi
-
-  echo $ac_n "checking host system type""... $ac_c" 1>&6
-
-  host_alias=$host
-  case "$host_alias" in
-  "")
-    if host_alias=`$ac_config_guess`; then :
-    else
-      echo "$progname: cannot guess host type; you must specify one" 1>&2
-      echo "$help" 1>&2
-      exit 1
-    fi ;;
-  esac
-  host=`$ac_config_sub $host_alias`
-  echo "$ac_t$host" 1>&6
-
-  # Make sure the host verified.
-  test -z "$host" && exit 1
-
-elif test -z "$host"; then
-  echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2
-  echo "$help" 1>&2
-  exit 1
-else
-  host_alias=$host
-fi
-
-# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
-case "$host_os" in
-linux-gnu*) ;;
-linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
-esac
-
-host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
-host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
-host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
-
-case "$host_os" in
-aix3*)
-  # AIX sometimes has problems with the GCC collect2 program.  For some
-  # reason, if we set the COLLECT_NAMES environment variable, the problems
-  # vanish in a puff of smoke.
-  if test "${COLLECT_NAMES+set}" != set; then
-    COLLECT_NAMES=
-    export COLLECT_NAMES
-  fi
-  ;;
-esac
-
-# Determine commands to create old-style static archives.
-old_archive_cmds='$AR cru $oldlib$oldobjs'
-old_postinstall_cmds='chmod 644 $oldlib'
-old_postuninstall_cmds=
-
-# Set a sane default for `AR'.
-test -z "$AR" && AR=ar
-
-# If RANLIB is not set, then run the test.
-if test "${RANLIB+set}" != "set"; then
-  result=no
-
-  echo $ac_n "checking for ranlib... $ac_c" 1>&6
-  IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-  for dir in $PATH; do
-    test -z "$dir" && dir=.
-    if test -f $dir/ranlib; then
-      RANLIB="ranlib"
-      result="ranlib"
-      break
-    fi
-  done
-  IFS="$save_ifs"
-
-  echo "$ac_t$result" 1>&6
-fi
-
-if test -n "$RANLIB"; then
-  old_archive_cmds="$old_archive_cmds;\$RANLIB \$oldlib"
-  old_postinstall_cmds="\$RANLIB \$oldlib;$old_postinstall_cmds"
-fi
-
-# Check to see if we are using GCC.
-if test "$with_gcc" != yes || test -z "$CC"; then
-  # If CC is not set, then try to find GCC or a usable CC.
-  if test -z "$CC"; then
-    echo $ac_n "checking for gcc... $ac_c" 1>&6
-    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-    for dir in $PATH; do
-      IFS="$save_ifs"
-      test -z "$dir" && dir=.
-      if test -f $dir/gcc; then
-	CC="gcc"
-	break
-      fi
-    done
-    IFS="$save_ifs"
-
-    if test -n "$CC"; then
-      echo "$ac_t$CC" 1>&6
-    else
-      echo "$ac_t"no 1>&6
-    fi
-  fi
-
-  # Not "gcc", so try "cc", rejecting "/usr/ucb/cc".
-  if test -z "$CC"; then
-    echo $ac_n "checking for cc... $ac_c" 1>&6
-    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-    cc_rejected=no
-    for dir in $PATH; do
-      test -z "$dir" && dir=.
-      if test -f $dir/cc; then
-	if test "$dir/cc" = "/usr/ucb/cc"; then
-	  cc_rejected=yes
-	  continue
-	fi
-	CC="cc"
-	break
-      fi
-    done
-    IFS="$save_ifs"
-    if test $cc_rejected = yes; then
-      # We found a bogon in the path, so make sure we never use it.
-      set dummy $CC
-      shift
-      if test $# -gt 0; then
-	# We chose a different compiler from the bogus one.
-	# However, it has the same name, so the bogon will be chosen
-	# first if we set CC to just the name; use the full file name.
-	shift
-	set dummy "$dir/cc" "$@"
-	shift
-	CC="$@"
-      fi
-    fi
-
-    if test -n "$CC"; then
-      echo "$ac_t$CC" 1>&6
-    else
-      echo "$ac_t"no 1>&6
-    fi
-
-    if test -z "$CC"; then
-      echo "$progname: error: no acceptable cc found in \$PATH" 1>&2
-      exit 1
-    fi
-  fi
-
-  # Now see if the compiler is really GCC.
-  with_gcc=no
-  echo $ac_n "checking whether we are using GNU C... $ac_c" 1>&6
-  echo "$progname:424: checking whether we are using GNU C" >&5
-
-  $rm conftest.c
-  cat > conftest.c <<EOF
-#ifdef __GNUC__
-  yes;
-#endif
-EOF
-  if { ac_try='${CC-cc} -E conftest.c'; { (eval echo $progname:432: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
-    with_gcc=yes
-  fi
-  $rm conftest.c
-  echo "$ac_t$with_gcc" 1>&6
-fi
-
-# Allow CC to be a program name with arguments.
-set dummy $CC
-compiler="$2"
-
-echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6
-pic_flag=
-special_shlib_compile_flags=
-wl=
-link_static_flag=
-no_builtin_flag=
-
-if test "$with_gcc" = yes; then
-  wl='-Wl,'
-  link_static_flag='-static'
-  no_builtin_flag=' -fno-builtin'
-
-  case "$host_os" in
-  aix3* | aix4* | irix5* | irix6* | osf3* | osf4*)
-    # PIC is the default for these OSes.
-    ;;
-  os2*)
-    # We can build DLLs from non-PIC.
-    ;;
-  amigaos*)
-    # FIXME: we need at least 68020 code to build shared libraries, but
-    # adding the `-m68020' flag to GCC prevents building anything better,
-    # like `-m68040'.
-    pic_flag='-m68020 -resident32 -malways-restore-a4'
-    ;;
-  *)
-    pic_flag='-fPIC'
-    ;;
-  esac
-else
-  # PORTME Check for PIC flags for the system compiler.
-  case "$host_os" in
-  aix3* | aix4*)
-    # All AIX code is PIC.
-    link_static_flag='-bnso -bI:/lib/syscalls.exp'
-    ;;
-
-  hpux9* | hpux10*)
-    # Is there a better link_static_flag that works with the bundled CC?
-    wl='-Wl,'
-    link_static_flag="${wl}-a ${wl}archive"
-    pic_flag='+Z'
-    ;;
-
-  irix5* | irix6*)
-    wl='-Wl,'
-    link_static_flag='-non_shared'
-    # PIC (with -KPIC) is the default.
-    ;;
-
-  os2*)
-    # We can build DLLs from non-PIC.
-    ;;
-
-  osf3* | osf4*)
-    # All OSF/1 code is PIC.
-    wl='-Wl,'
-    link_static_flag='-non_shared'
-    ;;
-
-  sco3.2v5*)
-    pic_flag='-Kpic'
-    link_static_flag='-dn'
-    special_shlib_compile_flags='-belf'
-    ;;
-
-  solaris2*)
-    pic_flag='-KPIC'
-    link_static_flag='-Bstatic'
-    wl='-Wl,'
-    ;;
-
-  sunos4*)
-    pic_flag='-PIC'
-    link_static_flag='-Bstatic'
-    wl='-Qoption ld '
-    ;;
-
-  sysv4.2uw2*)
-    pic_flag='-KPIC'
-    link_static_flag='-Bstatic'
-    wl='-Wl,'
-    ;;
-
-  uts4*)
-    pic_flag='-pic'
-    link_static_flag='-Bstatic'
-    ;;
-
-  *)
-    can_build_shared=no
-    ;;
-  esac
-fi
-
-if test -n "$pic_flag"; then
-  echo "$ac_t$pic_flag" 1>&6
-
-  # Check to make sure the pic_flag actually works.
-  echo $ac_n "checking if $compiler PIC flag $pic_flag works... $ac_c" 1>&6
-  $rm conftest*
-  echo > conftest.c
-  save_CFLAGS="$CFLAGS"
-  CFLAGS="$CFLAGS $pic_flag -DPIC"
-  echo "$progname:547: checking if $compiler PIC flag $pic_flag works" >&5
-  if { (eval echo $progname:548: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.o; then
-    # Append any warnings to the config.log.
-    cat conftest.err 1>&5
-
-    # On HP-UX, both CC and GCC only warn that PIC is supported... then they
-    # create non-PIC objects.  So, if there were any warnings, we assume that
-    # PIC is not supported.
-    if test -s conftest.err; then
-      echo "$ac_t"no 1>&6
-      can_build_shared=no
-      pic_flag=
-    else
-      echo "$ac_t"yes 1>&6
-      pic_flag=" $pic_flag"
-    fi
-  else
-    # Append any errors to the config.log.
-    cat conftest.err 1>&5
-    can_build_shared=no
-    pic_flag=
-    echo "$ac_t"no 1>&6
-  fi
-  CFLAGS="$save_CFLAGS"
-  $rm conftest*
-else
-  echo "$ac_t"none 1>&6
-fi
-
-# Check for any special shared library compilation flags.
-if test -n "$special_shlib_compile_flags"; then
-  echo "$progname: warning: \`$CC' requires \`$special_shlib_compile_flags' to build shared libraries" 1>&2
-  if echo "$old_CC $old_CFLAGS " | egrep -e "[ 	]$special_shlib_compile_flags[ 	]" >/dev/null; then :
-  else
-    echo "$progname: add \`$special_shlib_compile_flags' to the CC or CFLAGS env variable and reconfigure" 1>&2
-    can_build_shared=no
-  fi
-fi
-
-echo $ac_n "checking if $compiler static flag $link_static_flag works... $ac_c" 1>&6
-$rm conftest*
-echo 'main(){return(0);}' > conftest.c
-save_LDFLAGS="$LDFLAGS"
-LDFLAGS="$LDFLAGS $link_static_flag"
-echo "$progname:591: checking if $compiler static flag $link_static_flag works" >&5
-if { (eval echo $progname:592: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-  echo "$ac_t$link_static_flag" 1>&6
-else
-  echo "$ac_t"none 1>&6
-  link_static_flag=
-fi
-LDFLAGS="$save_LDFLAGS"
-$rm conftest*
-
-if test -z "$LN_S"; then
-  # Check to see if we can use ln -s, or we need hard links.
-  echo $ac_n "checking whether ln -s works... $ac_c" 1>&6
-  $rm conftestdata
-  if ln -s X conftestdata 2>/dev/null; then
-    $rm conftestdata
-    LN_S="ln -s"
-  else
-    LN_S=ln
-  fi
-  if test "$LN_S" = "ln -s"; then
-    echo "$ac_t"yes 1>&6
-  else
-    echo "$ac_t"no 1>&6
-  fi
-fi
-
-# Make sure LD is an absolute path.
-if test -z "$LD"; then
-  ac_prog=ld
-  if test "$with_gcc" = yes; then
-    # Check if gcc -print-prog-name=ld gives a path.
-    echo $ac_n "checking for ld used by GCC... $ac_c" 1>&6
-    echo "$progname:624: checking for ld used by GCC" >&5
-    ac_prog=`($CC -print-prog-name=ld) 2>&5`
-    case "$ac_prog" in
-    # Accept absolute paths.
-    /* | [A-Za-z]:\\*)
-      test -z "$LD" && LD="$ac_prog"
-      ;;
-    "")
-      # If it fails, then pretend we are not using GCC.
-      ac_prog=ld
-      ;;
-    *)
-      # If it is relative, then search for the first ld in PATH.
-      with_gnu_ld=unknown
-      ;;
-    esac
-  elif test "$with_gnu_ld" = yes; then
-    echo $ac_n "checking for GNU ld... $ac_c" 1>&6
-    echo "$progname:642: checking for GNU ld" >&5
-  else
-    echo $ac_n "checking for non-GNU ld""... $ac_c" 1>&6
-    echo "$progname:645: checking for non-GNU ld" >&5
-  fi
-
-  if test -z "$LD"; then
-    IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-    for ac_dir in $PATH; do
-      test -z "$ac_dir" && ac_dir=.
-      if test -f "$ac_dir/$ac_prog"; then
-	LD="$ac_dir/$ac_prog"
-	# Check to see if the program is GNU ld.  I'd rather use --version,
-	# but apparently some GNU ld's only accept -v.
-	# Break only if it was the GNU/non-GNU ld that we prefer.
-	if "$LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
-	  test "$with_gnu_ld" != no && break
-	else
-	  test "$with_gnu_ld" != yes && break
-	fi
-      fi
-    done
-    IFS="$ac_save_ifs"
-  fi
-
-  if test -n "$LD"; then
-    echo "$ac_t$LD" 1>&6
-  else
-    echo "$ac_t"no 1>&6
-  fi
-
-  if test -z "$LD"; then
-    echo "$progname: error: no acceptable ld found in \$PATH" 1>&2
-    exit 1
-  fi
-fi
-
-# Check to see if it really is or is not GNU ld.
-echo $ac_n "checking if the linker ($LD) is GNU ld... $ac_c" 1>&6
-# I'd rather use --version here, but apparently some GNU ld's only accept -v.
-if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
-  with_gnu_ld=yes
-else
-  with_gnu_ld=no
-fi
-echo "$ac_t$with_gnu_ld" 1>&6
-
-# See if the linker supports building shared libraries.
-echo $ac_n "checking whether the linker ($LD) supports shared libraries... $ac_c" 1>&6
-
-allow_undefined_flag=
-no_undefined_flag=
-archive_cmds=
-old_archive_from_new_cmds=
-export_dynamic_flag_spec=
-hardcode_libdir_flag_spec=
-hardcode_libdir_separator=
-hardcode_direct=no
-hardcode_minus_L=no
-hardcode_shlibpath_var=unsupported
-runpath_var=
-
-case "$host_os" in
-amigaos* | sunos4*)
-  # On these operating systems, we should treat GNU ld like the system ld.
-  gnu_ld_acts_native=yes
-  ;;
-*)
-  gnu_ld_acts_native=no
-  ;;
-esac
-
-ld_shlibs=yes
-if test "$with_gnu_ld" = yes && test "$gnu_ld_acts_native" != yes; then
-
-  # See if GNU ld supports shared libraries.
-  if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
-    archive_cmds='$CC -shared ${wl}-soname $wl$soname -o $lib$libobjs'
-    runpath_var=LD_RUN_PATH
-    ld_shlibs=yes
-  else
-    ld_shlibs=no
-  fi
-
-  if test "$ld_shlibs" = yes; then
-    hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
-    export_dynamic_flag_spec='${wl}--export-dynamic'
-  fi
-else
-  # PORTME fill in a description of your system's linker (not GNU ld)
-  case "$host_os" in
-  aix3*)
-    allow_undefined_flag=unsupported
-    archive_cmds='$NM$libobjs | $global_symbol_pipe | sed '\''s/.* //'\'' > $lib.exp;$LD -o $objdir/$soname$libobjs -bE:$lib.exp -T512 -H512 -bM:SRE;$AR cru $lib $objdir/$soname'
-    # Note: this linker hardcodes the directories in LIBPATH if there
-    # are no directories specified by -L.
-    hardcode_minus_L=yes
-    if test "$with_gcc" = yes && test -z "$link_static_flag"; then
-      # Neither direct hardcoding nor static linking is supported with a
-      # broken collect2.
-      hardcode_direct=unsupported
-    fi
-    ;;
-
-  aix4*)
-    allow_undefined_flag=unsupported
-    archive_cmds='$NM$libobjs | $global_symbol_pipe | sed '\''s/.* //'\'' > $lib.exp;$CC -o $objdir/$soname$libobjs ${wl}-bE:$lib.exp ${wl}-bM:SRE ${wl}-bnoentry;$AR cru $lib $objdir/$soname'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    ;;
-
-  amigaos*)
-    archive_cmds='$rm $objdir/a2ixlibrary.data;$echo "#define NAME $libname" > $objdir/a2ixlibrary.data;$echo "#define LIBRARY_ID 1" >> $objdir/a2ixlibrary.data;$echo "#define VERSION $major" >> $objdir/a2ixlibrary.data;$echo "#define REVISION $revision" >> $objdir/a2ixlibrary.data;$AR cru $lib$libobjs;$RANLIB $lib;(cd $objdir && a2ixlibrary -32)'
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_minus_L=yes
-    ;;
-
-  # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
-  # support.  Future versions do this automatically, but an explicit c++rt0.o
-  # does not break anything, and helps significantly (at the cost of a little
-  # extra space).
-  freebsd2.2*)
-    archive_cmds='$LD -Bshareable -o $lib$libobjs /usr/lib/c++rt0.o'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  # Unfortunately, older versions of FreeBSD 2 do not have this feature.
-  freebsd2*)
-    archive_cmds='$LD -Bshareable -o $lib$libobjs'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  # FreeBSD 3, at last, uses gcc -shared to do shared libraries.
-  freebsd3*)
-    archive_cmds='$CC -shared -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  hpux9*)
-    archive_cmds='$rm $objdir/$soname;$LD -b +s +b $install_libdir -o $objdir/$soname$libobjs;mv $objdir/$soname $lib'
-    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    export_dynamic_flag_spec='${wl}-E'
-    ;;
-
-  hpux10*)
-    archive_cmds='$LD -b +h $soname +s +b $install_libdir -o $lib$libobjs'
-    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    export_dynamic_flag_spec='${wl}-E'
-    ;;
-
-  irix5* | irix6*)
-    archive_cmds='$LD -shared -o $lib -soname $soname -set_version $verstring$libobjs'
-    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
-    ;;
-
-  netbsd*)
-    # Tested with NetBSD 1.2 ld
-    archive_cmds='$LD -Bshareable -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  openbsd*)
-    archive_cmds='$LD -Bshareable -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  os2*)
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_minus_L=yes
-    allow_undefined_flag=unsupported
-    archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $objdir/$libname.def;$echo "DESCRIPTION \"$libname\"" >> $objdir/$libname.def;$echo DATA >> $objdir/$libname.def;$echo " SINGLE NONSHARED" >> $objdir/$libname.def;$echo EXPORTS >> $objdir/$libname.def;emxexp$libobjs >> $objdir/$libname.def;$CC -Zdll -Zcrtdll -o $lib$libobjs $objdir/$libname.def'
-    old_archive_from_new_cmds='emximp -o $objdir/$libname.a $objdir/$libname.def'
-    ;;
-
-  osf3* | osf4*)
-    allow_undefined_flag=' -expect_unresolved \*'
-    archive_cmds='$LD -shared${allow_undefined_flag} -o $lib -soname $soname -set_version $verstring$libobjs$deplibs'
-    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
-    hardcode_libdir_separator=:
-    ;;
-
-  sco3.2v5*)
-    archive_cmds='$LD -G -o $lib$libobjs'
-    hardcode_direct=yes
-    ;;
-
-  solaris2*)
-    no_undefined_flag=' -z text'
-    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_shlibpath_var=no
-
-    # Solaris 2 before 2.5 hardcodes -L paths.
-    case "$host_os" in
-    solaris2.[0-4]*)
-      hardcode_minus_L=yes
-      ;;
-    esac
-    ;;
-
-  sunos4*)
-    if test "$with_gcc" = yes; then
-      archive_cmds='$CC -shared -o $lib$libobjs'
-    else
-      archive_cmds='$LD -assert pure-text -Bstatic -o $lib$libobjs'
-    fi
-
-    if test "$with_gnu_ld" = yes; then
-      export_dynamic_flag_spec='${wl}-export-dynamic'
-    fi
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  uts4*)
-    archive_cmds='$LD -G -h $soname -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_direct=no
-    hardcode_minus_L=no
-    hardcode_shlibpath_var=no
-    ;;
-
-  *)
-    ld_shlibs=no
-    can_build_shared=no
-    ;;
-  esac
-fi
-echo "$ac_t$ld_shlibs" 1>&6
-
-if test -z "$NM"; then
-  echo $ac_n "checking for BSD-compatible nm... $ac_c" 1>&6
-  case "$NM" in
-  /* | [A-Za-z]:\\*) ;; # Let the user override the test with a path.
-  *)
-    IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-    for ac_dir in /usr/ucb /usr/ccs/bin $PATH /bin; do
-      test -z "$ac_dir" && ac_dir=.
-      if test -f $ac_dir/nm; then
-        # Check to see if the nm accepts a BSD-compat flag.
-        # Adding the `sed 1q' prevents false positives on HP-UX, which says:
-        #   nm: unknown option "B" ignored
-        if ($ac_dir/nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
-          NM="$ac_dir/nm -B"
-        elif ($ac_dir/nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
-          NM="$ac_dir/nm -p"
-	else
-          NM="$ac_dir/nm"
-	fi
-        break
-      fi
-    done
-    IFS="$ac_save_ifs"
-    test -z "$NM" && NM=nm
-    ;;
-  esac
-  echo "$ac_t$NM" 1>&6
-fi
-
-# Check for command to grab the raw symbol name followed by C symbol from nm.
-echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6
-
-# These are sane defaults that work on at least a few old systems.
-# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
-
-# Character class describing NM global symbol codes.
-symcode='[BCDEGRSTU]'
-
-# Regexp to match symbols that can be accessed directly from C.
-sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
-
-# Transform the above into a raw symbol and a C symbol.
-symxfrm='\1 \1'
-
-# Define system-specific variables.
-case "$host_os" in
-aix*)
-  symcode='[BCDTU]'
-  ;;
-irix*)
-  # Cannot use undefined symbols on IRIX because inlined functions mess us up.
-  symcode='[BCDEGRST]'
-  ;;
-solaris2*)
-  symcode='[BDTU]'
-  ;;
-esac
-
-# If we're using GNU nm, then use its standard symbol codes.
-if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
-  symcode='[ABCDGISTUW]'
-fi
-
-# Write the raw and C identifiers.
-global_symbol_pipe="sed -n -e 's/^.* $symcode $sympat$/$symxfrm/p'"
-
-# Check to see that the pipe works correctly.
-pipe_works=no
-$rm conftest*
-cat > conftest.c <<EOF
-#ifdef __cplusplus
-extern "C" {
-#endif
-char nm_test_var;
-void nm_test_func(){}
-#ifdef __cplusplus
-}
-#endif
-main(){nm_test_var='a';nm_test_func();return(0);}
-EOF
-
-echo "$progname:971: checking if global_symbol_pipe works" >&5
-if { (eval echo $progname:972: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.o; then
-  # Now try to grab the symbols.
-  nlist=conftest.nm
-  if { echo "$progname:975: eval \"$NM conftest.o | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.o | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then
-
-    # Try sorting and uniquifying the output.
-    if sort "$nlist" | uniq > "$nlist"T; then
-      mv -f "$nlist"T "$nlist"
-      wcout=`wc "$nlist" 2>/dev/null`
-      count=`$echo "X$wcout" | $Xsed -e 's/^[ 	]*\([0-9][0-9]*\).*$/\1/'`
-      (test "$count" -ge 0) 2>/dev/null || count=-1
-    else
-      rm -f "$nlist"T
-      count=-1
-    fi
-
-    # Make sure that we snagged all the symbols we need.
-    if egrep ' nm_test_var$' "$nlist" >/dev/null; then
-      if egrep ' nm_test_func$' "$nlist" >/dev/null; then
-	cat <<EOF > conftest.c
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-EOF
-        # Now generate the symbol file.
-        sed 's/^.* \(.*\)$/extern char \1;/' < "$nlist" >> conftest.c
-
-	cat <<EOF >> conftest.c
-#if defined (__STDC__) && __STDC__
-# define __ptr_t void *
-#else
-# define __ptr_t char *
-#endif
-
-/* The number of symbols in dld_preloaded_symbols, -1 if unsorted. */
-int dld_preloaded_symbol_count = $count;
-
-/* The mapping between symbol names and symbols. */
-struct {
-  char *name;
-  __ptr_t address;
-}
-dld_preloaded_symbols[] =
-{
-EOF
-        sed 's/^\(.*\) \(.*\)$/  {"\1", (__ptr_t) \&\2},/' < "$nlist" >> conftest.c
-        cat <<\EOF >> conftest.c
-  {0, (__ptr_t) 0}
-};
-
-#ifdef __cplusplus
-}
-#endif
-EOF
-        # Now try linking the two files.
-        mv conftest.o conftestm.o
-	save_LIBS="$LIBS"
-	save_CFLAGS="$CFLAGS"
-        LIBS='conftestm.o'
-	CFLAGS="$CFLAGS$no_builtin_flag"
-        if { (eval echo $progname:1033: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-          pipe_works=yes
-        else
-          echo "$progname: failed program was:" >&5
-          cat conftest.c >&5
-        fi
-        LIBS="$save_LIBS"
-      else
-        echo "cannot find nm_test_func in $nlist" >&5
-      fi
-    else
-      echo "cannot find nm_test_var in $nlist" >&5
-    fi
-  else
-    echo "cannot run $global_symbol_pipe" >&5
-  fi
-else
-  echo "$progname: failed program was:" >&5
-  cat conftest.c >&5
-fi
-$rm conftest*
-
-# Do not use the global_symbol_pipe unless it works.
-echo "$ac_t$pipe_works" 1>&6
-test "$pipe_works" = yes || global_symbol_pipe=
-
-# Check hardcoding attributes.
-echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6
-hardcode_action=
-if test -n "$hardcode_libdir_flag_spec" || \
-   test -n "$runpath_var"; then
-
-  # We can hardcode non-existant directories.
-  if test "$hardcode_direct" != no && \
-     test "$hardcode_minus_L" != no && \
-     test "$hardcode_shlibpath_var" != no; then
-
-    # Linking always hardcodes the temporary library directory.
-    hardcode_action=relink
-  else
-    # We can link without hardcoding, and we can hardcode nonexisting dirs.
-    hardcode_action=immediate
-  fi
-elif test "$hardcode_direct" != yes && \
-     test "$hardcode_minus_L" != yes && \
-     test "$hardcode_shlibpath_var" != yes; then
-  # We cannot hardcode anything.
-  hardcode_action=unsupported
-else
-  # We can only hardcode existing directories.
-  hardcode_action=relink
-fi
-echo "$ac_t$hardcode_action" 1>&6
-test "$hardcode_action" = unsupported && can_build_shared=no
-
-
-reload_flag=
-reload_cmds='$LD$reload_flag -o $output$reload_objs'
-echo $ac_n "checking for $LD option to reload object files... $ac_c" 1>&6
-# PORTME Some linker may need a different reload flag.
-reload_flag='-r'
-echo "$ac_t$reload_flag"
-test -n "$reload_flag" && reload_flag=" $reload_flag"
-
-# PORTME Fill in your ld.so characteristics
-library_names_spec=
-libname_spec='lib$name'
-soname_spec=
-postinstall_cmds=
-postuninstall_cmds=
-finish_cmds=
-finish_eval=
-shlibpath_var=
-version_type=none
-dynamic_linker="$host_os ld.so"
-
-echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6
-case "$host_os" in
-aix3* | aix4*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix $libname.a'
-  shlibpath_var=LIBPATH
-
-  # AIX has no versioning support, so we append a major version to the name.
-  soname_spec='${libname}${release}.so.$major'
-  ;;
-
-amigaos*)
-  library_names_spec='$libname.ixlibrary $libname.a'
-  # Create ${libname}_ixlibrary.a entries in /sys/libs.
-  finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
-  ;;
-
-freebsd2* | freebsd3*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix $libname.so'
-  finish_cmds='PATH="$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-gnu*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-hpux9* | hpux10*)
-  # Give a soname corresponding to the major version so that dld.sl refuses to
-  # link against other versions.
-  dynamic_linker="$host_os dld.sl"
-  version_type=sunos
-  shlibpath_var=SHLIB_PATH
-  library_names_spec='${libname}${release}.sl.$versuffix ${libname}${release}.sl.$major $libname.sl'
-  soname_spec='${libname}${release}.sl.$major'
-  # HP-UX runs *really* slowly unless shared libraries are mode 555.
-  postinstall_cmds='chmod 555 $lib'
-  ;;
-
-irix5* | irix6*)
-  version_type=osf
-  soname_spec='${libname}${release}.so'
-  library_names_spec='${libname}${release}.so.$versuffix $libname.so'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-# No shared lib support for Linux oldld, aout, or coff.
-linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
-  dynamic_linker=no
-  ;;
-
-# This must be Linux ELF.
-linux-gnu*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  finish_cmds='PATH="$PATH:/sbin" ldconfig -n $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-
-  if test -f /lib/ld.so.1; then
-    dynamic_linker='GNU ld.so'
-  else
-    # Only the GNU ld.so supports shared libraries on MkLinux.
-    case "$host_cpu" in
-    powerpc*) dynamic_linker=no ;;
-    *) dynamic_linker='Linux ld.so' ;;
-    esac
-  fi
-  ;;
-
-netbsd* | openbsd*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix'
-  finish_cmds='PATH="$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-os2*)
-  libname_spec='$name'
-  library_names_spec='$libname.dll $libname.a'
-  dynamic_linker='OS/2 ld.exe'
-  shlibpath_var=LIBPATH
-  ;;
-
-osf3* | osf4*)
-  version_type=osf
-  soname_spec='${libname}${release}.so'
-  library_names_spec='${libname}${release}.so.$versuffix $libname.so'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-sco3.2v5*)
-  version_type=osf
-  soname_spec='${libname}${release}.so.$major'
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-solaris2*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-sunos4*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix'
-  finish_cmds='PATH="$PATH:/usr/etc" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-sysv4.2uw2*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-uts4*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-*)
-  dynamic_linker=no
-  ;;
-esac
-echo "$ac_t$dynamic_linker"
-test "$dynamic_linker" = no && can_build_shared=no
-
-# Report the final consequences.
-echo "checking if libtool supports shared libraries... $can_build_shared" 1>&6
-
-echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6
-test "$can_build_shared" = "no" && enable_shared=no
-
-# On AIX, shared libraries and static libraries use the same namespace, and
-# are all built from PIC.
-case "$host_os" in
-aix*)
-  test "$enable_shared" = yes && enable_static=no
-  if test -n "$RANLIB"; then
-    archive_cmds="$archive_cmds;\$RANLIB \$lib"
-    postinstall_cmds='$RANLIB $lib'
-  fi
-  ;;
-esac
-
-echo "$ac_t$enable_shared" 1>&6
-
-# Make sure either enable_shared or enable_static is yes.
-test "$enable_shared" = yes || enable_static=yes
-
-echo "checking whether to build static libraries... $enable_static" 1>&6
-
-echo $ac_n "checking for objdir... $ac_c" 1>&6
-rm -f .libs 2>/dev/null
-mkdir .libs 2>/dev/null
-if test -d .libs; then
-  objdir=.libs
-else
-  # MS-DOS does not allow filenames that begin with a dot.
-  objdir=_libs
-fi
-rmdir .libs 2>/dev/null
-echo "$ac_t$objdir" 1>&6
-
-# Copy echo and quote the copy, instead of the original, because it is
-# used later.
-ltecho="$echo"
-
-# Now quote all the things that may contain metacharacters.
-for var in ltecho old_CC old_CFLAGS old_CPPFLAGS old_LD old_NM old_RANLIB \
-  old_LN_S AR CC LD LN_S NM reload_flag reload_cmds wl pic_flag \
-  link_static_flag no_builtin_flag export_dynamic_flag_spec \
-  libname_spec library_names_spec soname_spec RANLIB \
-  old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
-  old_postuninstall_cmds archive_cmds postinstall_cmds postuninstall_cmds \
-  allow_undefined_flag no_undefined_flag \
-  finish_cmds finish_eval global_symbol_pipe \
-  hardcode_libdir_flag_spec hardcode_libdir_separator; do
-
-  case "$var" in
-  reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
-  old_postinstall_cmds | old_postuninstall_cmds | archive_cmds | \
-  postinstall_cmds | postuninstall_cmds | finish_cmds)
-    # Double-quote double-evaled strings.
-    eval "$var=\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\"\`"
-    ;;
-  *)
-    eval "$var=\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`"
-    ;;
-  esac
-done
-
-ofile=libtool
-trap "$rm $ofile; exit 1" 1 2 15
-echo creating $ofile
-$rm $ofile
-cat <<EOF > $ofile
-#! /bin/sh
-
-# libtool - Provide generalized library-building support services.
-# Generated automatically by $PROGRAM - GNU $PACKAGE $VERSION
-# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh.
-#
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This program was configured as follows,
-# on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
-#
-# CC="$old_CC" CFLAGS="$old_CFLAGS" CPPFLAGS="$old_CPPFLAGS" \\
-# LD="$old_LD" NM="$old_NM" RANLIB="$old_RANLIB" LN_S="$old_LN_S" \\
-#   $0$ltconfig_args
-#
-# Compiler and other test output produced by $progname, useful for
-# debugging $progname, is in ./config.log if it exists.
-
-# Sed that helps us avoid accidentally triggering echo(1) options like -n.
-Xsed="sed -e s/^X//"
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test "\${CDPATH+set}" = set; then CDPATH=; export CDPATH; fi
-
-# An echo program that does not interpret backslashes.
-echo="$ltecho"
-
-# The version of $progname that generated this script.
-LTCONFIG_VERSION="$VERSION"
-
-# Shell to use when invoking shell scripts.
-SHELL=${CONFIG_SHELL-/bin/sh}
-
-# Whether or not to build libtool libraries.
-build_libtool_libs=$enable_shared
-
-# Whether or not to build old-style libraries.
-build_old_libs=$enable_static
-
-# The host system.
-host_alias="$host_alias"
-host="$host"
-
-# The archiver.
-AR="$AR"
-
-# The default C compiler.
-CC="$CC"
-
-# The linker used to build libraries.
-LD="$LD"
-
-# Whether we need hard or soft links.
-LN_S="$LN_S"
-
-# A BSD-compatible nm program.
-NM="$NM"
-
-# The name of the directory that contains temporary libtool files.
-objdir="$objdir"
-
-# How to create reloadable object files.
-reload_flag="$reload_flag"
-reload_cmds="$reload_cmds"
-
-# How to pass a linker flag through the compiler.
-wl="$wl"
-
-# Additional compiler flags for building library objects.
-pic_flag="$pic_flag"
-
-# Compiler flag to prevent dynamic linking.
-link_static_flag="$link_static_flag"
-
-# Compiler flag to turn off builtin functions.
-no_builtin_flag="$no_builtin_flag"
-
-# Compiler flag to allow reflexive dlopens.
-export_dynamic_flag_spec="$export_dynamic_flag_spec"
-
-# Library versioning type.
-version_type=$version_type
-
-# Format of library name prefix.
-libname_spec="$libname_spec"
-
-# List of archive names.  First name is the real one, the rest are links.
-# The last name is the one that the linker finds with -lNAME.
-library_names_spec="$library_names_spec"
-
-# The coded name of the library, if different from the real name.
-soname_spec="$soname_spec"
-
-# Commands used to build and install an old-style archive.
-RANLIB="$RANLIB"
-old_archive_cmds="$old_archive_cmds"
-old_postinstall_cmds="$old_postinstall_cmds"
-old_postuninstall_cmds="$old_postuninstall_cmds"
-
-# Create an old-style archive from a shared archive.
-old_archive_from_new_cmds="$old_archive_from_new_cmds"
-
-# Commands used to build and install a shared archive.
-archive_cmds="$archive_cmds"
-postinstall_cmds="$postinstall_cmds"
-postuninstall_cmds="$postuninstall_cmds"
-
-# Flag that allows shared libraries with undefined symbols to be built.
-allow_undefined_flag="$allow_undefined_flag"
-
-# Flag that forces no undefined symbols.
-no_undefined_flag="$no_undefined_flag"
-
-# Commands used to finish a libtool library installation in a directory.
-finish_cmds="$finish_cmds"
-
-# Same as above, but a single script fragment to be evaled but not shown.
-finish_eval="$finish_eval"
-
-# Take the output of nm and produce a listing of raw symbols and C names.
-global_symbol_pipe="$global_symbol_pipe"
-
-# This is the shared library runtime path variable.
-runpath_var=$runpath_var
-
-# This is the shared library path variable.
-shlibpath_var=$shlibpath_var
-
-# How to hardcode a shared library path into an executable.
-hardcode_action=$hardcode_action
-
-# Flag to hardcode \$libdir into a binary during linking.
-# This must work even if \$libdir does not exist.
-hardcode_libdir_flag_spec="$hardcode_libdir_flag_spec"
-
-# Whether we need a single -rpath flag with a separated argument.
-hardcode_libdir_separator="$hardcode_libdir_separator"
-
-# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
-# resulting binary.
-hardcode_direct=$hardcode_direct
-
-# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
-# resulting binary.
-hardcode_minus_L=$hardcode_minus_L
-
-# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
-# the resulting binary.
-hardcode_shlibpath_var=$hardcode_shlibpath_var
-
-EOF
-
-case "$host_os" in
-aix3*)
-  cat <<\EOF >> $ofile
-# AIX sometimes has problems with the GCC collect2 program.  For some
-# reason, if we set the COLLECT_NAMES environment variable, the problems
-# vanish in a puff of smoke.
-if test "${COLLECT_NAMES+set}" != set; then
-  COLLECT_NAMES=
-  export COLLECT_NAMES
-fi
-
-EOF
-  ;;
-esac
-
-# Append the ltmain.sh script.
-cat "$ltmain" >> $ofile || (rm -f $ofile; exit 1)
-
-chmod +x $ofile
-exit 0
-
-# Local Variables:
-# mode:shell-script
-# sh-indentation:2
-# End:

diff --git a/ltmain.sh b/ltmain.sh
deleted file mode 100644
index e9350b3..0000000
--- a/ltmain.sh
+++ /dev/null

@@ -1,2453 +0,0 @@
-# ltmain.sh - Provide generalized library-building support services.
-# NOTE: Changing this file will not affect anything until you rerun ltconfig.
-#
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# The name of this program.
-progname=`$echo "$0" | sed 's%^.*/%%'`
-modename="$progname"
-
-# Constants.
-PROGRAM=ltmain.sh
-PACKAGE=libtool
-VERSION=1.2
-
-default_mode=
-help="Try \`$progname --help' for more information."
-magic="%%%MAGIC variable%%%"
-mkdir="mkdir"
-mv="mv -f"
-rm="rm -f"
-
-# Sed substitution that helps us do robust quoting.  It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
-sed_quote_subst='s/\([\\`\\"$\\\\]\)/\\\1/g'
-
-# NLS nuisances.
-# Only set LANG and LC_ALL to C if already set.
-# These must not be set unconditionally because not all systems understand
-# e.g. LANG=C (notably SCO).
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LANG+set}"   = set; then LANG=C;   export LANG;   fi
-
-if test "$LTCONFIG_VERSION" != "$VERSION"; then
-  echo "$modename: ltconfig version \`$LTCONFIG_VERSION' does not match $PROGRAM version \`$VERSION'" 1>&2
-  echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
-  exit 1
-fi
-
-if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
-  echo "$modename: not configured to build any kind of library" 1>&2
-  echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
-  exit 1
-fi
-
-# Global variables.
-mode=$default_mode
-nonopt=
-prev=
-prevopt=
-run=
-show="$echo"
-show_help=
-execute_dlfiles=
-
-# Parse our command line options once, thoroughly.
-while test $# -gt 0
-do
-  arg="$1"
-  shift
-
-  case "$arg" in
-  -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;;
-  *) optarg= ;;
-  esac
-
-  # If the previous option needs an argument, assign it.
-  if test -n "$prev"; then
-    case "$prev" in
-    execute_dlfiles)
-      eval "$prev=\"\$$prev \$arg\""
-      ;;
-    *)
-      eval "$prev=\$arg"
-      ;;
-    esac
-
-    prev=
-    prevopt=
-    continue
-  fi
-
-  # Have we seen a non-optional argument yet?
-  case "$arg" in
-  --help)
-    show_help=yes
-    ;;
-
-  --version)
-    echo "$PROGRAM (GNU $PACKAGE) $VERSION"
-    exit 0
-    ;;
-
-  --dry-run | -n)
-    run=:
-    ;;
-
-  --features)
-    echo "host: $host"
-    if test "$build_libtool_libs" = yes; then
-      echo "enable shared libraries"
-    else
-      echo "disable shared libraries"
-    fi
-    if test "$build_old_libs" = yes; then
-      echo "enable static libraries"
-    else
-      echo "disable static libraries"
-    fi
-    exit 0
-    ;;
-
-  --finish) mode="finish" ;;
-
-  --mode) prevopt="--mode" prev=mode ;;
-  --mode=*) mode="$optarg" ;;
-
-  --quiet | --silent)
-    show=:
-    ;;
-
-  -dlopen)
-    prevopt="-dlopen"
-    prev=execute_dlfiles
-    ;;
-
-  -*)
-    $echo "$modename: unrecognized option \`$arg'" 1>&2
-    $echo "$help" 1>&2
-    exit 1
-    ;;
-
-  *)
-    nonopt="$arg"
-    break
-    ;;
-  esac
-done
-
-if test -n "$prevopt"; then
-  $echo "$modename: option \`$prevopt' requires an argument" 1>&2
-  $echo "$help" 1>&2
-  exit 1
-fi
-
-if test -z "$show_help"; then
-
-  # Infer the operation mode.
-  if test -z "$mode"; then
-    case "$nonopt" in
-    *cc | *++ | gcc* | *-gcc*)
-      mode=link
-      for arg
-      do
-        case "$arg" in
-        -c)
-           mode=compile
-           break
-           ;;
-        esac
-      done
-      ;;
-    *db | *dbx)
-      mode=execute
-      ;;
-    *install*|cp|mv)
-      mode=install
-      ;;
-    *rm)
-      mode=uninstall
-      ;;
-    *)
-      # If we have no mode, but dlfiles were specified, then do execute mode.
-      test -n "$execute_dlfiles" && mode=execute
-
-      # Just use the default operation mode.
-      if test -z "$mode"; then
-        if test -n "$nonopt"; then
-          $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
-        else
-          $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
-        fi
-      fi
-      ;;
-    esac
-  fi
-
-  # Only execute mode is allowed to have -dlopen flags.
-  if test -n "$execute_dlfiles" && test "$mode" != execute; then
-    $echo "$modename: unrecognized option \`-dlopen'" 1>&2
-    $echo "$help" 1>&2
-    exit 1
-  fi
-
-  # Change the help message to a mode-specific one.
-  generic_help="$help"
-  help="Try \`$modename --help --mode=$mode' for more information."
-
-  # These modes are in order of execution frequency so that they run quickly.
-  case "$mode" in
-  # libtool compile mode
-  compile)
-    modename="$modename: compile"
-    # Get the compilation command and the source file.
-    base_compile=
-    lastarg=
-    srcfile="$nonopt"
-    suppress_output=
-
-    for arg
-    do
-      # Accept any command-line options.
-      case "$arg" in
-      -o)
-	$echo "$modename: you cannot specify the output filename with \`-o'" 1>&2
-	$echo "$help" 1>&2
-	exit 1
-	;;
-
-      -static)
-	build_libtool_libs=no
-	build_old_libs=yes
-	continue
-	;;
-      esac
-
-      # Accept the current argument as the source file.
-      lastarg="$srcfile"
-      srcfile="$arg"
-
-      # Aesthetically quote the previous argument.
-
-      # Backslashify any backslashes, double quotes, and dollar signs.
-      # These are the only characters that are still specially
-      # interpreted inside of double-quoted scrings.
-      lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"`
-
-      # Double-quote args containing other shell metacharacters.
-      # Many Bourne shells cannot handle close brackets correctly in scan
-      # sets, so we specify it separately.
-      case "$lastarg" in
-      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
-	lastarg="\"$lastarg\""
-	;;
-      esac
-
-      # Add the previous argument to base_compile.
-      if test -z "$base_compile"; then
-	base_compile="$lastarg"
-      else
-	base_compile="$base_compile $lastarg"
-      fi
-    done
-
-    # Get the name of the library object.
-    libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
-
-    # Recognize several different file suffixes.
-    xform='[cCFSfms]'
-    case "$libobj" in
-    *.ada) xform=ada ;;
-    *.adb) xform=adb ;;
-    *.ads) xform=ads ;;
-    *.asm) xform=asm ;;
-    *.c++) xform=c++ ;;
-    *.cc) xform=cc ;;
-    *.cpp) xform=cpp ;;
-    *.cxx) xform=cxx ;;
-    *.f90) xform=f90 ;;
-    *.for) xform=for ;;
-    esac
-
-    libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"`
-
-    case "$libobj" in
-    *.lo) obj=`$echo "X$libobj" | $Xsed -e 's/\.lo$/.o/'` ;;
-    *)
-      $echo "$modename: cannot determine name of library object from \`$srcfile'" 1>&2
-      exit 1
-      ;;
-    esac
-
-    if test -z "$base_compile"; then
-      $echo "$modename: you must specify a compilation command" 1>&2
-      $echo "$help" 1>&2
-      exit 1
-    fi
-
-    # Delete any leftover library objects.
-    if test "$build_old_libs" = yes; then
-      $run $rm $obj $libobj
-      trap "$run $rm $obj $libobj; exit 1" 1 2 15
-    else
-      $run $rm $libobj
-      trap "$run $rm $libobj; exit 1" 1 2 15
-    fi
-
-    # Only build a PIC object if we are building libtool libraries.
-    if test "$build_libtool_libs" = yes; then
-      # Without this assignment, base_compile gets emptied.
-      fbsd_hideous_sh_bug=$base_compile
-
-      # All platforms use -DPIC, to notify preprocessed assembler code.
-      $show "$base_compile$pic_flag -DPIC $srcfile"
-      if $run eval "$base_compile\$pic_flag -DPIC \$srcfile"; then :
-      else
-        test -n "$obj" && $run $rm $obj
-        exit 1
-      fi
-
-      # If we have no pic_flag, then copy the object into place and finish.
-      if test -z "$pic_flag"; then
-        $show "$LN_S $obj $libobj"
-        $run $LN_S $obj $libobj
-        exit $?
-      fi
-
-      # Just move the object, then go on to compile the next one
-      $show "$mv $obj $libobj"
-      $run $mv $obj $libobj || exit 1
-
-      # Allow error messages only from the first compilation.
-      suppress_output=' >/dev/null 2>&1'
-    fi
-
-    # Only build a position-dependent object if we build old libraries.
-    if test "$build_old_libs" = yes; then
-      # Suppress compiler output if we already did a PIC compilation.
-      $show "$base_compile $srcfile$suppress_output"
-      if $run eval "$base_compile \$srcfile$suppress_output"; then :
-      else
-        $run $rm $obj $libobj
-        exit 1
-      fi
-    fi
-
-    # Create an invalid libtool object if no PIC, so that we do not
-    # accidentally link it into a program.
-    if test "$build_libtool_libs" != yes; then
-      $show "echo timestamp > $libobj"
-      $run eval "echo timestamp > \$libobj" || exit $?
-    fi
-
-    exit 0
-    ;;
-
-  # libtool link mode
-  link)
-    modename="$modename: link"
-    CC="$nonopt"
-    allow_undefined=yes
-    compile_command="$CC"
-    finalize_command="$CC"
-
-    compile_shlibpath=
-    finalize_shlibpath=
-    deplibs=
-    dlfiles=
-    dlprefiles=
-    export_dynamic=no
-    hardcode_libdirs=
-    libobjs=
-    link_against_libtool_libs=
-    ltlibs=
-    objs=
-    prev=
-    prevarg=
-    release=
-    rpath=
-    perm_rpath=
-    temp_rpath=
-    vinfo=
-
-    # We need to know -static, to get the right output filenames.
-    for arg
-    do
-      case "$arg" in
-      -all-static | -static)
-        if test "X$arg" = "X-all-static" && test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
-	    $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2
-        fi
-        build_libtool_libs=no
-	build_old_libs=yes
-        break
-        ;;
-      esac
-    done
-
-    # See if our shared archives depend on static archives.
-    test -n "$old_archive_from_new_cmds" && build_old_libs=yes
-
-    # Go through the arguments, transforming them on the way.
-    for arg
-    do
-      # If the previous option needs an argument, assign it.
-      if test -n "$prev"; then
-        case "$prev" in
-        output)
-          compile_command="$compile_command @OUTPUT@"
-          finalize_command="$finalize_command @OUTPUT@"
-          ;;
-        esac
-
-        case "$prev" in
-        dlfiles|dlprefiles)
-          case "$arg" in
-          *.la | *.lo) ;;  # We handle these cases below.
-          *)
-            dlprefiles="$dlprefiles $arg"
-            test "$prev" = dlfiles && dlfiles="$dlfiles $arg"
-            prev=
-            ;;
-          esac
-          ;;
-	release)
-	  release="-$arg"
-	  prev=
-	  continue
-	  ;;
-        rpath)
-          rpath="$rpath $arg"
-	  prev=
-	  continue
-	  ;;
-        *)
-          eval "$prev=\"\$arg\""
-          prev=
-          continue
-          ;;
-        esac
-      fi
-
-      prevarg="$arg"
-
-      case "$arg" in
-      -all-static)
-	if test -n "$link_static_flag"; then
-          compile_command="$compile_command $link_static_flag"
-	  finalize_command="$finalize_command $link_static_flag"
-        fi
-        continue
-	;;
-
-      -allow-undefined)
-	# FIXME: remove this flag sometime in the future.
-	$echo "$modename: \`-allow-undefined' is deprecated because it is the default" 1>&2
-	continue
-	;;
-
-      -dlopen)
-        prev=dlfiles
-        continue
-        ;;
-
-      -dlpreopen)
-        prev=dlprefiles
-        continue
-        ;;
-
-      -export-dynamic)
-        if test "$export_dynamic" != yes; then
-          export_dynamic=yes
-	  if test -n "$export_dynamic_flag_spec"; then
-	    eval arg=\"$export_dynamic_flag_spec\"
-	  else
-	    arg=
-	  fi
-
-          # Add the symbol object into the linking commands.
-	  compile_command="$compile_command @SYMFILE@"
-	  finalize_command="$finalize_command @SYMFILE@"
-        fi
-        ;;
-
-      -L*)
-        dir=`$echo "X$arg" | $Xsed -e 's%^-L\(.*\)$%\1%'`
-        case "$dir" in
-        /* | [A-Za-z]:\\*)
-	  # Add the corresponding hardcode_libdir_flag, if it is not identical.
-          ;;
-        *)
-          $echo "$modename: \`-L$dir' cannot specify a relative directory" 1>&2
-          exit 1
-          ;;
-        esac
-        deplibs="$deplibs $arg"
-        ;;
-
-      -l*) deplibs="$deplibs $arg" ;;
-
-      -no-undefined)
-	allow_undefined=no
-	continue
-	;;
-
-      -o) prev=output ;;
-
-      -release)
-	prev=release
-	continue
-	;;
-
-      -rpath)
-        prev=rpath
-        continue
-        ;;
-
-      -static)
-	# If we have no pic_flag, then this is the same as -all-static.
-	if test -z "$pic_flag" && test -n "$link_static_flag"; then
-          compile_command="$compile_command $link_static_flag"
-	  finalize_command="$finalize_command $link_static_flag"
-        fi
-	continue
-	;;
-
-      -version-info)
-        prev=vinfo
-        continue
-        ;;
-
-      # Some other compiler flag.
-      -* | +*)
-	# Unknown arguments in both finalize_command and compile_command need
-	# to be aesthetically quoted because they are evaled later.
-	arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-	case "$arg" in
-	*[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
-	  arg="\"$arg\""
-	  ;;
-	esac
-        ;;
-
-      *.o | *.a)
-        # A standard object.
-        objs="$objs $arg"
-        ;;
-
-      *.lo)
-        # A library object.
-	if test "$prev" = dlfiles; then
-	  dlfiles="$dlfiles $arg"
-	  if test "$build_libtool_libs" = yes; then
-	    prev=
-	    continue
-	  else
-	    # If libtool objects are unsupported, then we need to preload.
-	    prev=dlprefiles
-	  fi
-	fi
-
-	if test "$prev" = dlprefiles; then
-	  # Preload the old-style object.
-	  dlprefiles="$dlprefiles "`$echo "X$arg" | $Xsed -e 's/\.lo$/\.o/'`
-	  prev=
-	fi
-	libobjs="$libobjs $arg"
-        ;;
-
-      *.la)
-        # A libtool-controlled library.
-
-        dlname=
-        libdir=
-        library_names=
-        old_library=
-
-        # Check to see that this really is a libtool archive.
-        if (sed -e '2q' $arg | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
-        else
-          $echo "$modename: \`$arg' is not a valid libtool archive" 1>&2
-          exit 1
-        fi
-
-        # If there is no directory component, then add one.
-        case "$arg" in
-        */* | *\\*) . $arg ;;
-        *) . ./$arg ;;
-        esac
-
-        if test -z "$libdir"; then
-          $echo "$modename: \`$arg' contains no -rpath information" 1>&2
-          exit 1
-        fi
-
-        # Get the name of the library we link against.
-        linklib=
-        for l in $old_library $library_names; do
-          linklib="$l"
-        done
-
-        if test -z "$linklib"; then
-          $echo "$modename: cannot find name of link library for \`$arg'" 1>&2
-          exit 1
-        fi
-
-        # Find the relevant object directory and library name.
-        name=`$echo "X$arg" | $Xsed -e 's%^.*/%%' -e 's/\.la$//' -e 's/^lib//'`
-        dir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
-        if test "X$dir" = "X$arg"; then
-          dir="$objdir"
-        else
-          dir="$dir/$objdir"
-        fi
-
-        # This library was specified with -dlopen.
-        if test "$prev" = dlfiles; then
-          dlfiles="$dlfiles $arg"
-          if test -z "$dlname"; then
-            # If there is no dlname, we need to preload.
-            prev=dlprefiles
-          else
-            # We should not create a dependency on this library, but we
-	    # may need any libraries it requires.
-	    compile_command="$compile_command$dependency_libs"
-	    finalize_command="$finalize_command$dependency_libs"
-            prev=
-            continue
-          fi
-        fi
-
-        # The library was specified with -dlpreopen.
-        if test "$prev" = dlprefiles; then
-          # Prefer using a static library (so that no silly _DYNAMIC symbols
-          # are required to link).
-          if test -n "$old_library"; then
-            dlprefiles="$dlprefiles $dir/$old_library"
-          else
-            dlprefiles="$dlprefiles $dir/$linklib"
-          fi
-          prev=
-        fi
-
-        if test "$build_libtool_libs" = yes && test -n "$library_names"; then
-          link_against_libtool_libs="$link_against_libtool_libs $arg"
-          if test -n "$shlibpath_var"; then
-            # Make sure the rpath contains only unique directories.
-            case "$temp_rpath " in
-            *" $dir "*) ;;
-            *) temp_rpath="$temp_rpath $dir" ;;
-            esac
-          fi
-
-	  # This is the magic to use -rpath.
-          if test -n "$hardcode_libdir_flag_spec"; then
-            if test -n "$hardcode_libdir_separator"; then
-              if test -z "$hardcode_libdirs"; then
-                # Put the magic libdir with the hardcode flag.
-                hardcode_libdirs="$libdir"
-                libdir="@HARDCODE_LIBDIRS@"
-              else
-                # Just accumulate the unique libdirs.
-		case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
-		*"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
-		  ;;
-		*)
-		  hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
-		  ;;
-		esac
-                libdir=
-              fi
-            fi
-
-            if test -n "$libdir"; then
-              eval flag=\"$hardcode_libdir_flag_spec\"
-
-              compile_command="$compile_command $flag"
-              finalize_command="$finalize_command $flag"
-            fi
-          elif test -n "$runpath_var"; then
-            # Do the same for the permanent run path.
-            case "$perm_rpath " in
-            *" $libdir "*) ;;
-            *) perm_rpath="$perm_rpath $libdir" ;;
-            esac
-          fi
-
-
-          case "$hardcode_action" in
-          immediate)
-            if test "$hardcode_direct" = no; then
-              compile_command="$compile_command $dir/$linklib"
-            elif test "$hardcode_minus_L" = no; then
-              compile_command="$compile_command -L$dir -l$name"
-            elif test "$hardcode_shlibpath_var" = no; then
-              compile_shlibpath="$compile_shlibpath$dir:"
-              compile_command="$compile_command -l$name"
-            fi
-            ;;
-
-          relink)
-            # We need an absolute path.
-            case "$dir" in
-            /* | [A-Za-z]:\\*) ;;
-            *)
-              absdir=`cd "$dir" && pwd`
-              if test -z "$absdir"; then
-                $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
-                exit 1
-              fi
-              dir="$absdir"
-              ;;
-            esac
-
-            if test "$hardcode_direct" = yes; then
-              compile_command="$compile_command $dir/$linklib"
-            elif test "$hardcode_minus_L" = yes; then
-              compile_command="$compile_command -L$dir -l$name"
-            elif test "$hardcode_shlibpath_var" = yes; then
-              compile_shlibpath="$compile_shlibpath$dir:"
-              compile_command="$compile_command -l$name"
-            fi
-            ;;
-
-          *)
-            $echo "$modename: \`$hardcode_action' is an unknown hardcode action" 1>&2
-            exit 1
-            ;;
-          esac
-
-          # Finalize command for both is simple: just hardcode it.
-          if test "$hardcode_direct" = yes; then
-            finalize_command="$finalize_command $libdir/$linklib"
-          elif test "$hardcode_minus_L" = yes; then
-            finalize_command="$finalize_command -L$libdir -l$name"
-          elif test "$hardcode_shlibpath_var" = yes; then
-            finalize_shlibpath="$finalize_shlibpath$libdir:"
-            finalize_command="$finalize_command -l$name"
-          else
-            # We cannot seem to hardcode it, guess we'll fake it.
-            finalize_command="$finalize_command -L$libdir -l$name"
-          fi
-        else
-          # Transform directly to old archives if we don't build new libraries.
-          if test -n "$pic_flag" && test -z "$old_library"; then
-            $echo "$modename: cannot find static library for \`$arg'" 1>&2
-            exit 1
-          fi
-
-	  # Here we assume that one of hardcode_direct or hardcode_minus_L
-	  # is not unsupported.  This is valid on all known static and
-	  # shared platforms.
-	  if test "$hardcode_direct" != unsupported; then
-	    test -n "$old_library" && linklib="$old_library"
-	    compile_command="$compile_command $dir/$linklib"
-	    finalize_command="$finalize_command $dir/$linklib"
-	  else
-	    compile_command="$compile_command -L$dir -l$name"
-	    finalize_command="$finalize_command -L$dir -l$name"
-	  fi
-        fi
-
-	# Add in any libraries that this one depends upon.
-	compile_command="$compile_command$dependency_libs"
-	finalize_command="$finalize_command$dependency_libs"
-	continue
-        ;;
-
-      # Some other compiler argument.
-      *)
-	# Unknown arguments in both finalize_command and compile_command need
-	# to be aesthetically quoted because they are evaled later.
-	arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-	case "$arg" in
-	*[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
-	  arg="\"$arg\""
-	  ;;
-	esac
-        ;;
-      esac
-
-      # Now actually substitute the argument into the commands.
-      if test -n "$arg"; then
-	compile_command="$compile_command $arg"
-	finalize_command="$finalize_command $arg"
-      fi
-    done
-
-    if test -n "$prev"; then
-      $echo "$modename: the \`$prevarg' option requires an argument" 1>&2
-      $echo "$help" 1>&2
-      exit 1
-    fi
-
-    if test -n "$vinfo" && test -n "$release"; then
-      $echo "$modename: you cannot specify both \`-version-info' and \`-release'" 1>&2
-      $echo "$help" 1>&2
-      exit 1
-    fi
-
-    oldlib=
-    oldobjs=
-    case "$output" in
-    "")
-      $echo "$modename: you must specify an output file" 1>&2
-      $echo "$help" 1>&2
-      exit 1
-      ;;
-
-    */* | *\\*)
-      $echo "$modename: output file \`$output' must have no directory components" 1>&2
-      exit 1
-      ;;
-
-    *.a)
-      # Now set the variables for building old libraries.
-      build_libtool_libs=no
-      build_old_libs=yes
-      oldlib="$output"
-      $show "$rm $oldlib"
-      $run $rm $oldlib
-      ;;
-
-    *.la)
-      # Make sure we only generate libraries of the form `libNAME.la'.
-      case "$output" in
-      lib*) ;;
-      *)
-	$echo "$modename: libtool library \`$arg' must begin with \`lib'" 1>&2
-	$echo "$help" 1>&2
-	exit 1
-	;;
-      esac
-
-      name=`$echo "X$output" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
-      eval libname=\"$libname_spec\"
-
-      # All the library-specific variables (install_libdir is set above).
-      library_names=
-      old_library=
-      dlname=
-      current=0
-      revision=0
-      age=0
-
-      if test -n "$objs"; then
-        $echo "$modename: cannot build libtool library \`$output' from non-libtool objects:$objs" 2>&1
-        exit 1
-      fi
-
-      # How the heck are we supposed to write a wrapper for a shared library?
-      if test -n "$link_against_libtool_libs"; then
-        $echo "$modename: libtool library \`$output' may not depend on uninstalled libraries:$link_against_libtool_libs" 1>&2
-        exit 1
-      fi
-
-      if test -n "$dlfiles$dlprefiles"; then
-        $echo "$modename: warning: \`-dlopen' is ignored while creating libtool libraries" 1>&2
-        # Nullify the symbol file.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
-      fi
-
-      if test -z "$rpath"; then
-        $echo "$modename: you must specify an installation directory with \`-rpath'" 1>&2
-	$echo "$help" 1>&2
-        exit 1
-      fi
-
-      set dummy $rpath
-      if test $# -gt 2; then
-	$echo "$modename: warning: ignoring multiple \`-rpath's for a libtool library" 1>&2
-      fi
-      install_libdir="$2"
-
-      # Parse the version information argument.
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=':'
-      set dummy $vinfo
-      IFS="$save_ifs"
-
-      if test -n "$5"; then
-        $echo "$modename: too many parameters to \`-version-info'" 1>&2
-        $echo "$help" 1>&2
-        exit 1
-      fi
-
-      test -n "$2" && current="$2"
-      test -n "$3" && revision="$3"
-      test -n "$4" && age="$4"
-
-      # Check that each of the things are valid numbers.
-      case "$current" in
-      0 | [1-9] | [1-9][0-9]*) ;;
-      *)
-        $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-        ;;
-      esac
-
-      case "$revision" in
-      0 | [1-9] | [1-9][0-9]*) ;;
-      *)
-        $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-        ;;
-      esac
-
-      case "$age" in
-      0 | [1-9] | [1-9][0-9]*) ;;
-      *)
-        $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-        ;;
-      esac
-
-      if test $age -gt $current; then
-        $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-      fi
-
-      # Calculate the version variables.
-      version_vars="version_type current age revision"
-      case "$version_type" in
-      none) ;;
-
-      linux)
-        version_vars="$version_vars major versuffix"
-        major=`expr $current - $age`
-        versuffix="$major.$age.$revision"
-        ;;
-
-      osf)
-        version_vars="$version_vars versuffix verstring"
-        major=`expr $current - $age`
-        versuffix="$current.$age.$revision"
-        verstring="$versuffix"
-
-        # Add in all the interfaces that we are compatible with.
-        loop=$age
-        while test $loop != 0; do
-          iface=`expr $current - $loop`
-          loop=`expr $loop - 1`
-          verstring="$verstring:${iface}.0"
-        done
-
-        # Make executables depend on our current version.
-        verstring="$verstring:${current}.0"
-        ;;
-
-      sunos)
-        version_vars="$version_vars major versuffix"
-        major="$current"
-        versuffix="$current.$revision"
-        ;;
-
-      *)
-        $echo "$modename: unknown library version type \`$version_type'" 1>&2
-        echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
-        exit 1
-        ;;
-      esac
-
-      # Create the output directory, or remove our outputs if we need to.
-      if test -d $objdir; then
-        $show "$rm $objdir/$output $objdir/$libname.* $objdir/${libname}${release}.*"
-        $run $rm $objdir/$output $objdir/$libname.* $objdir/${libname}${release}.*
-      else
-        $show "$mkdir $objdir"
-        $run $mkdir $objdir
-	status=$?
-	if test $status -eq 0 || test -d $objdir; then :
-	else
-	  exit $status
-	fi
-      fi
-
-      # Check to see if the archive will have undefined symbols.
-      if test "$allow_undefined" = yes; then
-        if test "$allow_undefined_flag" = unsupported; then
-          $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
-          build_libtool_libs=no
-	  build_old_libs=yes
-        fi
-      else
-        # Don't allow undefined symbols.
-        allow_undefined_flag="$no_undefined_flag"
-      fi
-
-      # Add libc to deplibs on all systems.
-      dependency_libs="$deplibs"
-      deplibs="$deplibs -lc"
-
-      if test "$build_libtool_libs" = yes; then
-        # Get the real and link names of the library.
-        eval library_names=\"$library_names_spec\"
-        set dummy $library_names
-        realname="$2"
-        shift; shift
-
-        if test -n "$soname_spec"; then
-          eval soname=\"$soname_spec\"
-        else
-          soname="$realname"
-        fi
-
-        lib="$objdir/$realname"
-	for link
-	do
-	  linknames="$linknames $link"
-	done
-
-        # Use standard objects if they are PIC.
-        test -z "$pic_flag" && libobjs=`$echo "X$libobjs " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//g'`
-
-        # Do each of the archive commands.
-        eval cmds=\"$archive_cmds\"
-        IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-        for cmd in $cmds; do
-          IFS="$save_ifs"
-          $show "$cmd"
-          $run eval "$cmd" || exit $?
-        done
-        IFS="$save_ifs"
-
-        # Create links to the real library.
-        for linkname in $linknames; do
-          $show "(cd $objdir && $LN_S $realname $linkname)"
-          $run eval '(cd $objdir && $LN_S $realname $linkname)' || exit $?
-        done
-
-        # If -export-dynamic was specified, set the dlname.
-        if test "$export_dynamic" = yes; then
-          # On all known operating systems, these are identical.
-          dlname="$soname"
-        fi
-      fi
-
-      # Now set the variables for building old libraries.
-      oldlib="$objdir/$libname.a"
-      ;;
-
-    *.lo | *.o)
-      if test -n "$link_against_libtool_libs"; then
-        $echo "$modename: error: cannot link libtool libraries into reloadable objects" 1>&2
-        exit 1
-      fi
-
-      if test -n "$deplibs"; then
-        $echo "$modename: warning: \`-l' and \`-L' are ignored while creating objects" 1>&2
-      fi
-
-      if test -n "$dlfiles$dlprefiles"; then
-        $echo "$modename: warning: \`-dlopen' is ignored while creating objects" 1>&2
-        # Nullify the symbol file.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
-      fi
-
-      if test -n "$rpath"; then
-        $echo "$modename: warning: \`-rpath' is ignored while creating objects" 1>&2
-      fi
-
-      if test -n "$vinfo"; then
-        $echo "$modename: warning: \`-version-info' is ignored while creating objects" 1>&2
-      fi
-
-      if test -n "$release"; then
-        $echo "$modename: warning: \`-release' is ignored while creating objects" 1>&2
-      fi
-
-      case "$output" in
-      *.lo)
-        if test -n "$objs"; then
-          $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
-          exit 1
-        fi
-        libobj="$output"
-        obj=`$echo "X$output" | $Xsed -e 's/\.lo$/.o/'`
-        ;;
-      *)
-        libobj=
-        obj="$output"
-        ;;
-      esac
-
-      # Delete the old objects.
-      $run $rm $obj $libobj
-
-      # Create the old-style object.
-      reload_objs="$objs"`$echo "X$libobjs " | $Xsed -e 's/[^       ]*\.a //g' -e 's/\.lo /.o /g' -e 's/ $//g'`
-
-      output="$obj"
-      eval cmds=\"$reload_cmds\"
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-      for cmd in $cmds; do
-        IFS="$save_ifs"
-        $show "$cmd"
-        $run eval "$cmd" || exit $?
-      done
-      IFS="$save_ifs"
-
-      # Exit if we aren't doing a library object file.
-      test -z "$libobj" && exit 0
-
-      if test "$build_libtool_libs" != yes; then
-        # Create an invalid libtool object if no PIC, so that we don't
-        # accidentally link it into a program.
-        $show "echo timestamp > $libobj"
-        $run eval "echo timestamp > $libobj" || exit $?
-        exit 0
-      fi
-
-      if test -n "$pic_flag"; then
-        # Only do commands if we really have different PIC objects.
-        reload_objs="$libobjs"
-        output="$libobj"
-        eval cmds=\"$reload_cmds\"
-        IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-        for cmd in $cmds; do
-          IFS="$save_ifs"
-          $show "$cmd"
-          $run eval "$cmd" || exit $?
-        done
-        IFS="$save_ifs"
-      else
-        # Just create a symlink.
-        $show "$LN_S $obj $libobj"
-        $run $LN_S $obj $libobj || exit 1
-      fi
-
-      exit 0
-      ;;
-
-    *)
-      if test -n "$vinfo"; then
-        $echo "$modename: warning: \`-version-info' is ignored while linking programs" 1>&2
-      fi
-
-      if test -n "$release"; then
-        $echo "$modename: warning: \`-release' is ignored while creating objects" 1>&2
-      fi
-
-      if test -n "$rpath"; then
-	# If the user specified any rpath flags, then add them.
-	for libdir in $rpath; do
-          if test -n "$hardcode_libdir_flag_spec"; then
-            if test -n "$hardcode_libdir_separator"; then
-              if test -z "$hardcode_libdirs"; then
-                # Put the magic libdir with the hardcode flag.
-                hardcode_libdirs="$libdir"
-                libdir="@HARDCODE_LIBDIRS@"
-              else
-                # Just accumulate the unique libdirs.
-		case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
-		*"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
-		  ;;
-		*)
-		  hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
-		  ;;
-		esac
-                libdir=
-              fi
-            fi
-
-            if test -n "$libdir"; then
-              eval flag=\"$hardcode_libdir_flag_spec\"
-
-              compile_command="$compile_command $flag"
-              finalize_command="$finalize_command $flag"
-            fi
-          elif test -n "$runpath_var"; then
-            case "$perm_rpath " in
-            *" $libdir "*) ;;
-            *) perm_rpath="$perm_rpath $libdir" ;;
-            esac
-          fi
-	done
-      fi
-
-      # Substitute the hardcoded libdirs into the compile commands.
-      if test -n "$hardcode_libdir_separator"; then
-	compile_command=`$echo "X$compile_command" | $Xsed -e "s%@HARDCODE_LIBDIRS@%$hardcode_libdirs%g"`
-	finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@HARDCODE_LIBDIRS@%$hardcode_libdirs%g"`
-      fi
-
-      if test -n "$libobjs" && test "$build_old_libs" = yes; then
-        # Transform all the library objects into standard objects.
-        compile_command=`$echo "X$compile_command " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
-        finalize_command=`$echo "X$finalize_command " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
-      fi
-
-      if test "$export_dynamic" = yes && test -n "$NM" && test -n "$global_symbol_pipe"; then
-        dlsyms="${output}S.c"
-      else
-        dlsyms=
-      fi
-
-      if test -n "$dlsyms"; then
-        # Add our own program objects to the preloaded list.
-        dlprefiles=`$echo "X$objs$dlprefiles " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
-
-	# Discover the nlist of each of the dlfiles.
-        nlist="$objdir/${output}.nm"
-
-	if test -d $objdir; then
-	  $show "$rm $nlist ${nlist}T"
-	  $run $rm "$nlist" "${nlist}T"
-	else
-	  $show "$mkdir $objdir"
-	  $run $mkdir $objdir
-	  status=$?
-	  if test $status -eq 0 || test -d $objdir; then :
-	  else
-	    exit $status
-	  fi
-	fi
-
-        for arg in $dlprefiles; do
-	  $show "extracting global C symbols from \`$arg'"
-	  $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
-        done
-
-        # Parse the name list into a source file.
-        $show "creating $objdir/$dlsyms"
-        if test -z "$run"; then
-	  # Make sure we at least have an empty file.
-	  test -f "$nlist" || : > "$nlist"
-
-	  # Try sorting and uniquifying the output.
-	  if sort "$nlist" | uniq > "$nlist"T; then
-	    mv -f "$nlist"T "$nlist"
-	    wcout=`wc "$nlist" 2>/dev/null`
-	    count=`echo "X$wcout" | $Xsed -e 's/^[ 	]*\([0-9][0-9]*\).*$/\1/'`
-	    (test "$count" -ge 0) 2>/dev/null || count=-1
-	  else
-	    $rm "$nlist"T
-	    count=-1
-	  fi
-
-	  case "$dlsyms" in
-	  "") ;;
-	  *.c)
-	    $echo > "$objdir/$dlsyms" "\
-/* $dlsyms - symbol resolution table for \`$output' dlsym emulation. */
-/* Generated by $PROGRAM - GNU $PACKAGE $VERSION */
-
-#ifdef __cplusplus
-extern \"C\" {
-#endif
-
-/* Prevent the only kind of declaration conflicts we can make. */
-#define dld_preloaded_symbol_count some_other_symbol
-#define dld_preloaded_symbols some_other_symbol
-
-/* External symbol declarations for the compiler. */\
-"
-
-	    if test -f "$nlist"; then
-	      sed -e 's/^.* \(.*\)$/extern char \1;/' < "$nlist" >> "$objdir/$dlsyms"
-	    else
-	      echo '/* NONE */' >> "$objdir/$dlsyms"
-	    fi
-
-	    $echo >> "$objdir/$dlsyms" "\
-
-#undef dld_preloaded_symbol_count
-#undef dld_preloaded_symbols
-
-#if defined (__STDC__) && __STDC__
-# define __ptr_t void *
-#else
-# define __ptr_t char *
-#endif
-
-/* The number of symbols in dld_preloaded_symbols, -1 if unsorted. */
-int dld_preloaded_symbol_count = $count;
-
-/* The mapping between symbol names and symbols. */
-struct {
-  char *name;
-  __ptr_t address;
-}
-dld_preloaded_symbols[] =
-{\
-"
-
-	    if test -f "$nlist"; then
-	      sed 's/^\(.*\) \(.*\)$/  {"\1", (__ptr_t) \&\2},/' < "$nlist" >> "$objdir/$dlsyms"
-	    fi
-
-	    $echo >> "$objdir/$dlsyms" "\
-  {0, (__ptr_t) 0}
-};
-
-#ifdef __cplusplus
-}
-#endif\
-"
-	    ;;
-
-	  *)
-	    $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
-	    exit 1
-	    ;;
-	  esac
-        fi
-
-        # Now compile the dynamic symbol file.
-        $show "(cd $objdir && $CC -c$no_builtin_flag \"$dlsyms\")"
-        $run eval '(cd $objdir && $CC -c$no_builtin_flag "$dlsyms")' || exit $?
-
-        # Transform the symbol file into the correct name.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$objdir/${output}S.o%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$objdir/${output}S.o%"`
-      elif test "$export_dynamic" != yes; then
-        test -n "$dlfiles$dlprefiles" && $echo "$modename: warning: \`-dlopen' and \`-dlpreopen' are ignored without \`-export-dynamic'" 1>&2
-      else
-        # We keep going just in case the user didn't refer to
-        # dld_preloaded_symbols.  The linker will fail if global_symbol_pipe
-        # really was required.
-        $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
-
-        # Nullify the symbol file.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
-      fi
-
-      if test -z "$link_against_libtool_libs" || test "$build_libtool_libs" != yes; then
-        # Replace the output file specification.
-        compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
-
-        # We have no uninstalled library dependencies, so finalize right now.
-        $show "$compile_command"
-        $run eval "$compile_command"
-        exit $?
-      fi
-
-      # Replace the output file specification.
-      compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$objdir/$output"'%g'`
-      finalize_command=`$echo "X$finalize_command" | $Xsed -e 's%@OUTPUT@%'"$objdir/$output"'T%g'`
-
-      # Create the binary in the object directory, then wrap it.
-      if test -d $objdir; then :
-      else
-        $show "$mkdir $objdir"
-	$run $mkdir $objdir
-	status=$?
-	if test $status -eq 0 || test -d $objdir; then :
-	else
-	  exit $status
-	fi
-      fi
-
-      if test -n "$shlibpath_var"; then
-        # We should set the shlibpath_var
-        rpath=
-        for dir in $temp_rpath; do
-          case "$dir" in
-          /* | [A-Za-z]:\\*)
-            # Absolute path.
-            rpath="$rpath$dir:"
-            ;;
-          *)
-            # Relative path: add a thisdir entry.
-            rpath="$rpath\$thisdir/$dir:"
-            ;;
-          esac
-        done
-        temp_rpath="$rpath"
-      fi
-
-      # Delete the old output file.
-      $run $rm $output
-
-      if test -n "$compile_shlibpath"; then
-        compile_command="$shlibpath_var=\"$compile_shlibpath\$$shlibpath_var\" $compile_command"
-      fi
-      if test -n "$finalize_shlibpath"; then
-        finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
-      fi
-
-      if test -n "$runpath_var" && test -n "$perm_rpath"; then
-        # We should set the runpath_var.
-        rpath=
-        for dir in $perm_rpath; do
-          rpath="$rpath$dir:"
-        done
-        compile_command="$runpath_var=\"$rpath\$$runpath_var\" $compile_command"
-        finalize_command="$runpath_var=\"$rpath\$$runpath_var\" $finalize_command"
-      fi
-
-      case "$hardcode_action" in
-      relink)
-        # AGH! Flame the AIX and HP-UX people for me, will ya?
-        $echo "$modename: warning: using a buggy system linker" 1>&2
-        $echo "$modename: relinking will be required before \`$output' can be installed" 1>&2
-        ;;
-      esac
-
-      $show "$compile_command"
-      $run eval "$compile_command" || exit $?
-
-      # Now create the wrapper script.
-      $show "creating $output"
-
-      # Quote the finalize command for shipping.
-      finalize_command=`$echo "X$finalize_command" | $Xsed -e "$sed_quote_subst"`
-
-      # Quote $echo for shipping.
-      qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
-
-      # Only actually do things if our run command is non-null.
-      if test -z "$run"; then
-        $rm $output
-        trap "$rm $output; exit 1" 1 2 15
-
-        $echo > $output "\
-#! /bin/sh
-
-# $output - temporary wrapper script for $objdir/$output
-# Generated by ltmain.sh - GNU $PACKAGE $VERSION
-#
-# The $output program cannot be directly executed until all the libtool
-# libraries that it depends on are installed.
-#
-# This wrapper script should never be moved out of \``pwd`'.
-# If it is, it will not operate correctly.
-
-# Sed substitution that helps us do robust quoting.  It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
-sed_quote_subst='$sed_quote_subst'
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test \"\${CDPATH+set}\" = set; then CDPATH=; export CDPATH; fi
-
-# This environment variable determines our operation mode.
-if test \"\$libtool_install_magic\" = \"$magic\"; then
-  # install mode needs the following variables:
-  link_against_libtool_libs='$link_against_libtool_libs'
-  finalize_command=\"$finalize_command\"
-else
-  # When we are sourced in execute mode, \$file and \$echo are already set.
-  if test \"\$libtool_execute_magic\" = \"$magic\"; then :
-  else
-    echo=\"$qecho\"
-    file=\"\$0\"
-  fi\
-"
-        $echo >> $output "\
-
-  # Find the directory that this script lives in.
-  thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\`
-  test \"x\$thisdir\" = \"x\$file\" && thisdir=.
-
-  # Follow symbolic links until we get to the real thisdir.
-  file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\`
-  while test -n \"\$file\"; do
-    destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\`
-
-    # If there was a directory component, then change thisdir.
-    if test \"x\$destdir\" != \"x\$file\"; then
-      case \"\$destdir\" in
-      /* | [A-Za-z]:\\*) thisdir=\"\$destdir\" ;;
-      *) thisdir=\"\$thisdir/\$destdir\" ;;
-      esac
-    fi
-
-    file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\`
-    file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\`
-  done
-
-  # Try to get the absolute directory name.
-  absdir=\`cd \"\$thisdir\" && pwd\`
-  test -n \"\$absdir\" && thisdir=\"\$absdir\"
-
-  progdir=\"\$thisdir/$objdir\"
-  program='$output'
-
-  if test -f \"\$progdir/\$program\"; then"
-
-        # Export our shlibpath_var if we have one.
-        if test -n "$shlibpath_var" && test -n "$temp_rpath"; then
-          $echo >> $output "\
-    # Add our own library path to $shlibpath_var
-    $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
-
-    # Some systems cannot cope with colon-terminated $shlibpath_var
-    $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/:*\$//'\`
-
-    export $shlibpath_var
-"
-        fi
-
-        $echo >> $output "\
-    if test \"\$libtool_execute_magic\" != \"$magic\"; then
-      # Run the actual program with our arguments.
-
-      # Export the path to the program.
-      PATH=\"\$progdir:\$PATH\"
-      export PATH
-
-      exec \$program \${1+\"\$@\"}
-
-      \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\"
-      exit 1
-    fi
-  else
-    # The program doesn't exist.
-    \$echo \"\$0: error: \$progdir/\$program does not exist\" 1>&2
-    \$echo \"This script is just a wrapper for \$program.\" 1>&2
-    echo \"See the $PACKAGE documentation for more information.\" 1>&2
-    exit 1
-  fi
-fi\
-"
-        chmod +x $output
-      fi
-      exit 0
-      ;;
-    esac
-
-    # See if we need to build an old-fashioned archive.
-    if test "$build_old_libs" = "yes"; then
-      # Transform .lo files to .o files.
-      oldobjs="$objs"`$echo "X$libobjs " | $Xsed -e 's/[^   ]*\.a //g' -e 's/\.lo /.o /g' -e 's/ $//g'`
-
-      # Do each command in the archive commands.
-      if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
-	eval cmds=\"$old_archive_from_new_cmds\"
-      else
-	eval cmds=\"$old_archive_cmds\"
-      fi
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-      for cmd in $cmds; do
-        IFS="$save_ifs"
-        $show "$cmd"
-        $run eval "$cmd" || exit $?
-      done
-      IFS="$save_ifs"
-    fi
-
-    # Now create the libtool archive.
-    case "$output" in
-    *.la)
-      old_library=
-      test "$build_old_libs" = yes && old_library="$libname.a"
-
-      $show "creating $output"
-
-      # Only create the output if not a dry run.
-      if test -z "$run"; then
-        $echo > $output "\
-# $output - a libtool library file
-# Generated by ltmain.sh - GNU $PACKAGE $VERSION
-
-# The name that we can dlopen(3).
-dlname='$dlname'
-
-# Names of this library.
-library_names='$library_names'
-
-# The name of the static archive.
-old_library='$old_library'
-
-# Libraries that this one depends upon.
-dependency_libs='$dependency_libs'
-
-# Version information for $libname.
-current=$current
-age=$age
-revision=$revision
-
-# Directory that this library needs to be installed in:
-libdir='$install_libdir'\
-"
-      fi
-
-      # Do a symbolic link so that the libtool archive can be found in
-      # LD_LIBRARY_PATH before the program is installed.
-      $show "(cd $objdir && $LN_S ../$output $output)"
-      $run eval "(cd $objdir && $LN_S ../$output $output)" || exit 1
-      ;;
-    esac
-    exit 0
-    ;;
-
-  # libtool install mode
-  install)
-    modename="$modename: install"
-
-    # There may be an optional /bin/sh argument at the beginning of
-    # install_prog (especially on Windows NT).
-    if test "$nonopt" = "$SHELL"; then
-      # Aesthetically quote it.
-      arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"`
-      case "$arg" in
-      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
-	arg="\"$arg\""
-	;;
-      esac
-      install_prog="$arg "
-      arg="$1"
-      shift
-    else
-      install_prog=
-      arg="$nonopt"
-    fi
-
-    # The real first argument should be the name of the installation program.
-    # Aesthetically quote it.
-    arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-    case "$arg" in
-    *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
-      arg="\"$arg\""
-      ;;
-    esac
-    install_prog="$install_prog$arg"
-
-    # We need to accept at least all the BSD install flags.
-    dest=
-    files=
-    opts=
-    prev=
-    install_type=
-    isdir=
-    stripme=
-    for arg
-    do
-      if test -n "$dest"; then
-        files="$files $dest"
-        dest="$arg"
-        continue
-      fi
-
-      case "$arg" in
-      -d) isdir=yes ;;
-      -f) prev="-f" ;;
-      -g) prev="-g" ;;
-      -m) prev="-m" ;;
-      -o) prev="-o" ;;
-      -s)
-        stripme=" -s"
-        continue
-        ;;
-      -*) ;;
-
-      *)
-        # If the previous option needed an argument, then skip it.
-        if test -n "$prev"; then
-          prev=
-        else
-          dest="$arg"
-          continue
-        fi
-        ;;
-      esac
-
-      # Aesthetically quote the argument.
-      arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-      case "$arg" in
-      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
-	arg="\"$arg\""
-	;;
-      esac
-      install_prog="$install_prog $arg"
-    done
-
-    if test -z "$install_prog"; then
-      $echo "$modename: you must specify an install program" 1>&2
-      $echo "$help" 1>&2
-      exit 1
-    fi
-
-    if test -n "$prev"; then
-      $echo "$modename: the \`$prev' option requires an argument" 1>&2
-      $echo "$help" 1>&2
-      exit 1
-    fi
-
-    if test -z "$files"; then
-      if test -z "$dest"; then
-        $echo "$modename: no file or destination specified" 1>&2
-      else
-        $echo "$modename: you must specify a destination" 1>&2
-      fi
-      $echo "$help" 1>&2
-      exit 1
-    fi
-
-    # Strip any trailing slash from the destination.
-    dest=`$echo "X$dest" | $Xsed -e 's%/$%%'`
-
-    # Check to see that the destination is a directory.
-    test -d "$dest" && isdir=yes
-    if test -n "$isdir"; then
-      destdir="$dest"
-      destname=
-    else
-      destdir=`$echo "X$dest" | $Xsed -e 's%/[^/]*$%%'`
-      test "X$destdir" = "X$dest" && destdir=.
-      destname=`$echo "X$dest" | $Xsed -e 's%^.*/%%'`
-
-      # Not a directory, so check to see that there is only one file specified.
-      set dummy $files
-      if test $# -gt 2; then
-        $echo "$modename: \`$dest' is not a directory" 1>&2
-        $echo "$help" 1>&2
-        exit 1
-      fi
-    fi
-    case "$destdir" in
-    /* | [A-Za-z]:\\*) ;;
-    *)
-      for file in $files; do
-        case "$file" in
-        *.lo) ;;
-        *)
-          $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-          ;;
-        esac
-      done
-      ;;
-    esac
-
-    # This variable tells wrapper scripts just to set variables rather
-    # than running their programs.
-    libtool_install_magic="$magic"
-
-    staticlibs=
-    future_libdirs=
-    current_libdirs=
-    for file in $files; do
-
-      # Do each installation.
-      case "$file" in
-      *.a)
-        # Do the static libraries later.
-        staticlibs="$staticlibs $file"
-        ;;
-
-      *.la)
-        # Check to see that this really is a libtool archive.
-        if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
-        else
-          $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-        fi
-
-        library_names=
-        old_library=
-        # If there is no directory component, then add one.
-        case "$file" in
-        */* | *\\*) . $file ;;
-        *) . ./$file ;;
-        esac
-
-        # Add the libdir to current_libdirs if it is the destination.
-        if test "X$destdir" = "X$libdir"; then
-          case "$current_libdirs " in
-          *" $libdir "*) ;;
-          *) current_libdirs="$current_libdirs $libdir" ;;
-          esac
-        else
-          # Note the libdir as a future libdir.
-          case "$future_libdirs " in
-          *" $libdir "*) ;;
-          *) future_libdirs="$future_libdirs $libdir" ;;
-          esac
-        fi
-
-        dir="`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/"
-        test "X$dir" = "X$file/" && dir=
-        dir="$dir$objdir"
-
-        # See the names of the shared library.
-        set dummy $library_names
-        if test -n "$2"; then
-          realname="$2"
-          shift
-          shift
-
-          # Install the shared library and build the symlinks.
-          $show "$install_prog $dir/$realname $destdir/$realname"
-          $run eval "$install_prog $dir/$realname $destdir/$realname" || exit $?
-          test "X$dlname" = "X$realname" && dlname=
-
-          if test $# -gt 0; then
-            # Delete the old symlinks.
-            rmcmd="$rm"
-            for linkname
-            do
-              rmcmd="$rmcmd $destdir/$linkname"
-            done
-            $show "$rmcmd"
-            $run $rmcmd
-
-            # ... and create new ones.
-            for linkname
-            do
-              test "X$dlname" = "X$linkname" && dlname=
-              $show "(cd $destdir && $LN_S $realname $linkname)"
-              $run eval "(cd $destdir && $LN_S $realname $linkname)"
-            done
-          fi
-
-          if test -n "$dlname"; then
-            # Install the dynamically-loadable library.
-            $show "$install_prog $dir/$dlname $destdir/$dlname"
-            $run eval "$install_prog $dir/$dlname $destdir/$dlname" || exit $?
-          fi
-
-          # Do each command in the postinstall commands.
-          lib="$destdir/$realname"
-          eval cmds=\"$postinstall_cmds\"
-          IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-          for cmd in $cmds; do
-            IFS="$save_ifs"
-            $show "$cmd"
-            $run eval "$cmd" || exit $?
-          done
-          IFS="$save_ifs"
-        fi
-
-        # Install the pseudo-library for information purposes.
-        name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-        $show "$install_prog $file $destdir/$name"
-        $run eval "$install_prog $file $destdir/$name" || exit $?
-
-        # Maybe install the static library, too.
-        test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
-        ;;
-
-      *.lo)
-        # Install (i.e. copy) a libtool object.
-
-        # Figure out destination file name, if it wasn't already specified.
-        if test -n "$destname"; then
-          destfile="$destdir/$destname"
-        else
-          destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-          destfile="$destdir/$destfile"
-        fi
-
-        # Deduce the name of the destination old-style object file.
-        case "$destfile" in
-        *.lo)
-          staticdest=`$echo "X$destfile" | $Xsed -e 's/\.lo$/\.o/'`
-          ;;
-        *.o)
-          staticdest="$destfile"
-          destfile=
-          ;;
-        *)
-          $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-          ;;
-        esac
-
-        # Install the libtool object if requested.
-        if test -n "$destfile"; then
-          $show "$install_prog $file $destfile"
-          $run eval "$install_prog $file $destfile" || exit $?
-        fi
-
-        # Install the old object if enabled.
-        if test "$build_old_libs" = yes; then
-          # Deduce the name of the old-style object file.
-          staticobj=`$echo "X$file" | $Xsed -e 's/\.lo$/\.o/'`
-
-          $show "$install_prog $staticobj $staticdest"
-          $run eval "$install_prog \$staticobj \$staticdest" || exit $?
-        fi
-        exit 0
-        ;;
-
-      *)
-        # Do a test to see if this is really a libtool program.
-        if (sed -e '4q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
-          link_against_libtool_libs=
-          finalize_command=
-
-          # If there is no directory component, then add one.
-          case "$file" in
-          */* | *\\*) . $file ;;
-          *) . ./$file ;;
-          esac
-
-          # Check the variables that should have been set.
-          if test -z "$link_against_libtool_libs" || test -z "$finalize_command"; then
-            $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2
-            exit 1
-          fi
-
-          finalize=yes
-          for lib in $link_against_libtool_libs; do
-            # Check to see that each library is installed.
-            libdir=
-            if test -f "$lib"; then
-              # If there is no directory component, then add one.
-              case "$lib" in
-              */* | *\\*) . $lib ;;
-              *) . ./$lib ;;
-              esac
-            fi
-            libfile="$libdir/`$echo "X$lib" | $Xsed -e 's%^.*/%%g'`"
-            if test -z "$libdir"; then
-              $echo "$modename: warning: \`$lib' contains no -rpath information" 1>&2
-            elif test -f "$libfile"; then :
-            else
-              $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
-              finalize=no
-            fi
-          done
-
-          if test "$hardcode_action" = relink; then
-            if test "$finalize" = yes; then
-              $echo "$modename: warning: relinking \`$file' on behalf of your buggy system linker" 1>&2
-              $show "$finalize_command"
-              if $run eval "$finalize_command"; then :
-              else
-                $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
-                continue
-              fi
-              file="$objdir/$file"T
-            else
-              $echo "$modename: warning: cannot relink \`$file' on behalf of your buggy system linker" 1>&2
-            fi
-          else
-            # Install the binary that we compiled earlier.
-	    file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"`
-          fi
-        fi
-
-        $show "$install_prog$stripme $file $dest"
-        $run eval "$install_prog\$stripme \$file \$dest" || exit $?
-        ;;
-      esac
-    done
-
-    for file in $staticlibs; do
-      name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-
-      # Set up the ranlib parameters.
-      oldlib="$destdir/$name"
-
-      $show "$install_prog $file $oldlib"
-      $run eval "$install_prog \$file \$oldlib" || exit $?
-
-      # Do each command in the postinstall commands.
-      eval cmds=\"$old_postinstall_cmds\"
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-      for cmd in $cmds; do
-        IFS="$save_ifs"
-        $show "$cmd"
-        $run eval "$cmd" || exit $?
-      done
-      IFS="$save_ifs"
-    done
-
-    if test -n "$future_libdirs"; then
-      $echo "$modename: warning: remember to run \`$progname --finish$future_libdirs'" 1>&2
-    fi
-
-    if test -n "$current_libdirs"; then
-      # Maybe just do a dry run.
-      test -n "$run" && current_libdirs=" -n$current_libdirs"
-      exec $SHELL $0 --finish$current_libdirs
-      exit 1
-    fi
-
-    exit 0
-    ;;
-
-  # libtool finish mode
-  finish)
-    modename="$modename: finish"
-    libdirs="$nonopt"
-
-    if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
-      for dir
-      do
-        libdirs="$libdirs $dir"
-      done
-
-      for libdir in $libdirs; do
-	if test -n "$finish_cmds"; then
-	  # Do each command in the finish commands.
-	  eval cmds=\"$finish_cmds\"
-          IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-          for cmd in $cmds; do
-            IFS="$save_ifs"
-            $show "$cmd"
-            $run eval "$cmd"
-          done
-          IFS="$save_ifs"
-	fi
-	if test -n "$finish_eval"; then
-	  # Do the single finish_eval.
-	  eval cmds=\"$finish_eval\"
-	  $run eval "$cmds"
-	fi
-      done
-    fi
-
-    echo "------------------------------------------------------------------------------"
-    echo "Libraries have been installed in:"
-    for libdir in $libdirs; do
-      echo "   $libdir"
-    done
-    echo
-    echo "To link against installed libraries in a given directory, LIBDIR,"
-    echo "you must use the \`-LLIBDIR' flag during linking."
-    echo
-    echo " You will also need to do one of the following:"
-    if test -n "$shlibpath_var"; then
-      echo "   - add LIBDIR to the \`$shlibpath_var' environment variable"
-      echo "     during execution"
-    fi
-    if test -n "$runpath_var"; then
-      echo "   - add LIBDIR to the \`$runpath_var' environment variable"
-      echo "     during linking"
-    fi
-    if test -n "$hardcode_libdir_flag_spec"; then
-      libdir=LIBDIR
-      eval flag=\"$hardcode_libdir_flag_spec\"
-
-      echo "   - use the \`$flag' linker flag"
-    fi
-    if test -f /etc/ld.so.conf; then
-      echo "   - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
-    fi
-    echo
-    echo "See any operating system documentation about shared libraries for"
-    echo "more information, such as the ld(1) and ld.so(8) manual pages."
-    echo "------------------------------------------------------------------------------"
-    exit 0
-    ;;
-
-  # libtool execute mode
-  execute)
-    modename="$modename: execute"
-
-    # The first argument is the command name.
-    cmd="$nonopt"
-    if test -z "$cmd"; then
-      $echo "$modename: you must specify a COMMAND" 1>&2
-      $echo "$help"
-      exit 1
-    fi
-
-    # Handle -dlopen flags immediately.
-    for file in $execute_dlfiles; do
-      if test -f "$file"; then :
-      else
-	$echo "$modename: \`$file' is not a file" 1>&2
-	$echo "$help" 1>&2
-	exit 1
-      fi
-
-      dir=
-      case "$file" in
-      *.la)
-        # Check to see that this really is a libtool archive.
-        if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
-        else
-          $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-        fi
-
-	# Read the libtool library.
-	dlname=
-	library_names=
-
-        # If there is no directory component, then add one.
-	case "$file" in
-	*/* | *\\*) . $file ;;
-        *) . ./$file ;;
-	esac
-
-	# Skip this library if it cannot be dlopened.
-	if test -z "$dlname"; then
-	  # Warn if it was a shared library.
-	  test -n "$library_names" && $echo "$modename: warning: \`$file' was not linked with \`-export-dynamic'"
-	  continue
-	fi
-
-	dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
-	test "X$dir" = "X$file" && dir=.
-
-	if test -f "$dir/$objdir/$dlname"; then
-	  dir="$dir/$objdir"
-	else
-	  $echo "$modename: cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'" 1>&2
-	  exit 1
-	fi
-	;;
-
-      *.lo)
-	# Just add the directory containing the .lo file.
-	dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
-	test "X$dir" = "X$file" && dir=.
-	;;
-
-      *)
-	$echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2
-        continue
-	;;
-      esac
-
-      # Get the absolute pathname.
-      absdir=`cd "$dir" && pwd`
-      test -n "$absdir" && dir="$absdir"
-
-      # Now add the directory to shlibpath_var.
-      if eval "test -z \"\$$shlibpath_var\""; then
-	eval "$shlibpath_var=\"\$dir\""
-      else
-	eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
-      fi
-    done
-
-    # This variable tells wrapper scripts just to set shlibpath_var
-    # rather than running their programs.
-    libtool_execute_magic="$magic"
-
-    # Check if any of the arguments is a wrapper script.
-    args=
-    for file
-    do
-      case "$file" in
-      -*) ;;
-      *)
-        # Do a test to see if this is really a libtool program.
-        if (sed -e '4q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
-	  # If there is no directory component, then add one.
-	  case "$file" in
-	  */* | *\\*) . $file ;;
-	  *) . ./$file ;;
-	  esac
-
-	  # Transform arg to wrapped name.
-	  file="$progdir/$program"
-	fi
-        ;;
-      esac
-      # Quote arguments (to preserve shell metacharacters).
-      file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"`
-      args="$args \"$file\""
-    done
-
-    if test -z "$run"; then
-      # Export the shlibpath_var.
-      eval "export $shlibpath_var"
-
-      # Now actually exec the command.
-      eval "exec \$cmd$args"
-
-      $echo "$modename: cannot exec \$cmd$args"
-      exit 1
-    else
-      # Display what would be done.
-      eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
-      $echo "export $shlibpath_var"
-      $echo "$cmd$args"
-      exit 0
-    fi
-    ;;
-
-  # libtool uninstall mode
-  uninstall)
-    modename="$modename: uninstall"
-    rm="$nonopt"
-    files=
-
-    for arg
-    do
-      case "$arg" in
-      -*) rm="$rm $arg" ;;
-      *) files="$files $arg" ;;
-      esac
-    done
-
-    if test -z "$rm"; then
-      $echo "$modename: you must specify an RM program" 1>&2
-      $echo "$help" 1>&2
-      exit 1
-    fi
-
-    for file in $files; do
-      dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
-      test "X$dir" = "X$file" && dir=.
-      name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-
-      rmfiles="$file"
-
-      case "$name" in
-      *.la)
-        # Possibly a libtool archive, so verify it.
-        if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
-          . $dir/$name
-
-          # Delete the libtool libraries and symlinks.
-          for n in $library_names; do
-            rmfiles="$rmfiles $dir/$n"
-            test "X$n" = "X$dlname" && dlname=
-          done
-          test -n "$dlname" && rmfiles="$rmfiles $dir/$dlname"
-          test -n "$old_library" && rmfiles="$rmfiles $dir/$old_library"
-
-	  $show "$rm $rmfiles"
-	  $run $rm $rmfiles
-
-	  if test -n "$library_names"; then
-	    # Do each command in the postuninstall commands.
-	    eval cmds=\"$postuninstall_cmds\"
-	    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-	    for cmd in $cmds; do
-	      IFS="$save_ifs"
-	      $show "$cmd"
-	      $run eval "$cmd"
-	    done
-	    IFS="$save_ifs"
-	  fi
-
-          if test -n "$old_library"; then
-	    # Do each command in the old_postuninstall commands.
-	    eval cmds=\"$old_postuninstall_cmds\"
-	    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-	    for cmd in $cmds; do
-	      IFS="$save_ifs"
-	      $show "$cmd"
-	      $run eval "$cmd"
-	    done
-	    IFS="$save_ifs"
-	  fi
-
-          # FIXME: should reinstall the best remaining shared library.
-        fi
-        ;;
-
-      *.lo)
-        if test "$build_old_libs" = yes; then
-          oldobj=`$echo "X$name" | $Xsed -e 's/\.lo$/\.o/'`
-          rmfiles="$rmfiles $dir/$oldobj"
-        fi
-	$show "$rm $rmfiles"
-	$run $rm $rmfiles
-        ;;
-
-      *)
-      	$show "$rm $rmfiles"
-	$run $rm $rmfiles
-	;;
-      esac
-    done
-    exit 0
-    ;;
-
-  "")
-    $echo "$modename: you must specify a MODE" 1>&2
-    $echo "$generic_help" 1>&2
-    exit 1
-    ;;
-  esac
-
-  $echo "$modename: invalid operation mode \`$mode'" 1>&2
-  $echo "$generic_help" 1>&2
-  exit 1
-fi # test -z "$show_help"
-
-# We need to display help for each of the modes.
-case "$mode" in
-"") $echo \
-"Usage: $modename [OPTION]... [MODE-ARG]...
-
-Provide generalized library-building support services.
-
--n, --dry-run         display commands without modifying any files
-    --features        display configuration information and exit
-    --finish          same as \`--mode=finish'
-    --help            display this help message and exit
-    --mode=MODE       use operation mode MODE [default=inferred from MODE-ARGS]
-    --quiet           same as \`--silent'
-    --silent          don't print informational messages
-    --version         print version information
-
-MODE must be one of the following:
-
-      compile         compile a source file into a libtool object
-      execute         automatically set library path, then run a program
-      finish          complete the installation of libtool libraries
-      install         install libraries or executables
-      link            create a library or an executable
-      uninstall       remove libraries from an installed directory
-
-MODE-ARGS vary depending on the MODE.  Try \`$modename --help --mode=MODE' for
-a more detailed description of MODE."
-  exit 0
-  ;;
-
-compile)
-  $echo \
-"Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
-
-Compile a source file into a libtool library object.
-
-COMPILE-COMMAND is a command to be used in creating a \`standard' object file
-from the given SOURCEFILE.
-
-The output file name is determined by removing the directory component from
-SOURCEFILE, then substituting the C source code suffix \`.c' with the
-library object suffix, \`.lo'."
-  ;;
-
-execute)
-  $echo \
-"Usage: $modename [OPTION]... --mode=execute COMMAND [ARGS]...
-
-Automatically set library path, then run a program.
-
-This mode accepts the following additional options:
-
-  -dlopen FILE      add the directory containing FILE to the library path
-
-This mode sets the library path environment variable according to \`-dlopen'
-flags.
-
-If any of the ARGS are libtool executable wrappers, then they are translated
-into their corresponding uninstalled binary, and any of their required library
-directories are added to the library path.
-
-Then, COMMAND is executed, with ARGS as arguments."
-  ;;
-
-finish)
-  $echo \
-"Usage: $modename [OPTION]... --mode=finish [LIBDIR]...
-
-Complete the installation of libtool libraries.
-
-Each LIBDIR is a directory that contains libtool libraries.
-
-The commands that this mode executes may require superuser privileges.  Use
-the \`--dry-run' option if you just want to see what would be executed."
-  ;;
-
-install)
-  $echo \
-"Usage: $modename [OPTION]... --mode=install INSTALL-COMMAND...
-
-Install executables or libraries.
-
-INSTALL-COMMAND is the installation command.  The first component should be
-either the \`install' or \`cp' program.
-
-The rest of the components are interpreted as arguments to that command (only
-BSD-compatible install options are recognized)."
-  ;;
-
-link)
-  $echo \
-"Usage: $modename [OPTION]... --mode=link LINK-COMMAND...
-
-Link object files or libraries together to form another library, or to
-create an executable program.
-
-LINK-COMMAND is a command using the C compiler that you would use to create
-a program from several object files.
-
-The following components of LINK-COMMAND are treated specially:
-
-  -all-static       do not do any dynamic linking at all
-  -dlopen FILE      \`-dlpreopen' FILE if it cannot be dlopened at runtime
-  -dlpreopen FILE   link in FILE and add its symbols to dld_preloaded_symbols
-  -export-dynamic   allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
-  -LLIBDIR          search LIBDIR for required installed libraries
-  -lNAME            OUTPUT-FILE requires the installed library libNAME
-  -no-undefined     declare that a library does not refer to external symbols
-  -o OUTPUT-FILE    create OUTPUT-FILE from the specified objects
-  -release RELEASE  specify package release information
-  -rpath LIBDIR     the created library will eventually be installed in LIBDIR
-  -static           do not do any dynamic linking of libtool libraries
-  -version-info CURRENT[:REVISION[:AGE]]
-                    specify library version info [each variable defaults to 0]
-
-All other options (arguments beginning with \`-') are ignored.
-
-Every other argument is treated as a filename.  Files ending in \`.la' are
-treated as uninstalled libtool libraries, other files are standard or library
-object files.
-
-If the OUTPUT-FILE ends in \`.la', then a libtool library is created, only
-library objects (\`.lo' files) may be specified, and \`-rpath' is required.
-
-If OUTPUT-FILE ends in \`.a', then a standard library is created using \`ar'
-and \`ranlib'.
-
-If OUTPUT-FILE ends in \`.lo' or \`.o', then a reloadable object file is
-created, otherwise an executable program is created."
-  ;;
-
-uninstall)
-  $echo
-"Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
-
-Remove libraries from an installation directory.
-
-RM is the name of the program to use to delete files associated with each FILE
-(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
-to RM.
-
-If FILE is a libtool library, all the files associated with it are deleted.
-Otherwise, only FILE itself is deleted using RM."
-  ;;
-
-*)
-  $echo "$modename: invalid operation mode \`$mode'" 1>&2
-  $echo "$help" 1>&2
-  exit 1
-  ;;
-esac
-
-echo
-$echo "Try \`$modename --help' for more information about other modes."
-
-exit 0
-
-# Local Variables:
-# mode:shell-script
-# sh-indentation:2
-# End:

diff --git a/makcjpeg.st b/makcjpeg.st
deleted file mode 100644
index fc72c89..0000000
--- a/makcjpeg.st
+++ /dev/null

@@ -1,38 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to cjpeg.prj.
-; If you are using Turbo C, change filenames beginning with "pc..." to "tc..."
-; Read installation instructions before trying to make the program!
-;
-;
-;      * * * Output file * * *
-cjpeg.ttp
-;
-; * * * COMPILER OPTIONS * * *  
-.C[-P]        ; absolute calls
-.C[-M]        ; and no string merging, folks
-.C[-w-cln]    ; no "constant is long" warnings
-.C[-w-par]    ; no "parameter xxxx unused"
-.C[-w-rch]    ; no "unreachable code"
-.C[-wsig]     ; warn if significant digits may be lost
-=
-; * * * * List of modules * * * * 
-pcstart.o
-cjpeg.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h,jversion.h)
-cdjpeg.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdswitch.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdppm.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdgif.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdtarga.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdbmp.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdrle.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-libjpeg.lib        ; built by libjpeg.prj
-pcfltlib.lib       ; floating point library
-; the float library can be omitted if you've turned off DCT_FLOAT_SUPPORTED
-pcstdlib.lib       ; standard library
-pcextlib.lib       ; extended library

diff --git a/makdjpeg.st b/makdjpeg.st
deleted file mode 100644
index 3226726..0000000
--- a/makdjpeg.st
+++ /dev/null

@@ -1,38 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to djpeg.prj.
-; If you are using Turbo C, change filenames beginning with "pc..." to "tc..."
-; Read installation instructions before trying to make the program!
-;
-;
-;      * * * Output file * * *
-djpeg.ttp
-;
-; * * * COMPILER OPTIONS * * *  
-.C[-P]        ; absolute calls
-.C[-M]        ; and no string merging, folks
-.C[-w-cln]    ; no "constant is long" warnings
-.C[-w-par]    ; no "parameter xxxx unused"
-.C[-w-rch]    ; no "unreachable code"
-.C[-wsig]     ; warn if significant digits may be lost
-=
-; * * * * List of modules * * * * 
-pcstart.o
-djpeg.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h,jversion.h)
-cdjpeg.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdcolmap.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrppm.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrgif.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrtarga.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrbmp.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrrle.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-libjpeg.lib        ; built by libjpeg.prj
-pcfltlib.lib       ; floating point library
-; the float library can be omitted if you've turned off DCT_FLOAT_SUPPORTED
-pcstdlib.lib       ; standard library
-pcextlib.lib       ; extended library

diff --git a/makeapps.ds b/makeapps.ds
deleted file mode 100644
index bedd038..0000000
--- a/makeapps.ds
+++ /dev/null

@@ -1,828 +0,0 @@
-# Microsoft Developer Studio Generated NMAKE File, Format Version 4.20
-# ** DO NOT EDIT **
-
-# TARGTYPE "Win32 (x86) Console Application" 0x0103
-
-!IF "$(CFG)" == ""
-CFG=cjpeg - Win32
-!MESSAGE No configuration specified.  Defaulting to cjpeg - Win32.
-!ENDIF 
-
-!IF "$(CFG)" != "cjpeg - Win32" && "$(CFG)" != "djpeg - Win32" &&\
- "$(CFG)" != "jpegtran - Win32" && "$(CFG)" != "rdjpgcom - Win32" &&\
- "$(CFG)" != "wrjpgcom - Win32"
-!MESSAGE Invalid configuration "$(CFG)" specified.
-!MESSAGE You can specify a configuration when running NMAKE on this makefile
-!MESSAGE by defining the macro CFG on the command line.  For example:
-!MESSAGE 
-!MESSAGE NMAKE /f "apps.mak" CFG="cjpeg - Win32"
-!MESSAGE 
-!MESSAGE Possible choices for configuration are:
-!MESSAGE 
-!MESSAGE "cjpeg - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "djpeg - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "jpegtran - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "rdjpgcom - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "wrjpgcom - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE 
-!ERROR An invalid configuration is specified.
-!ENDIF 
-
-!IF "$(OS)" == "Windows_NT"
-NULL=
-!ELSE 
-NULL=nul
-!ENDIF 
-################################################################################
-# Begin Project
-# PROP Target_Last_Scanned "cjpeg - Win32"
-CPP=cl.exe
-RSC=rc.exe
-
-!IF  "$(CFG)" == "cjpeg - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "cjpeg\Release"
-# PROP BASE Intermediate_Dir "cjpeg\Release"
-# PROP BASE Target_Dir "cjpeg"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "cjpeg\Release"
-# PROP Intermediate_Dir "cjpeg\Release"
-# PROP Target_Dir "cjpeg"
-OUTDIR=.\cjpeg\Release
-INTDIR=.\cjpeg\Release
-
-ALL : "$(OUTDIR)\cjpeg.exe"
-
-CLEAN : 
-	-@erase "$(INTDIR)\cjpeg.obj"
-	-@erase "$(INTDIR)\rdppm.obj"
-	-@erase "$(INTDIR)\rdgif.obj"
-	-@erase "$(INTDIR)\rdtarga.obj"
-	-@erase "$(INTDIR)\rdrle.obj"
-	-@erase "$(INTDIR)\rdbmp.obj"
-	-@erase "$(INTDIR)\rdswitch.obj"
-	-@erase "$(INTDIR)\cdjpeg.obj"
-	-@erase "$(OUTDIR)\cjpeg.exe"
-
-"$(OUTDIR)" :
-    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/cjpeg.pch" /YX /Fo"$(INTDIR)/" /c 
-CPP_OBJS=.\cjpeg\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/cjpeg.bsc" 
-BSC32_SBRS= \
-	
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/cjpeg.pdb" /machine:I386 /out:"$(OUTDIR)/cjpeg.exe" 
-LINK32_OBJS= \
-	"$(INTDIR)\cjpeg.obj" \
-	"$(INTDIR)\rdppm.obj" \
-	"$(INTDIR)\rdgif.obj" \
-	"$(INTDIR)\rdtarga.obj" \
-	"$(INTDIR)\rdrle.obj" \
-	"$(INTDIR)\rdbmp.obj" \
-	"$(INTDIR)\rdswitch.obj" \
-	"$(INTDIR)\cdjpeg.obj" \
-
-
-"$(OUTDIR)\cjpeg.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
-    $(LINK32) @<<
-  $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF  "$(CFG)" == "djpeg - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "djpeg\Release"
-# PROP BASE Intermediate_Dir "djpeg\Release"
-# PROP BASE Target_Dir "djpeg"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "djpeg\Release"
-# PROP Intermediate_Dir "djpeg\Release"
-# PROP Target_Dir "djpeg"
-OUTDIR=.\djpeg\Release
-INTDIR=.\djpeg\Release
-
-ALL : "$(OUTDIR)\djpeg.exe"
-
-CLEAN : 
-	-@erase "$(INTDIR)\djpeg.obj"
-	-@erase "$(INTDIR)\wrppm.obj"
-	-@erase "$(INTDIR)\wrgif.obj"
-	-@erase "$(INTDIR)\wrtarga.obj"
-	-@erase "$(INTDIR)\wrrle.obj"
-	-@erase "$(INTDIR)\wrbmp.obj"
-	-@erase "$(INTDIR)\rdcolmap.obj"
-	-@erase "$(INTDIR)\cdjpeg.obj"
-	-@erase "$(OUTDIR)\djpeg.exe"
-
-"$(OUTDIR)" :
-    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/djpeg.pch" /YX /Fo"$(INTDIR)/" /c 
-CPP_OBJS=.\djpeg\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/djpeg.bsc" 
-BSC32_SBRS= \
-	
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/djpeg.pdb" /machine:I386 /out:"$(OUTDIR)/djpeg.exe" 
-LINK32_OBJS= \
-	"$(INTDIR)\djpeg.obj" \
-	"$(INTDIR)\wrppm.obj" \
-	"$(INTDIR)\wrgif.obj" \
-	"$(INTDIR)\wrtarga.obj" \
-	"$(INTDIR)\wrrle.obj" \
-	"$(INTDIR)\wrbmp.obj" \
-	"$(INTDIR)\rdcolmap.obj" \
-	"$(INTDIR)\cdjpeg.obj" \
-
-
-"$(OUTDIR)\djpeg.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
-    $(LINK32) @<<
-  $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF  "$(CFG)" == "jpegtran - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "jpegtran\Release"
-# PROP BASE Intermediate_Dir "jpegtran\Release"
-# PROP BASE Target_Dir "jpegtran"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "jpegtran\Release"
-# PROP Intermediate_Dir "jpegtran\Release"
-# PROP Target_Dir "jpegtran"
-OUTDIR=.\jpegtran\Release
-INTDIR=.\jpegtran\Release
-
-ALL : "$(OUTDIR)\jpegtran.exe"
-
-CLEAN : 
-	-@erase "$(INTDIR)\jpegtran.obj"
-	-@erase "$(INTDIR)\rdswitch.obj"
-	-@erase "$(INTDIR)\cdjpeg.obj"
-	-@erase "$(INTDIR)\transupp.obj"
-	-@erase "$(OUTDIR)\jpegtran.exe"
-
-"$(OUTDIR)" :
-    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/jpegtran.pch" /YX /Fo"$(INTDIR)/" /c 
-CPP_OBJS=.\jpegtran\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/jpegtran.bsc" 
-BSC32_SBRS= \
-	
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/jpegtran.pdb" /machine:I386 /out:"$(OUTDIR)/jpegtran.exe" 
-LINK32_OBJS= \
-	"$(INTDIR)\jpegtran.obj" \
-	"$(INTDIR)\rdswitch.obj" \
-	"$(INTDIR)\cdjpeg.obj" \
-	"$(INTDIR)\transupp.obj" \
-
-
-"$(OUTDIR)\jpegtran.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
-    $(LINK32) @<<
-  $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF  "$(CFG)" == "rdjpgcom - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "rdjpgcom\Release"
-# PROP BASE Intermediate_Dir "rdjpgcom\Release"
-# PROP BASE Target_Dir "rdjpgcom"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "rdjpgcom\Release"
-# PROP Intermediate_Dir "rdjpgcom\Release"
-# PROP Target_Dir "rdjpgcom"
-OUTDIR=.\rdjpgcom\Release
-INTDIR=.\rdjpgcom\Release
-
-ALL : "$(OUTDIR)\rdjpgcom.exe"
-
-CLEAN : 
-	-@erase "$(INTDIR)\rdjpgcom.obj"
-	-@erase "$(OUTDIR)\rdjpgcom.exe"
-
-"$(OUTDIR)" :
-    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/rdjpgcom.pch" /YX /Fo"$(INTDIR)/" /c 
-CPP_OBJS=.\rdjpgcom\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/rdjpgcom.bsc" 
-BSC32_SBRS= \
-	
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/rdjpgcom.pdb" /machine:I386 /out:"$(OUTDIR)/rdjpgcom.exe" 
-LINK32_OBJS= \
-	"$(INTDIR)\rdjpgcom.obj"
-
-"$(OUTDIR)\rdjpgcom.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
-    $(LINK32) @<<
-  $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF  "$(CFG)" == "wrjpgcom - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "wrjpgcom\Release"
-# PROP BASE Intermediate_Dir "wrjpgcom\Release"
-# PROP BASE Target_Dir "wrjpgcom"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "wrjpgcom\Release"
-# PROP Intermediate_Dir "wrjpgcom\Release"
-# PROP Target_Dir "wrjpgcom"
-OUTDIR=.\wrjpgcom\Release
-INTDIR=.\wrjpgcom\Release
-
-ALL : "$(OUTDIR)\wrjpgcom.exe"
-
-CLEAN : 
-	-@erase "$(INTDIR)\wrjpgcom.obj"
-	-@erase "$(OUTDIR)\wrjpgcom.exe"
-
-"$(OUTDIR)" :
-    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/wrjpgcom.pch" /YX /Fo"$(INTDIR)/" /c 
-CPP_OBJS=.\wrjpgcom\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/wrjpgcom.bsc" 
-BSC32_SBRS= \
-	
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/wrjpgcom.pdb" /machine:I386 /out:"$(OUTDIR)/wrjpgcom.exe" 
-LINK32_OBJS= \
-	"$(INTDIR)\wrjpgcom.obj"
-
-"$(OUTDIR)\wrjpgcom.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
-    $(LINK32) @<<
-  $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ENDIF 
-
-.c{$(CPP_OBJS)}.obj:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cpp{$(CPP_OBJS)}.obj:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cxx{$(CPP_OBJS)}.obj:
-   $(CPP) $(CPP_PROJ) $<  
-
-.c{$(CPP_SBRS)}.sbr:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cpp{$(CPP_SBRS)}.sbr:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cxx{$(CPP_SBRS)}.sbr:
-   $(CPP) $(CPP_PROJ) $<  
-
-################################################################################
-# Begin Target
-
-# Name "cjpeg - Win32"
-
-!IF  "$(CFG)" == "cjpeg - Win32"
-
-!ENDIF 
-
-################################################################################
-# Begin Source File
-
-SOURCE="cjpeg.c"
-DEP_CPP_CJPEG=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	"jversion.h"\
-	
-
-"$(INTDIR)\cjpeg.obj" : $(SOURCE) $(DEP_CPP_CJPEG) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="cdjpeg.c"
-DEP_CPP_CDJPE=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\cdjpeg.obj" : $(SOURCE) $(DEP_CPP_CDJPE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdswitch.c"
-DEP_CPP_RDSWI=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdswitch.obj" : $(SOURCE) $(DEP_CPP_RDSWI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdppm.c"
-DEP_CPP_RDPPM=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdppm.obj" : $(SOURCE) $(DEP_CPP_RDPPM) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdgif.c"
-DEP_CPP_RDGIF=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdgif.obj" : $(SOURCE) $(DEP_CPP_RDGIF) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdtarga.c"
-DEP_CPP_RDTAR=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdtarga.obj" : $(SOURCE) $(DEP_CPP_RDTAR) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdbmp.c"
-DEP_CPP_RDBMP=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdbmp.obj" : $(SOURCE) $(DEP_CPP_RDBMP) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdrle.c"
-DEP_CPP_RDRLE=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdrle.obj" : $(SOURCE) $(DEP_CPP_RDRLE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "djpeg - Win32"
-
-!IF  "$(CFG)" == "djpeg - Win32"
-
-!ENDIF 
-
-################################################################################
-# Begin Source File
-
-SOURCE="djpeg.c"
-DEP_CPP_DJPEG=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	"jversion.h"\
-	
-
-"$(INTDIR)\djpeg.obj" : $(SOURCE) $(DEP_CPP_DJPEG) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="cdjpeg.c"
-DEP_CPP_CDJPE=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\cdjpeg.obj" : $(SOURCE) $(DEP_CPP_CDJPE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdcolmap.c"
-DEP_CPP_RDCOL=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdcolmap.obj" : $(SOURCE) $(DEP_CPP_RDCOL) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrppm.c"
-DEP_CPP_WRPPM=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\wrppm.obj" : $(SOURCE) $(DEP_CPP_WRPPM) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrgif.c"
-DEP_CPP_WRGIF=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\wrgif.obj" : $(SOURCE) $(DEP_CPP_WRGIF) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrtarga.c"
-DEP_CPP_WRTAR=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\wrtarga.obj" : $(SOURCE) $(DEP_CPP_WRTAR) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrbmp.c"
-DEP_CPP_WRBMP=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\wrbmp.obj" : $(SOURCE) $(DEP_CPP_WRBMP) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrrle.c"
-DEP_CPP_WRRLE=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\wrrle.obj" : $(SOURCE) $(DEP_CPP_WRRLE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "jpegtran - Win32"
-
-!IF  "$(CFG)" == "jpegtran - Win32"
-
-!ENDIF 
-
-################################################################################
-# Begin Source File
-
-SOURCE="jpegtran.c"
-DEP_CPP_JPEGT=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	"transupp.h"\
-	"jversion.h"\
-	
-
-"$(INTDIR)\jpegtran.obj" : $(SOURCE) $(DEP_CPP_JPEGT) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="cdjpeg.c"
-DEP_CPP_CDJPE=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\cdjpeg.obj" : $(SOURCE) $(DEP_CPP_CDJPE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdswitch.c"
-DEP_CPP_RDSWI=\
-	"cdjpeg.h"\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	"cderror.h"\
-	
-
-"$(INTDIR)\rdswitch.obj" : $(SOURCE) $(DEP_CPP_RDSWI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="transupp.c"
-DEP_CPP_TRANS=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"transupp.h"\
-	
-
-"$(INTDIR)\transupp.obj" : $(SOURCE) $(DEP_CPP_TRANS) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "rdjpgcom - Win32"
-
-!IF  "$(CFG)" == "rdjpgcom - Win32"
-
-!ENDIF 
-
-################################################################################
-# Begin Source File
-
-SOURCE="rdjpgcom.c"
-DEP_CPP_RDJPG=\
-	"jinclude.h"\
-	"jconfig.h"\
-	
-
-"$(INTDIR)\rdjpgcom.obj" : $(SOURCE) $(DEP_CPP_RDJPG) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "wrjpgcom - Win32"
-
-!IF  "$(CFG)" == "wrjpgcom - Win32"
-
-!ENDIF 
-
-################################################################################
-# Begin Source File
-
-SOURCE="wrjpgcom.c"
-DEP_CPP_WRJPG=\
-	"jinclude.h"\
-	"jconfig.h"\
-	
-
-"$(INTDIR)\wrjpgcom.obj" : $(SOURCE) $(DEP_CPP_WRJPG) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-# End Project
-################################################################################
-

diff --git a/makefile.ansi b/makefile.ansi
deleted file mode 100644
index 8291913..0000000
--- a/makefile.ansi
+++ /dev/null

@@ -1,214 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Unix-like systems with ANSI-capable compilers.
-# If you have a non-ANSI compiler, makefile.unix is a better starting point.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-CFLAGS= -O
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-LDFLAGS= 
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= 
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= $(CC)
-# file deletion command
-RM= rm -f
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
-        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
-        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
-        cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
-        cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.a cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-libjpeg.a: $(LIBOBJECTS)
-	$(RM) libjpeg.a
-	$(AR) libjpeg.a  $(LIBOBJECTS)
-	$(AR2) libjpeg.a
-
-cjpeg: $(COBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
-	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
-	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean:
-	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) core testout*
-
-test: cjpeg djpeg jpegtran
-	$(RM) testout*
-	./djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	./jpegtran -outfile testoutt.jpg testprog.jpg
-	cmp testimg.ppm testout.ppm
-	cmp testimg.bmp testout.bmp
-	cmp testimg.jpg testout.jpg
-	cmp testimg.ppm testoutp.ppm
-	cmp testimgp.jpg testoutp.jpg
-	cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.bcc b/makefile.bcc
deleted file mode 100644
index a1cfcde..0000000
--- a/makefile.bcc
+++ /dev/null

@@ -1,285 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Borland C on MS-DOS or OS/2.
-# It works with Borland C++ for DOS, revision 3.0 or later,
-# and has been tested with Borland C++ for OS/2.
-# Watch out for optimization bugs in the OS/2 compilers --- see notes below!
-# Thanks to Tom Wright and Ge' Weijers (original DOS) and
-# Ken Porter (OS/2) for this file.
-
-# Read installation instructions before saying "make" !!
-
-# Are we under DOS or OS/2?
-!if !$d(DOS) && !$d(OS2)
-!if $d(__OS2__)
-OS2=1
-!else
-DOS=1
-!endif
-!endif
-
-# The name of your C compiler:
-CC= bcc
-
-# You may need to adjust these cc options:
-!if $d(DOS)
-CFLAGS= -O2 -mm -w-par -w-stu -w-ccc -w-rch
-!else
-CFLAGS= -O1 -w-par -w-stu -w-ccc -w-rch
-!endif
-# -O2 enables full code optimization (for pre-3.0 Borland C++, use -O -G -Z).
-# -O2 is buggy in Borland OS/2 C++ revision 2.0, so use -O1 there for now.
-# If you have Borland OS/2 C++ revision 1.0, use -O or no optimization at all.
-# -mm selects medium memory model (near data, far code pointers; DOS only!)
-# -w-par suppresses warnings about unused function parameters
-# -w-stu suppresses warnings about incomplete structures
-# -w-ccc suppresses warnings about compile-time-constant conditions
-# -w-rch suppresses warnings about unreachable code
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-!if $d(DOS)
-LDFLAGS= -mm
-# memory model option here must match CFLAGS!
-!else
-LDFLAGS=
-# -lai full-screen app
-# -lc case-significant link
-!endif
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.
-# For DOS, we recommend jmemdos.c and jmemdosa.asm.
-# For OS/2, we recommend jmemnobs.c (flat memory!)
-# SYSDEPMEMLIB must list the same files with "+" signs for the librarian.
-!if $d(DOS)
-SYSDEPMEM= jmemdos.obj jmemdosa.obj
-SYSDEPMEMLIB= +jmemdos.obj +jmemdosa.obj
-!else
-SYSDEPMEM= jmemnobs.obj
-SYSDEPMEMLIB= +jmemnobs.obj
-!endif
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
-        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
-        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
-        jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
-        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
-        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
-        jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
-        jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
-        rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
-        rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS)
-	- del libjpeg.lib
-	tlib libjpeg.lib /E /C @&&|
-+jcapimin.obj +jcapistd.obj +jctrans.obj +jcparam.obj +jdatadst.obj &
-+jcinit.obj +jcmaster.obj +jcmarker.obj +jcmainct.obj +jcprepct.obj &
-+jccoefct.obj +jccolor.obj +jcsample.obj +jchuff.obj +jcphuff.obj &
-+jcdctmgr.obj +jfdctfst.obj +jfdctflt.obj +jfdctint.obj +jdapimin.obj &
-+jdapistd.obj +jdtrans.obj +jdatasrc.obj +jdmaster.obj +jdinput.obj &
-+jdmarker.obj +jdhuff.obj +jdphuff.obj +jdmainct.obj +jdcoefct.obj &
-+jdpostct.obj +jddctmgr.obj +jidctfst.obj +jidctflt.obj +jidctint.obj &
-+jidctred.obj +jdsample.obj +jdcolor.obj +jquant1.obj +jquant2.obj &
-+jdmerge.obj +jcomapi.obj +jutils.obj +jerror.obj +jmemmgr.obj &
-$(SYSDEPMEMLIB)
-|
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
-	$(CC) $(LDFLAGS) -ecjpeg.exe $(COBJECTS) libjpeg.lib
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
-	$(CC) $(LDFLAGS) -edjpeg.exe $(DOBJECTS) libjpeg.lib
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
-	$(CC) $(LDFLAGS) -ejpegtran.exe $(TROBJECTS) libjpeg.lib
-
-rdjpgcom.exe: rdjpgcom.c
-!if $d(DOS)
-	$(CC) -ms -O rdjpgcom.c
-!else
-	$(CC) $(CFLAGS) rdjpgcom.c
-!endif
-
-# On DOS, wrjpgcom needs large model so it can malloc a 64K chunk
-wrjpgcom.exe: wrjpgcom.c
-!if $d(DOS)
-	$(CC) -ml -O wrjpgcom.c
-!else
-	$(CC) $(CFLAGS) wrjpgcom.c
-!endif
-
-# This "{}" syntax allows Borland Make to "batch" source files.
-# In this way, each run of the compiler can build many modules.
-.c.obj:
-	$(CC) $(CFLAGS) -c{ $<}
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean:
-	- del *.obj
-	- del libjpeg.lib
-	- del cjpeg.exe
-	- del djpeg.exe
-	- del jpegtran.exe
-	- del rdjpgcom.exe
-	- del wrjpgcom.exe
-	- del testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
-	- del testout*.*
-	djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	jpegtran -outfile testoutt.jpg testprog.jpg
-!if $d(DOS)
-	fc /b testimg.ppm testout.ppm
-	fc /b testimg.bmp testout.bmp
-	fc /b testimg.jpg testout.jpg
-	fc /b testimg.ppm testoutp.ppm
-	fc /b testimgp.jpg testoutp.jpg
-	fc /b testorig.jpg testoutt.jpg
-!else
-	echo n > n.tmp
-	comp testimg.ppm testout.ppm < n.tmp
-	comp testimg.bmp testout.bmp < n.tmp
-	comp testimg.jpg testout.jpg < n.tmp
-	comp testimg.ppm testoutp.ppm < n.tmp
-	comp testimgp.jpg testoutp.jpg < n.tmp
-	comp testorig.jpg testoutt.jpg < n.tmp
-	del n.tmp
-!endif
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-jmemdosa.obj: jmemdosa.asm
-	tasm /mx jmemdosa.asm

diff --git a/makefile.cfg b/makefile.cfg
deleted file mode 100644
index f25e42e..0000000
--- a/makefile.cfg
+++ /dev/null

@@ -1,319 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# makefile.cfg is edited by configure to produce a custom Makefile.
-
-# Read installation instructions before saying "make" !!
-
-# For compiling with source and object files in different directories.
-srcdir = @srcdir@
-VPATH = @srcdir@
-
-# Where to install the programs and man pages.
-prefix = @prefix@
-exec_prefix = @exec_prefix@
-bindir = $(exec_prefix)/bin
-libdir = $(exec_prefix)/lib
-includedir = $(prefix)/include
-binprefix =
-manprefix =
-manext = 1
-mandir = $(prefix)/man/man$(manext)
-
-# The name of your C compiler:
-CC= @CC@
-
-# You may need to adjust these cc options:
-CFLAGS= @CFLAGS@ @CPPFLAGS@ @INCLUDEFLAGS@
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-# However, any special defines for ansi2knr.c may be included here:
-ANSI2KNRFLAGS= @ANSI2KNRFLAGS@
-
-# Link-time cc options:
-LDFLAGS= @LDFLAGS@
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= @LIBS@
-
-# If using GNU libtool, LIBTOOL references it; if not, LIBTOOL is empty.
-LIBTOOL = @LIBTOOL@
-# $(O) expands to "lo" if using libtool, plain "o" if not.
-# Similarly, $(A) expands to "la" or "a".
-O = @O@
-A = @A@
-
-# Library version ID; libtool uses this for the shared library version number.
-# Note: we suggest this match the macro of the same name in jpeglib.h.
-JPEG_LIB_VERSION = @JPEG_LIB_VERSION@
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= @MEMORYMGR@
-
-# miscellaneous OS-dependent stuff
-SHELL= /bin/sh
-# linker
-LN= @LN@
-# file deletion command
-RM= rm -f
-# directory creation command
-MKDIR= mkdir
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= @RANLIB@
-# installation program
-INSTALL= @INSTALL@
-INSTALL_PROGRAM= @INSTALL_PROGRAM@
-INSTALL_LIB= @INSTALL_LIB@
-INSTALL_DATA= @INSTALL_DATA@
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.$(O) jcapistd.$(O) jctrans.$(O) jcparam.$(O) \
-        jdatadst.$(O) jcinit.$(O) jcmaster.$(O) jcmarker.$(O) jcmainct.$(O) \
-        jcprepct.$(O) jccoefct.$(O) jccolor.$(O) jcsample.$(O) jchuff.$(O) \
-        jcphuff.$(O) jcdctmgr.$(O) jfdctfst.$(O) jfdctflt.$(O) \
-        jfdctint.$(O)
-# decompression library object files
-DLIBOBJECTS= jdapimin.$(O) jdapistd.$(O) jdtrans.$(O) jdatasrc.$(O) \
-        jdmaster.$(O) jdinput.$(O) jdmarker.$(O) jdhuff.$(O) jdphuff.$(O) \
-        jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) \
-        jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \
-        jdsample.$(O) jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O)
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.$(O) rdppm.$(O) rdgif.$(O) rdtarga.$(O) rdrle.$(O) \
-        rdbmp.$(O) rdswitch.$(O) cdjpeg.$(O)
-DOBJECTS= djpeg.$(O) wrppm.$(O) wrgif.$(O) wrtarga.$(O) wrrle.$(O) \
-        wrbmp.$(O) rdcolmap.$(O) cdjpeg.$(O)
-TROBJECTS= jpegtran.$(O) rdswitch.$(O) cdjpeg.$(O) transupp.$(O)
-
-
-all: @A2K_DEPS@ libjpeg.$(A) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-# Special compilation rules to support ansi2knr and libtool.
-.SUFFIXES: .lo .la
-
-# How to compile with libtool.
-@COM_LT@.c.lo:
-@COM_LT@	$(LIBTOOL) --mode=compile $(CC) $(CFLAGS) -c $(srcdir)/$*.c
-
-# How to use ansi2knr, when not using libtool.
-@COM_A2K@.c.o:
-@COM_A2K@	./ansi2knr $(srcdir)/$*.c knr/$*.c
-@COM_A2K@	$(CC) $(CFLAGS) -c knr/$*.c
-@COM_A2K@	$(RM) knr/$*.c
-
-# How to use ansi2knr AND libtool.
-@COM_A2K@.c.lo:
-@COM_A2K@	./ansi2knr $(srcdir)/$*.c knr/$*.c
-@COM_A2K@	$(LIBTOOL) --mode=compile $(CC) $(CFLAGS) -c knr/$*.c
-@COM_A2K@	$(RM) knr/$*.c
-
-ansi2knr: ansi2knr.c
-	$(CC) $(CFLAGS) $(ANSI2KNRFLAGS) -o ansi2knr $(srcdir)/ansi2knr.c
-	$(MKDIR) knr
-
-# the library:
-
-# without libtool:
-libjpeg.a: @A2K_DEPS@ $(LIBOBJECTS)
-	$(RM) libjpeg.a
-	$(AR) libjpeg.a  $(LIBOBJECTS)
-	$(AR2) libjpeg.a
-
-# with libtool:
-libjpeg.la: @A2K_DEPS@ $(LIBOBJECTS)
-	$(LIBTOOL) --mode=link $(CC) -o libjpeg.la $(LIBOBJECTS) \
-		-rpath $(libdir) -version-info $(JPEG_LIB_VERSION)
-
-# sample programs:
-
-cjpeg: $(COBJECTS) libjpeg.$(A)
-	$(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.$(A) $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.$(A)
-	$(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.$(A) $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.$(A)
-	$(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.$(A) $(LDLIBS)
-
-rdjpgcom: rdjpgcom.$(O)
-	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.$(O) $(LDLIBS)
-
-wrjpgcom: wrjpgcom.$(O)
-	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.$(O) $(LDLIBS)
-
-# Installation rules:
-
-install: cjpeg djpeg jpegtran rdjpgcom wrjpgcom @FORCE_INSTALL_LIB@
-	$(INSTALL_PROGRAM) cjpeg $(bindir)/$(binprefix)cjpeg
-	$(INSTALL_PROGRAM) djpeg $(bindir)/$(binprefix)djpeg
-	$(INSTALL_PROGRAM) jpegtran $(bindir)/$(binprefix)jpegtran
-	$(INSTALL_PROGRAM) rdjpgcom $(bindir)/$(binprefix)rdjpgcom
-	$(INSTALL_PROGRAM) wrjpgcom $(bindir)/$(binprefix)wrjpgcom
-	$(INSTALL_DATA) $(srcdir)/cjpeg.1 $(mandir)/$(manprefix)cjpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/djpeg.1 $(mandir)/$(manprefix)djpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
-	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
-	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
-
-install-lib: libjpeg.$(A) install-headers
-	$(INSTALL_LIB) libjpeg.$(A) $(libdir)/$(binprefix)libjpeg.$(A)
-
-install-headers: jconfig.h
-	$(INSTALL_DATA) jconfig.h $(includedir)/jconfig.h
-	$(INSTALL_DATA) $(srcdir)/jpeglib.h $(includedir)/jpeglib.h
-	$(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
-	$(INSTALL_DATA) $(srcdir)/jerror.h $(includedir)/jerror.h
-
-clean:
-	$(RM) *.o *.lo libjpeg.a libjpeg.la
-	$(RM) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout* config.log config.status
-	$(RM) -r knr .libs _libs
-
-distclean: clean
-	$(RM) Makefile jconfig.h libtool config.cache
-
-test: cjpeg djpeg jpegtran
-	$(RM) testout*
-	./djpeg -dct int -ppm -outfile testout.ppm  $(srcdir)/testorig.jpg
-	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp  $(srcdir)/testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  $(srcdir)/testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
-	./jpegtran -outfile testoutt.jpg $(srcdir)/testprog.jpg
-	cmp $(srcdir)/testimg.ppm testout.ppm
-	cmp $(srcdir)/testimg.bmp testout.bmp
-	cmp $(srcdir)/testimg.jpg testout.jpg
-	cmp $(srcdir)/testimg.ppm testoutp.ppm
-	cmp $(srcdir)/testimgp.jpg testoutp.jpg
-	cmp $(srcdir)/testorig.jpg testoutt.jpg
-
-check: test
-
-# Mistake catcher:
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-# GNU Make likes to know which target names are not really files to be made:
-.PHONY: all install install-lib install-headers clean distclean test check
-
-
-jcapimin.$(O): jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.$(O): jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.$(O): jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.$(O): jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.$(O): jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.$(O): jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.$(O): jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.$(O): jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.$(O): jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.$(O): jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.$(O): jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.$(O): jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.$(O): jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.$(O): jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.$(O): jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.$(O): jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.$(O): jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.$(O): jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.$(O): jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.$(O): jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.$(O): jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.$(O): jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.$(O): jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.$(O): jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.$(O): jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.$(O): jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.$(O): jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.$(O): jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.$(O): jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.$(O): jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.$(O): jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.$(O): jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.$(O): jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.$(O): jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.$(O): jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.$(O): jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.$(O): jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.$(O): jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.$(O): jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.$(O): jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.$(O): jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.$(O): jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.$(O): jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.$(O): jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.$(O): jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.$(O): cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.$(O): djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.$(O): jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.$(O): rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.$(O): wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.$(O): cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.$(O): rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.$(O): rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.$(O): transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.$(O): rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.$(O): wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.$(O): rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.$(O): wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.$(O): rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.$(O): wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.$(O): rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.$(O): wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.$(O): rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.$(O): wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.dj b/makefile.dj
deleted file mode 100644
index f766d25..0000000
--- a/makefile.dj
+++ /dev/null

@@ -1,220 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for DJGPP (Delorie's GNU C port on MS-DOS), v2.0 or later.
-# Thanks to Frank J. Donahoe for this version.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= gcc
-
-# You may need to adjust these cc options:
-CFLAGS= -O2 -Wall -I.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-LDFLAGS= -s
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= 
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For DJGPP this is usually jmemnobs.o, but you could
-# use jmemname.o if you want to use named temp files instead of swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= $(CC)
-# file deletion command
-RM= del
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
-        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
-        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
-        cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
-        cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.a cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.a: $(LIBOBJECTS)
-	$(RM) libjpeg.a
-	$(AR) libjpeg.a  $(LIBOBJECTS)
-	$(AR2) libjpeg.a
-
-cjpeg.exe: $(COBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o cjpeg.exe $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg.exe: $(DOBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o djpeg.exe $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran.exe: $(TROBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o jpegtran.exe $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom.exe: rdjpgcom.o
-	$(LN) $(LDFLAGS) -o rdjpgcom.exe rdjpgcom.o $(LDLIBS)
-
-wrjpgcom.exe: wrjpgcom.o
-	$(LN) $(LDFLAGS) -o wrjpgcom.exe wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean:
-	$(RM) *.o
-	$(RM) cjpeg.exe
-	$(RM) djpeg.exe
-	$(RM) jpegtran.exe
-	$(RM) rdjpgcom.exe
-	$(RM) wrjpgcom.exe
-	$(RM) libjpeg.a
-	$(RM) testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
-	$(RM) testout*.*
-	./djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	./jpegtran -outfile testoutt.jpg testprog.jpg
-	fc /b testimg.ppm testout.ppm
-	fc /b testimg.bmp testout.bmp
-	fc /b testimg.jpg testout.jpg
-	fc /b testimg.ppm testoutp.ppm
-	fc /b testimgp.jpg testoutp.jpg
-	fc /b testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.manx b/makefile.manx
deleted file mode 100644
index 4cb42d1..0000000
--- a/makefile.manx
+++ /dev/null

@@ -1,214 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Amiga systems using Manx Aztec C ver 5.x.
-# Thanks to D.J. James (djjames@cup.portal.com) for this version.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-# Uncomment for generic 68000 code (will work on any Amiga)
-ARCHFLAGS= -sn
-
-# Uncomment for 68020/68030 code (faster, but won't run on 68000 CPU)
-#ARCHFLAGS= -c2
-
-CFLAGS= -MC -MD $(ARCHFLAGS) -spfam -r4
-
-# Link-time cc options:
-LDFLAGS= -g
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= -lml -lcl
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Amiga we recommend jmemname.o.
-SYSDEPMEM= jmemname.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= ln
-# file deletion command
-RM= delete quiet
-# library (.lib) file creation command
-AR= lb
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
-        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
-        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
-        cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
-        cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.lib cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-libjpeg.lib: $(LIBOBJECTS)
-	-$(RM) libjpeg.lib
-	$(AR) libjpeg.lib  $(LIBOBJECTS)
-
-cjpeg: $(COBJECTS) libjpeg.lib
-	$(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.lib $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.lib
-	$(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.lib $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.lib
-	$(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.lib $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
-	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
-	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean:
-	-$(RM) *.o cjpeg djpeg jpegtran libjpeg.lib rdjpgcom wrjpgcom
-	-$(RM) core testout*.*
-
-test: cjpeg djpeg jpegtran
-	-$(RM) testout*.*
-	djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	jpegtran -outfile testoutt.jpg testprog.jpg
-	cmp testimg.ppm testout.ppm
-	cmp testimg.bmp testout.bmp
-	cmp testimg.jpg testout.jpg
-	cmp testimg.ppm testoutp.ppm
-	cmp testimgp.jpg testoutp.jpg
-	cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.mc6 b/makefile.mc6
deleted file mode 100644
index 6aff054..0000000
--- a/makefile.mc6
+++ /dev/null

@@ -1,249 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Microsoft C for MS-DOS, version 6.00A and up.
-# Use NMAKE, not Microsoft's brain-damaged MAKE.
-# Thanks to Alan Wright and Chris Turner of Olivetti Research Ltd.
-
-# Read installation instructions before saying "nmake" !!
-
-# You may need to adjust these compiler options:
-CFLAGS = -AM -Oecigt -Gs -W3
-# -AM medium memory model (or use -AS for small model, if you remove features)
-# -Oecigt -Gs  maximum safe optimisation (-Ol has bugs in MSC 6.00A)
-# -W3 warning level 3
-# You might also want to add -G2 if you have an 80286, etc.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Jan-Herman Buining suggests the following switches for MS C 8.0 and a 486:
-# CFLAGS = /AM /f- /FPi87 /G3 /Gs /Gy /Ob1 /Oc /Oe /Og /Oi /Ol /On /Oo /Ot \
-#          /OV4 /W3
-# except for jquant1.c, which must be compiled with /Oo- to avoid a compiler
-# crash.
-
-# Ingar Steinsland suggests the following switches when building
-# a 16-bit Windows DLL:
-# CFLAGS = -ALw -Gsw -Zpe -W3 -O2 -Zi -Zd
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For DOS, we recommend jmemdos.c and jmemdosa.asm.
-# (But not for Windows; see install.doc if you use this makefile for Windows.)
-SYSDEPMEM= jmemdos.obj jmemdosa.obj
-# SYSDEPMEMLIB must list the same files with "+" signs for the librarian.
-SYSDEPMEMLIB= +jmemdos.obj +jmemdosa.obj
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
-        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
-        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
-        jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
-        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
-        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
-        jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
-        jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
-        rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
-        rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-# need linker response file because file list > 128 chars
-RFILE = libjpeg.ans
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS) $(RFILE)
-	del libjpeg.lib
-	lib @$(RFILE)
-
-# linker response file for building libjpeg.lib
-$(RFILE) : makefile
-	del $(RFILE)
-	echo libjpeg.lib >$(RFILE)
-# silly want-to-create-it prompt:
-	echo y >>$(RFILE)
-	echo +jcapimin.obj +jcapistd.obj +jctrans.obj +jcparam.obj & >>$(RFILE)
-	echo +jdatadst.obj +jcinit.obj +jcmaster.obj +jcmarker.obj & >>$(RFILE)
-	echo +jcmainct.obj +jcprepct.obj +jccoefct.obj & >>$(RFILE)
-	echo +jccolor.obj +jcsample.obj +jchuff.obj +jcphuff.obj & >>$(RFILE)
-	echo +jcdctmgr.obj +jfdctfst.obj +jfdctflt.obj & >>$(RFILE)
-	echo +jfdctint.obj +jdapimin.obj +jdapistd.obj & >>$(RFILE)
-	echo +jdtrans.obj +jdatasrc.obj +jdmaster.obj +jdinput.obj & >>$(RFILE)
-	echo +jdmarker.obj +jdhuff.obj +jdphuff.obj +jdmainct.obj & >>$(RFILE)
-	echo +jdcoefct.obj +jdpostct.obj +jddctmgr.obj & >>$(RFILE)
-	echo +jidctfst.obj +jidctflt.obj +jidctint.obj & >>$(RFILE)
-	echo +jidctred.obj +jdsample.obj +jdcolor.obj +jquant1.obj & >>$(RFILE)
-	echo +jquant2.obj +jdmerge.obj +jcomapi.obj +jutils.obj & >>$(RFILE)
-	echo +jerror.obj +jmemmgr.obj & >>$(RFILE)
-	echo $(SYSDEPMEMLIB) ; >>$(RFILE)
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
-	echo $(COBJECTS) >cjpeg.lst
-	link /STACK:4096 /EXEPACK @cjpeg.lst, cjpeg.exe, , libjpeg.lib, ;
-	del cjpeg.lst
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
-	echo $(DOBJECTS) >djpeg.lst
-	link /STACK:4096 /EXEPACK @djpeg.lst, djpeg.exe, , libjpeg.lib, ;
-	del djpeg.lst
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
-	link /STACK:4096 /EXEPACK $(TROBJECTS), jpegtran.exe, , libjpeg.lib, ;
-
-rdjpgcom.exe: rdjpgcom.c
-	$(CC) -AS -O -W3 rdjpgcom.c
-
-# wrjpgcom needs large model so it can malloc a 64K chunk
-wrjpgcom.exe: wrjpgcom.c
-	$(CC) -AL -O -W3 wrjpgcom.c
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean:
-	del *.obj
-	del libjpeg.lib
-	del cjpeg.exe
-	del djpeg.exe
-	del jpegtran.exe
-	del rdjpgcom.exe
-	del wrjpgcom.exe
-	del testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
-	del testout*.*
-	djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	jpegtran -outfile testoutt.jpg testprog.jpg
-	fc /b testimg.ppm testout.ppm
-	fc /b testimg.bmp testout.bmp
-	fc /b testimg.jpg testout.jpg
-	fc /b testimg.ppm testoutp.ppm
-	fc /b testimgp.jpg testoutp.jpg
-	fc /b testorig.jpg testoutt.jpg
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-jmemdosa.obj : jmemdosa.asm
-	masm /mx $*;

diff --git a/makefile.mms b/makefile.mms
deleted file mode 100644
index cf130e5..0000000
--- a/makefile.mms
+++ /dev/null

@@ -1,218 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for use with MMS on Digital VMS systems.
-# Thanks to Rick Dyson (dyson@iowasp.physics.uiowa.edu)
-# and Tim Bell (tbell@netcom.com) for their help.
-
-# Read installation instructions before saying "MMS" !!
-
-# You may need to adjust these cc options:
-CFLAGS= $(CFLAGS) /NoDebug /Optimize
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via /Define switches here.
-.ifdef ALPHA
-OPT=
-.else
-OPT= ,Sys$Disk:[]MAKVMS.OPT/Option
-.endif
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.obj
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
-        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
-        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
-        jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
-        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
-        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
-        jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
-        jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.olb
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
-        rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
-        rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-# objectfile lists with commas --- what a crock
-COBJLIST= cjpeg.obj,rdppm.obj,rdgif.obj,rdtarga.obj,rdrle.obj,rdbmp.obj,\
-          rdswitch.obj,cdjpeg.obj
-DOBJLIST= djpeg.obj,wrppm.obj,wrgif.obj,wrtarga.obj,wrrle.obj,wrbmp.obj,\
-          rdcolmap.obj,cdjpeg.obj
-TROBJLIST= jpegtran.obj,rdswitch.obj,cdjpeg.obj,transupp.obj
-LIBOBJLIST= jcapimin.obj,jcapistd.obj,jctrans.obj,jcparam.obj,jdatadst.obj,\
-          jcinit.obj,jcmaster.obj,jcmarker.obj,jcmainct.obj,jcprepct.obj,\
-          jccoefct.obj,jccolor.obj,jcsample.obj,jchuff.obj,jcphuff.obj,\
-          jcdctmgr.obj,jfdctfst.obj,jfdctflt.obj,jfdctint.obj,jdapimin.obj,\
-          jdapistd.obj,jdtrans.obj,jdatasrc.obj,jdmaster.obj,jdinput.obj,\
-          jdmarker.obj,jdhuff.obj,jdphuff.obj,jdmainct.obj,jdcoefct.obj,\
-          jdpostct.obj,jddctmgr.obj,jidctfst.obj,jidctflt.obj,jidctint.obj,\
-          jidctred.obj,jdsample.obj,jdcolor.obj,jquant1.obj,jquant2.obj,\
-          jdmerge.obj,jcomapi.obj,jutils.obj,jerror.obj,jmemmgr.obj,$(SYSDEPMEM)
-
-
-.first
-	@- Define /NoLog Sys Sys$Library
-
-ALL : libjpeg.olb cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-	@ Continue
-
-libjpeg.olb : $(LIBOBJECTS)
-	Library /Create libjpeg.olb $(LIBOBJLIST)
-
-cjpeg.exe : $(COBJECTS) libjpeg.olb
-	$(LINK) $(LFLAGS) /Executable = cjpeg.exe $(COBJLIST),libjpeg.olb/Library$(OPT)
-
-djpeg.exe : $(DOBJECTS) libjpeg.olb
-	$(LINK) $(LFLAGS) /Executable = djpeg.exe $(DOBJLIST),libjpeg.olb/Library$(OPT)
-
-jpegtran.exe : $(TROBJECTS) libjpeg.olb
-	$(LINK) $(LFLAGS) /Executable = jpegtran.exe $(TROBJLIST),libjpeg.olb/Library$(OPT)
-
-rdjpgcom.exe : rdjpgcom.obj
-	$(LINK) $(LFLAGS) /Executable = rdjpgcom.exe rdjpgcom.obj$(OPT)
-
-wrjpgcom.exe : wrjpgcom.obj
-	$(LINK) $(LFLAGS) /Executable = wrjpgcom.exe wrjpgcom.obj$(OPT)
-
-jconfig.h : jconfig.vms
-	@- Copy jconfig.vms jconfig.h
-
-clean :
-	@- Set Protection = Owner:RWED *.*;-1
-	@- Set Protection = Owner:RWED *.OBJ
-	- Purge /NoLog /NoConfirm *.*
-	- Delete /NoLog /NoConfirm *.OBJ;
-
-test : cjpeg.exe djpeg.exe jpegtran.exe
-	mcr sys$disk:[]djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
-	mcr sys$disk:[]djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
-	mcr sys$disk:[]cjpeg -dct int      -outfile testout.jpg testimg.ppm
-	mcr sys$disk:[]djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	mcr sys$disk:[]cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	mcr sys$disk:[]jpegtran -outfile testoutt.jpg testprog.jpg
-	- Backup /Compare/Log	  testimg.ppm testout.ppm
-	- Backup /Compare/Log	  testimg.bmp testout.bmp
-	- Backup /Compare/Log	  testimg.jpg testout.jpg
-	- Backup /Compare/Log	  testimg.ppm testoutp.ppm
-	- Backup /Compare/Log	  testimgp.jpg testoutp.jpg
-	- Backup /Compare/Log	  testorig.jpg testoutt.jpg
-
-
-jcapimin.obj : jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj : jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj : jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj : jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj : jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj : jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj : jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj : jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj : jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj : jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj : jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj : jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj : jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj : jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj : jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj : jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj : jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj : jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj : jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj : jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj : jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj : jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj : jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj : jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj : jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj : jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj : jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj : jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj : jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj : jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj : jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj : jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj : jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj : jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj : jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj : jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj : jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj : jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj : jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj : jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj : jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj : jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj : jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj : jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj : jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj : jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj : jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj : jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj : jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj : jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj : cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj : djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj : jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj : rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj : wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj : cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj : rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj : rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj : transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj : rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj : wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj : rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj : wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj : rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj : wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj : rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj : wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj : rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj : wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.sas b/makefile.sas
deleted file mode 100644
index f296faf..0000000
--- a/makefile.sas
+++ /dev/null

@@ -1,252 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Amiga systems using SAS C 6.0 and up.
-# Thanks to Ed Hanway, Mark Rinfret, and Jim Zepeda.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= sc
-
-# You may need to adjust these cc options:
-# Uncomment the following lines for generic 680x0 version
-ARCHFLAGS= cpu=any
-SUFFIX=
-
-# Uncomment the following lines for 68030-only version
-#ARCHFLAGS= cpu=68030
-#SUFFIX=.030
-
-CFLAGS= nostackcheck data=near parms=register optimize $(ARCHFLAGS) \
-	ignore=104 ignore=304 ignore=306
-# ignore=104 disables warnings for mismatched const qualifiers
-# ignore=304 disables warnings for variables being optimized out
-# ignore=306 disables warnings for the inlining of functions
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via define switches here.
-
-# Link-time cc options:
-LDFLAGS= SC SD ND BATCH
-
-# To link any special libraries, add the necessary commands here.
-LDLIBS= LIB:scm.lib LIB:sc.lib
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Amiga we recommend jmemname.o.
-SYSDEPMEM= jmemname.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= slink
-# file deletion command
-RM= delete quiet
-# library (.lib) file creation command
-AR= oml
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
-        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
-        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
-        cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
-        cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.lib cjpeg$(SUFFIX) djpeg$(SUFFIX) jpegtran$(SUFFIX) rdjpgcom$(SUFFIX) wrjpgcom$(SUFFIX)
-
-# note: do several AR steps to avoid command line length limitations
-
-libjpeg.lib: $(LIBOBJECTS)
-	-$(RM) libjpeg.lib
-	$(AR) libjpeg.lib r $(CLIBOBJECTS)
-	$(AR) libjpeg.lib r $(DLIBOBJECTS)
-	$(AR) libjpeg.lib r $(COMOBJECTS)
-
-cjpeg$(SUFFIX): $(COBJECTS) libjpeg.lib
-	$(LN) <WITH <
-$(LDFLAGS)
-TO cjpeg$(SUFFIX)
-FROM LIB:c.o $(COBJECTS)
-LIB libjpeg.lib $(LDLIBS)
-<
-
-djpeg$(SUFFIX): $(DOBJECTS) libjpeg.lib
-	$(LN) <WITH <
-$(LDFLAGS)
-TO djpeg$(SUFFIX)
-FROM LIB:c.o $(DOBJECTS)
-LIB libjpeg.lib $(LDLIBS)
-<
-
-jpegtran$(SUFFIX): $(TROBJECTS) libjpeg.lib
-	$(LN) <WITH <
-$(LDFLAGS)
-TO jpegtran$(SUFFIX)
-FROM LIB:c.o $(TROBJECTS)
-LIB libjpeg.lib $(LDLIBS)
-<
-
-rdjpgcom$(SUFFIX): rdjpgcom.o
-	$(LN) <WITH <
-$(LDFLAGS)
-TO rdjpgcom$(SUFFIX)
-FROM LIB:c.o rdjpgcom.o
-LIB $(LDLIBS)
-<
-
-wrjpgcom$(SUFFIX): wrjpgcom.o
-	$(LN) <WITH <
-$(LDFLAGS)
-TO wrjpgcom$(SUFFIX)
-FROM LIB:c.o wrjpgcom.o
-LIB $(LDLIBS)
-<
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean:
-	-$(RM) *.o cjpeg djpeg jpegtran cjpeg.030 djpeg.030 jpegtran.030
-	-$(RM) rdjpgcom wrjpgcom rdjpgcom.030 wrjpgcom.030
-	-$(RM) libjpeg.lib core testout*.*
-
-test: cjpeg djpeg jpegtran
-	-$(RM) testout*.*
-	djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	jpegtran -outfile testoutt.jpg testprog.jpg
-	cmp testimg.ppm testout.ppm
-	cmp testimg.bmp testout.bmp
-	cmp testimg.jpg testout.jpg
-	cmp testimg.ppm testoutp.ppm
-	cmp testimgp.jpg testoutp.jpg
-	cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.unix b/makefile.unix
deleted file mode 100644
index 00455ab..0000000
--- a/makefile.unix
+++ /dev/null

@@ -1,228 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Unix-like systems with non-ANSI compilers.
-# If you have an ANSI compiler, makefile.ansi is a better starting point.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-CFLAGS= -O
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-# However, any special defines for ansi2knr.c may be included here:
-ANSI2KNRFLAGS= 
-
-# Link-time cc options:
-LDFLAGS= 
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= 
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= $(CC)
-# file deletion command
-RM= rm -f
-# file rename command
-MV= mv
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
-        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
-        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
-        cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
-        cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: ansi2knr libjpeg.a cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-# This rule causes ansi2knr to be invoked.
-.c.o:
-	./ansi2knr $*.c T$*.c
-	$(CC) $(CFLAGS) -c T$*.c
-	$(RM) T$*.c $*.o
-	$(MV) T$*.o $*.o
-
-ansi2knr: ansi2knr.c
-	$(CC) $(CFLAGS) $(ANSI2KNRFLAGS) -o ansi2knr ansi2knr.c
-
-libjpeg.a: ansi2knr $(LIBOBJECTS)
-	$(RM) libjpeg.a
-	$(AR) libjpeg.a  $(LIBOBJECTS)
-	$(AR2) libjpeg.a
-
-cjpeg: ansi2knr $(COBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg: ansi2knr $(DOBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran: ansi2knr $(TROBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
-	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
-	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean:
-	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout*
-
-test: cjpeg djpeg jpegtran
-	$(RM) testout*
-	./djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	./jpegtran -outfile testoutt.jpg testprog.jpg
-	cmp testimg.ppm testout.ppm
-	cmp testimg.bmp testout.bmp
-	cmp testimg.jpg testout.jpg
-	cmp testimg.ppm testoutp.ppm
-	cmp testimgp.jpg testoutp.jpg
-	cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.vc b/makefile.vc
deleted file mode 100644
index 2acf069..0000000
--- a/makefile.vc
+++ /dev/null

@@ -1,211 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Microsoft Visual C++ on Windows NT (and 95?).
-# It builds the IJG library as a statically linkable library (.LIB),
-# and builds the sample applications as console-mode apps.
-# Thanks to Xingong Chang, Raymond Everly and others.
-
-# Read installation instructions before saying "nmake" !!
-# To build an optimized library without debug info, say "nmake nodebug=1".
-
-# Pull in standard variable definitions
-!include <win32.mak>
-
-# You may want to adjust these compiler options:
-CFLAGS= $(cflags) $(cdebug) $(cvars) -I.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time options:
-LDFLAGS= $(ldebug) $(conlflags)
-
-# To link any special libraries, add the necessary commands here.
-LDLIBS= $(conlibs)
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For NT we suggest jmemnobs.obj, which expects the OS to
-# provide adequate virtual memory.
-SYSDEPMEM= jmemnobs.obj
-
-# miscellaneous OS-dependent stuff
-# file deletion command
-RM= del
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
-        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
-        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
-        jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
-        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
-        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
-        jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
-        jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
-        rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
-        rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-# Template command for compiling .c to .obj
-.c.obj:
-	$(cc) $(CFLAGS) $*.c
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS)
-	$(RM) libjpeg.lib
-	lib -out:libjpeg.lib  $(LIBOBJECTS)
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)
-
-rdjpgcom.exe: rdjpgcom.obj
-	$(link) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)
-
-wrjpgcom.exe: wrjpgcom.obj
-	$(link) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)
-
-
-clean:
-	$(RM) *.obj *.exe libjpeg.lib
-	$(RM) testout*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
-	$(RM) testout*
-	.\djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	.\djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	.\cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	.\djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	.\cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	.\jpegtran -outfile testoutt.jpg testprog.jpg
-	fc /b testimg.ppm testout.ppm
-	fc /b testimg.bmp testout.bmp
-	fc /b testimg.jpg testout.jpg
-	fc /b testimg.ppm testoutp.ppm
-	fc /b testimgp.jpg testoutp.jpg
-	fc /b testorig.jpg testoutt.jpg
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makefile.vms b/makefile.vms
deleted file mode 100644
index a42358d..0000000
--- a/makefile.vms
+++ /dev/null

@@ -1,142 +0,0 @@
-$! Makefile for Independent JPEG Group's software
-$!
-$! This is a command procedure for Digital VMS systems that do not have MMS.
-$! It builds the JPEG software by brute force, recompiling everything whether
-$! or not it is necessary.  It then runs the basic self-test.
-$! Thanks to Rick Dyson (dyson@iowasp.physics.uiowa.edu)
-$! and Tim Bell (tbell@netcom.com) for their help.
-$!
-$! Read installation instructions before running this!!
-$!
-$ If F$Mode () .eqs. "INTERACTIVE"
-$   Then
-$       VERIFY = F$Verify (0)
-$   Else
-$       VERIFY = F$Verify (1)
-$ EndIf
-$ On Control_Y Then GoTo End
-$ On Error     Then GoTo End
-$
-$ If F$GetSyi ("HW_MODEL") .gt. 1023 
-$   Then
-$       OPT = ""
-$   Else
-$       OPT = ",Sys$Disk:[]makvms.opt/Option"
-$ EndIf
-$ 
-$ DoCompile := CC /NoDebug /Optimize /NoList
-$!
-$ DoCompile jcapimin.c
-$ DoCompile jcapistd.c
-$ DoCompile jctrans.c
-$ DoCompile jcparam.c
-$ DoCompile jdatadst.c
-$ DoCompile jcinit.c
-$ DoCompile jcmaster.c
-$ DoCompile jcmarker.c
-$ DoCompile jcmainct.c
-$ DoCompile jcprepct.c
-$ DoCompile jccoefct.c
-$ DoCompile jccolor.c
-$ DoCompile jcsample.c
-$ DoCompile jchuff.c
-$ DoCompile jcphuff.c
-$ DoCompile jcdctmgr.c
-$ DoCompile jfdctfst.c
-$ DoCompile jfdctflt.c
-$ DoCompile jfdctint.c
-$ DoCompile jdapimin.c
-$ DoCompile jdapistd.c
-$ DoCompile jdtrans.c
-$ DoCompile jdatasrc.c
-$ DoCompile jdmaster.c
-$ DoCompile jdinput.c
-$ DoCompile jdmarker.c
-$ DoCompile jdhuff.c
-$ DoCompile jdphuff.c
-$ DoCompile jdmainct.c
-$ DoCompile jdcoefct.c
-$ DoCompile jdpostct.c
-$ DoCompile jddctmgr.c
-$ DoCompile jidctfst.c
-$ DoCompile jidctflt.c
-$ DoCompile jidctint.c
-$ DoCompile jidctred.c
-$ DoCompile jdsample.c
-$ DoCompile jdcolor.c
-$ DoCompile jquant1.c
-$ DoCompile jquant2.c
-$ DoCompile jdmerge.c
-$ DoCompile jcomapi.c
-$ DoCompile jutils.c
-$ DoCompile jerror.c
-$ DoCompile jmemmgr.c
-$ DoCompile jmemnobs.c
-$!
-$ Library /Create libjpeg.olb  jcapimin.obj,jcapistd.obj,jctrans.obj, -
-          jcparam.obj,jdatadst.obj,jcinit.obj,jcmaster.obj,jcmarker.obj, -
-          jcmainct.obj,jcprepct.obj,jccoefct.obj,jccolor.obj,jcsample.obj, -
-          jchuff.obj,jcphuff.obj,jcdctmgr.obj,jfdctfst.obj,jfdctflt.obj, -
-          jfdctint.obj,jdapimin.obj,jdapistd.obj,jdtrans.obj,jdatasrc.obj, -
-          jdmaster.obj,jdinput.obj,jdmarker.obj,jdhuff.obj,jdphuff.obj, -
-          jdmainct.obj,jdcoefct.obj,jdpostct.obj,jddctmgr.obj,jidctfst.obj, -
-          jidctflt.obj,jidctint.obj,jidctred.obj,jdsample.obj,jdcolor.obj, -
-          jquant1.obj,jquant2.obj,jdmerge.obj,jcomapi.obj,jutils.obj, -
-          jerror.obj,jmemmgr.obj,jmemnobs.obj
-$!
-$ DoCompile cjpeg.c
-$ DoCompile rdppm.c
-$ DoCompile rdgif.c
-$ DoCompile rdtarga.c
-$ DoCompile rdrle.c
-$ DoCompile rdbmp.c
-$ DoCompile rdswitch.c
-$ DoCompile cdjpeg.c
-$!
-$ Link /NoMap /Executable = cjpeg.exe  cjpeg.obj,rdppm.obj,rdgif.obj, -
-          rdtarga.obj,rdrle.obj,rdbmp.obj,rdswitch.obj,cdjpeg.obj,libjpeg.olb/Library'OPT'
-$!
-$ DoCompile djpeg.c
-$ DoCompile wrppm.c
-$ DoCompile wrgif.c
-$ DoCompile wrtarga.c
-$ DoCompile wrrle.c
-$ DoCompile wrbmp.c
-$ DoCompile rdcolmap.c
-$ DoCompile cdjpeg.c
-$!
-$ Link /NoMap /Executable = djpeg.exe  djpeg.obj,wrppm.obj,wrgif.obj, -
-          wrtarga.obj,wrrle.obj,wrbmp.obj,rdcolmap.obj,cdjpeg.obj,libjpeg.olb/Library'OPT'
-$!
-$ DoCompile jpegtran.c
-$ DoCompile rdswitch.c
-$ DoCompile cdjpeg.c
-$ DoCompile transupp.c
-$!
-$ Link /NoMap /Executable = jpegtran.exe  jpegtran.obj,rdswitch.obj, -
-          cdjpeg.obj,transupp.obj,libjpeg.olb/Library'OPT'
-$!
-$ DoCompile rdjpgcom.c
-$ Link /NoMap /Executable = rdjpgcom.exe  rdjpgcom.obj'OPT'
-$!
-$ DoCompile wrjpgcom.c
-$ Link /NoMap /Executable = wrjpgcom.exe  wrjpgcom.obj'OPT'
-$!
-$! Run the self-test
-$!
-$ mcr sys$disk:[]djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
-$ mcr sys$disk:[]djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
-$ mcr sys$disk:[]cjpeg -dct int      -outfile testout.jpg testimg.ppm
-$ mcr sys$disk:[]djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-$ mcr sys$disk:[]cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-$ mcr sys$disk:[]jpegtran -outfile testoutt.jpg testprog.jpg
-$ Backup /Compare/Log testimg.ppm testout.ppm
-$ Backup /Compare/Log testimg.bmp testout.bmp
-$ Backup /Compare/Log testimg.jpg testout.jpg
-$ Backup /Compare/Log testimg.ppm testoutp.ppm
-$ Backup /Compare/Log testimgp.jpg testoutp.jpg
-$ Backup /Compare/Log testorig.jpg testoutt.jpg
-$!
-$End:
-$   If Verify Then Set Verify
-$ Exit

diff --git a/makefile.wat b/makefile.wat
deleted file mode 100644
index d953e46..0000000
--- a/makefile.wat
+++ /dev/null

@@ -1,233 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Watcom C/C++ 10.0 on MS-DOS (using
-# dos4g extender), OS/2, and Windows NT console mode.
-# Thanks to Janos Haide, jhaide@btrvtech.com.
-
-# Read installation instructions before saying "wmake" !!
-
-# Uncomment line for desired system
-SYSTEM=DOS
-#SYSTEM=OS2
-#SYSTEM=NT
-
-# The name of your C compiler:
-CC= wcl386
-
-# You may need to adjust these cc options:
-CFLAGS= -4r -ort -wx -zq -bt=$(SYSTEM)
-# Caution: avoid -ol or -ox; these generate bad code with 10.0 or 10.0a.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-!ifeq SYSTEM DOS
-LDFLAGS= -zq -l=dos4g
-!else ifeq SYSTEM OS2
-LDFLAGS= -zq -l=os2v2
-!else ifeq SYSTEM NT
-LDFLAGS= -zq -l=nt
-!endif
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  jmemnobs should work fine for dos4g or OS/2 environment.
-SYSDEPMEM= jmemnobs.obj
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c &
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c &
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c &
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c &
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c &
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c &
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c &
-        jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c &
-        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c &
-        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h &
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 &
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc &
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc &
-        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds &
-        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st &
-        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms &
-        makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat &
-        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas &
-        jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg &
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) &
-        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj &
-        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj &
-        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj &
-        jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj &
-        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj &
-        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj &
-        jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj &
-        jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj &
-        rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj &
-        rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS)
-	- del libjpeg.lib
-	* wlib -n libjpeg.lib $(LIBOBJECTS)
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
-	$(CC) $(LDFLAGS) $(COBJECTS) libjpeg.lib
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
-	$(CC) $(LDFLAGS) $(DOBJECTS) libjpeg.lib
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
-	$(CC) $(LDFLAGS) $(TROBJECTS) libjpeg.lib
-
-rdjpgcom.exe: rdjpgcom.c
-	$(CC) $(CFLAGS) $(LDFLAGS) rdjpgcom.c
-
-wrjpgcom.exe: wrjpgcom.c
-	$(CC) $(CFLAGS) $(LDFLAGS) wrjpgcom.c
-
-.c.obj:
-	$(CC) $(CFLAGS) -c $<
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-clean: .SYMBOLIC
-	- del *.obj
-	- del libjpeg.lib
-	- del cjpeg.exe
-	- del djpeg.exe
-	- del jpegtran.exe
-	- del rdjpgcom.exe
-	- del wrjpgcom.exe
-	- del testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe  .SYMBOLIC
-	- del testout*.*
-	djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	jpegtran -outfile testoutt.jpg testprog.jpg
-!ifeq SYSTEM DOS
-	fc /b testimg.ppm testout.ppm
-	fc /b testimg.bmp testout.bmp
-	fc /b testimg.jpg testout.jpg
-	fc /b testimg.ppm testoutp.ppm
-	fc /b testimgp.jpg testoutp.jpg
-	fc /b testorig.jpg testoutt.jpg
-!else
-	echo n > n.tmp
-	comp testimg.ppm testout.ppm < n.tmp
-	comp testimg.bmp testout.bmp < n.tmp
-	comp testimg.jpg testout.jpg < n.tmp
-	comp testimg.ppm testoutp.ppm < n.tmp
-	comp testimgp.jpg testoutp.jpg < n.tmp
-	comp testorig.jpg testoutt.jpg < n.tmp
-	del n.tmp
-!endif
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h

diff --git a/makelib.ds b/makelib.ds
deleted file mode 100644
index c7ad36d..0000000
--- a/makelib.ds
+++ /dev/null

@@ -1,1046 +0,0 @@
-# Microsoft Developer Studio Generated NMAKE File, Format Version 4.20
-# ** DO NOT EDIT **
-
-# TARGTYPE "Win32 (x86) Static Library" 0x0104
-
-!IF "$(CFG)" == ""
-CFG=jpeg - Win32
-!MESSAGE No configuration specified.  Defaulting to jpeg - Win32.
-!ENDIF 
-
-!IF "$(CFG)" != "jpeg - Win32"
-!MESSAGE Invalid configuration "$(CFG)" specified.
-!MESSAGE You can specify a configuration when running NMAKE on this makefile
-!MESSAGE by defining the macro CFG on the command line.  For example:
-!MESSAGE 
-!MESSAGE NMAKE /f "jpeg.mak" CFG="jpeg - Win32"
-!MESSAGE 
-!MESSAGE Possible choices for configuration are:
-!MESSAGE 
-!MESSAGE "jpeg - Win32" (based on "Win32 (x86) Static Library")
-!MESSAGE 
-!ERROR An invalid configuration is specified.
-!ENDIF 
-
-!IF "$(OS)" == "Windows_NT"
-NULL=
-!ELSE 
-NULL=nul
-!ENDIF 
-################################################################################
-# Begin Project
-# PROP Target_Last_Scanned "jpeg - Win32"
-CPP=cl.exe
-
-!IF  "$(CFG)" == "jpeg - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "Release"
-# PROP BASE Intermediate_Dir "Release"
-# PROP BASE Target_Dir ""
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "Release"
-# PROP Intermediate_Dir "Release"
-# PROP Target_Dir ""
-OUTDIR=.\Release
-INTDIR=.\Release
-
-ALL : "$(OUTDIR)\jpeg.lib"
-
-CLEAN : 
-	-@erase "$(INTDIR)\jcapimin.obj"
-	-@erase "$(INTDIR)\jcapistd.obj"
-	-@erase "$(INTDIR)\jctrans.obj"
-	-@erase "$(INTDIR)\jcparam.obj"
-	-@erase "$(INTDIR)\jdatadst.obj"
-	-@erase "$(INTDIR)\jcinit.obj"
-	-@erase "$(INTDIR)\jcmaster.obj"
-	-@erase "$(INTDIR)\jcmarker.obj"
-	-@erase "$(INTDIR)\jcmainct.obj"
-	-@erase "$(INTDIR)\jcprepct.obj"
-	-@erase "$(INTDIR)\jccoefct.obj"
-	-@erase "$(INTDIR)\jccolor.obj"
-	-@erase "$(INTDIR)\jcsample.obj"
-	-@erase "$(INTDIR)\jchuff.obj"
-	-@erase "$(INTDIR)\jcphuff.obj"
-	-@erase "$(INTDIR)\jcdctmgr.obj"
-	-@erase "$(INTDIR)\jfdctfst.obj"
-	-@erase "$(INTDIR)\jfdctflt.obj"
-	-@erase "$(INTDIR)\jfdctint.obj"
-	-@erase "$(INTDIR)\jdapimin.obj"
-	-@erase "$(INTDIR)\jdapistd.obj"
-	-@erase "$(INTDIR)\jdtrans.obj"
-	-@erase "$(INTDIR)\jdatasrc.obj"
-	-@erase "$(INTDIR)\jdmaster.obj"
-	-@erase "$(INTDIR)\jdinput.obj"
-	-@erase "$(INTDIR)\jdmarker.obj"
-	-@erase "$(INTDIR)\jdhuff.obj"
-	-@erase "$(INTDIR)\jdphuff.obj"
-	-@erase "$(INTDIR)\jdmainct.obj"
-	-@erase "$(INTDIR)\jdcoefct.obj"
-	-@erase "$(INTDIR)\jdpostct.obj"
-	-@erase "$(INTDIR)\jddctmgr.obj"
-	-@erase "$(INTDIR)\jidctfst.obj"
-	-@erase "$(INTDIR)\jidctflt.obj"
-	-@erase "$(INTDIR)\jidctint.obj"
-	-@erase "$(INTDIR)\jidctred.obj"
-	-@erase "$(INTDIR)\jdsample.obj"
-	-@erase "$(INTDIR)\jdcolor.obj"
-	-@erase "$(INTDIR)\jquant1.obj"
-	-@erase "$(INTDIR)\jquant2.obj"
-	-@erase "$(INTDIR)\jdmerge.obj"
-	-@erase "$(INTDIR)\jcomapi.obj"
-	-@erase "$(INTDIR)\jutils.obj"
-	-@erase "$(INTDIR)\jerror.obj"
-	-@erase "$(INTDIR)\jmemmgr.obj"
-	-@erase "$(INTDIR)\jmemnobs.obj"
-	-@erase "$(OUTDIR)\jpeg.lib"
-
-"$(OUTDIR)" :
-    if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS"\
- /Fp"$(INTDIR)/jpeg.pch" /YX /Fo"$(INTDIR)/" /c 
-CPP_OBJS=.\Release/
-CPP_SBRS=.\.
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/jpeg.bsc" 
-BSC32_SBRS= \
-	
-LIB32=link.exe -lib
-# ADD BASE LIB32 /nologo
-# ADD LIB32 /nologo
-LIB32_FLAGS=/nologo /out:"$(OUTDIR)/jpeg.lib" 
-LIB32_OBJS= \
-	"$(INTDIR)\jcapimin.obj" \
-	"$(INTDIR)\jcapistd.obj" \
-	"$(INTDIR)\jctrans.obj" \
-	"$(INTDIR)\jcparam.obj" \
-	"$(INTDIR)\jdatadst.obj" \
-	"$(INTDIR)\jcinit.obj" \
-	"$(INTDIR)\jcmaster.obj" \
-	"$(INTDIR)\jcmarker.obj" \
-	"$(INTDIR)\jcmainct.obj" \
-	"$(INTDIR)\jcprepct.obj" \
-	"$(INTDIR)\jccoefct.obj" \
-	"$(INTDIR)\jccolor.obj" \
-	"$(INTDIR)\jcsample.obj" \
-	"$(INTDIR)\jchuff.obj" \
-	"$(INTDIR)\jcphuff.obj" \
-	"$(INTDIR)\jcdctmgr.obj" \
-	"$(INTDIR)\jfdctfst.obj" \
-	"$(INTDIR)\jfdctflt.obj" \
-	"$(INTDIR)\jfdctint.obj" \
-	"$(INTDIR)\jdapimin.obj" \
-	"$(INTDIR)\jdapistd.obj" \
-	"$(INTDIR)\jdtrans.obj" \
-	"$(INTDIR)\jdatasrc.obj" \
-	"$(INTDIR)\jdmaster.obj" \
-	"$(INTDIR)\jdinput.obj" \
-	"$(INTDIR)\jdmarker.obj" \
-	"$(INTDIR)\jdhuff.obj" \
-	"$(INTDIR)\jdphuff.obj" \
-	"$(INTDIR)\jdmainct.obj" \
-	"$(INTDIR)\jdcoefct.obj" \
-	"$(INTDIR)\jdpostct.obj" \
-	"$(INTDIR)\jddctmgr.obj" \
-	"$(INTDIR)\jidctfst.obj" \
-	"$(INTDIR)\jidctflt.obj" \
-	"$(INTDIR)\jidctint.obj" \
-	"$(INTDIR)\jidctred.obj" \
-	"$(INTDIR)\jdsample.obj" \
-	"$(INTDIR)\jdcolor.obj" \
-	"$(INTDIR)\jquant1.obj" \
-	"$(INTDIR)\jquant2.obj" \
-	"$(INTDIR)\jdmerge.obj" \
-	"$(INTDIR)\jcomapi.obj" \
-	"$(INTDIR)\jutils.obj" \
-	"$(INTDIR)\jerror.obj" \
-	"$(INTDIR)\jmemmgr.obj" \
-	"$(INTDIR)\jmemnobs.obj"
-
-"$(OUTDIR)\jpeg.lib" : "$(OUTDIR)" $(DEF_FILE) $(LIB32_OBJS)
-    $(LIB32) @<<
-  $(LIB32_FLAGS) $(DEF_FLAGS) $(LIB32_OBJS)
-<<
-
-!ENDIF 
-
-.c{$(CPP_OBJS)}.obj:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cpp{$(CPP_OBJS)}.obj:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cxx{$(CPP_OBJS)}.obj:
-   $(CPP) $(CPP_PROJ) $<  
-
-.c{$(CPP_SBRS)}.sbr:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cpp{$(CPP_SBRS)}.sbr:
-   $(CPP) $(CPP_PROJ) $<  
-
-.cxx{$(CPP_SBRS)}.sbr:
-   $(CPP) $(CPP_PROJ) $<  
-
-################################################################################
-# Begin Target
-
-# Name "jpeg - Win32"
-
-!IF  "$(CFG)" == "jpeg - Win32"
-
-!ENDIF 
-
-################################################################################
-# Begin Source File
-
-SOURCE="jcapimin.c"
-DEP_CPP_JCAPI=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcapimin.obj" : $(SOURCE) $(DEP_CPP_JCAPI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcapistd.c"
-DEP_CPP_JCAPIS=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcapistd.obj" : $(SOURCE) $(DEP_CPP_JCAPIS) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jccoefct.c"
-DEP_CPP_JCCOE=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jccoefct.obj" : $(SOURCE) $(DEP_CPP_JCCOE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jccolor.c"
-DEP_CPP_JCCOL=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jccolor.obj" : $(SOURCE) $(DEP_CPP_JCCOL) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcdctmgr.c"
-DEP_CPP_JCDCT=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jcdctmgr.obj" : $(SOURCE) $(DEP_CPP_JCDCT) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jchuff.c"
-DEP_CPP_JCHUF=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jchuff.h"\
-	
-
-"$(INTDIR)\jchuff.obj" : $(SOURCE) $(DEP_CPP_JCHUF) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcinit.c"
-DEP_CPP_JCINI=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcinit.obj" : $(SOURCE) $(DEP_CPP_JCINI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcmainct.c"
-DEP_CPP_JCMAI=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcmainct.obj" : $(SOURCE) $(DEP_CPP_JCMAI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcmarker.c"
-DEP_CPP_JCMAR=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcmarker.obj" : $(SOURCE) $(DEP_CPP_JCMAR) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcmaster.c"
-DEP_CPP_JCMAS=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcmaster.obj" : $(SOURCE) $(DEP_CPP_JCMAS) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcomapi.c"
-DEP_CPP_JCOMA=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcomapi.obj" : $(SOURCE) $(DEP_CPP_JCOMA) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcparam.c"
-DEP_CPP_JCPAR=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcparam.obj" : $(SOURCE) $(DEP_CPP_JCPAR) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcphuff.c"
-DEP_CPP_JCPHU=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jchuff.h"\
-	
-
-"$(INTDIR)\jcphuff.obj" : $(SOURCE) $(DEP_CPP_JCPHU) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcprepct.c"
-DEP_CPP_JCPRE=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcprepct.obj" : $(SOURCE) $(DEP_CPP_JCPRE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcsample.c"
-DEP_CPP_JCSAM=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jcsample.obj" : $(SOURCE) $(DEP_CPP_JCSAM) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jctrans.c"
-DEP_CPP_JCTRA=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jctrans.obj" : $(SOURCE) $(DEP_CPP_JCTRA) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdapimin.c"
-DEP_CPP_JDAPI=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdapimin.obj" : $(SOURCE) $(DEP_CPP_JDAPI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdapistd.c"
-DEP_CPP_JDAPIS=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdapistd.obj" : $(SOURCE) $(DEP_CPP_JDAPIS) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdatadst.c"
-DEP_CPP_JDATA=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdatadst.obj" : $(SOURCE) $(DEP_CPP_JDATA) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdatasrc.c"
-DEP_CPP_JDATAS=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdatasrc.obj" : $(SOURCE) $(DEP_CPP_JDATAS) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdcoefct.c"
-DEP_CPP_JDCOE=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdcoefct.obj" : $(SOURCE) $(DEP_CPP_JDCOE) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdcolor.c"
-DEP_CPP_JDCOL=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdcolor.obj" : $(SOURCE) $(DEP_CPP_JDCOL) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jddctmgr.c"
-DEP_CPP_JDDCT=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jddctmgr.obj" : $(SOURCE) $(DEP_CPP_JDDCT) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdhuff.c"
-DEP_CPP_JDHUF=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdhuff.h"\
-	
-
-"$(INTDIR)\jdhuff.obj" : $(SOURCE) $(DEP_CPP_JDHUF) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdinput.c"
-DEP_CPP_JDINP=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdinput.obj" : $(SOURCE) $(DEP_CPP_JDINP) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmainct.c"
-DEP_CPP_JDMAI=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdmainct.obj" : $(SOURCE) $(DEP_CPP_JDMAI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmarker.c"
-DEP_CPP_JDMAR=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdmarker.obj" : $(SOURCE) $(DEP_CPP_JDMAR) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmaster.c"
-DEP_CPP_JDMAS=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdmaster.obj" : $(SOURCE) $(DEP_CPP_JDMAS) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmerge.c"
-DEP_CPP_JDMER=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdmerge.obj" : $(SOURCE) $(DEP_CPP_JDMER) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdphuff.c"
-DEP_CPP_JDPHU=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdhuff.h"\
-	
-
-"$(INTDIR)\jdphuff.obj" : $(SOURCE) $(DEP_CPP_JDPHU) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdpostct.c"
-DEP_CPP_JDPOS=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdpostct.obj" : $(SOURCE) $(DEP_CPP_JDPOS) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdsample.c"
-DEP_CPP_JDSAM=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdsample.obj" : $(SOURCE) $(DEP_CPP_JDSAM) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdtrans.c"
-DEP_CPP_JDTRA=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jdtrans.obj" : $(SOURCE) $(DEP_CPP_JDTRA) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jerror.c"
-DEP_CPP_JERRO=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jversion.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jerror.obj" : $(SOURCE) $(DEP_CPP_JERRO) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jfdctflt.c"
-DEP_CPP_JFDCT=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jfdctflt.obj" : $(SOURCE) $(DEP_CPP_JFDCT) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jfdctfst.c"
-DEP_CPP_JFDCTF=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jfdctfst.obj" : $(SOURCE) $(DEP_CPP_JFDCTF) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jfdctint.c"
-DEP_CPP_JFDCTI=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jfdctint.obj" : $(SOURCE) $(DEP_CPP_JFDCTI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctflt.c"
-DEP_CPP_JIDCT=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jidctflt.obj" : $(SOURCE) $(DEP_CPP_JIDCT) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctfst.c"
-DEP_CPP_JIDCTF=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jidctfst.obj" : $(SOURCE) $(DEP_CPP_JIDCTF) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctint.c"
-DEP_CPP_JIDCTI=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jidctint.obj" : $(SOURCE) $(DEP_CPP_JIDCTI) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctred.c"
-DEP_CPP_JIDCTR=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jdct.h"\
-	
-
-"$(INTDIR)\jidctred.obj" : $(SOURCE) $(DEP_CPP_JIDCTR) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jquant1.c"
-DEP_CPP_JQUAN=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jquant1.obj" : $(SOURCE) $(DEP_CPP_JQUAN) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jquant2.c"
-DEP_CPP_JQUANT=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jquant2.obj" : $(SOURCE) $(DEP_CPP_JQUANT) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jutils.c"
-DEP_CPP_JUTIL=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	
-
-"$(INTDIR)\jutils.obj" : $(SOURCE) $(DEP_CPP_JUTIL) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jmemmgr.c"
-DEP_CPP_JMEMM=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jmemsys.h"\
-	
-
-"$(INTDIR)\jmemmgr.obj" : $(SOURCE) $(DEP_CPP_JMEMM) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jmemnobs.c"
-DEP_CPP_JMEMN=\
-	"jinclude.h"\
-	"jconfig.h"\
-	"jpeglib.h"\
-	"jmorecfg.h"\
-	"jpegint.h"\
-	"jerror.h"\
-	"jmemsys.h"\
-	
-
-"$(INTDIR)\jmemnobs.obj" : $(SOURCE) $(DEP_CPP_JMEMN) "$(INTDIR)"
-   $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-# End Project
-################################################################################
-

diff --git a/makeproj.mac b/makeproj.mac
deleted file mode 100644
index ed277c8..0000000
--- a/makeproj.mac
+++ /dev/null

@@ -1,213 +0,0 @@
---
--- makeproj.mac
---
--- This AppleScript builds Code Warrior PRO Release 2 project files for the
--- libjpeg library as well as the test programs 'cjpeg', 'djpeg', 'jpegtran'.
--- (We'd distribute real project files, except they're not text
--- and would create maintenance headaches.)
---
--- The script then compiles and links the library and the test programs.
--- NOTE: if you haven't already created a 'jconfig.h' file, the script
--- automatically copies 'jconfig.mac' to 'jconfig.h'.
---
--- To use this script, you must have AppleScript 1.1 or later installed
--- and a suitable AppleScript editor like Script Editor or Script Debugger
--- (http://www.latenightsw.com). Open this file with your AppleScript
--- editor and execute the "run" command to build the projects.
---
--- Thanks to Dan Sears and Don Agro for this script.
--- Questions about this script can be addressed to dogpark@interlog.com
---
-
-on run
-
-	choose folder with prompt ">>> Select IJG source folder <<<"
-	set ijg_folder to result
-
-	choose folder with prompt ">>> Select MetroWerks folder <<<"
-	set cw_folder to result
-
-	-- if jconfig.h doesn't already exist, copy jconfig.mac
-
-	tell application "Finder"
-		if not (exists file "jconfig.h" of ijg_folder) then
-			duplicate {file "jconfig.mac" of folder ijg_folder}
-			select file "jconfig.mac copy" of folder ijg_folder
-			set name of selection to "jconfig.h"
-		end if
-	end tell
-
-	tell application "CodeWarrior IDE 2.1"
-	  with timeout of 10000 seconds
-
-		-- create libjpeg project
-
-		activate
-		Create Project (ijg_folder as string) & "libjpeg.proj"
-		Set Preferences of panel "Target Settings" to {Target Name:"libjpeg"}
-		Set Preferences of panel "PPC Project" to {File Name:"libjpeg"}
-		Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
-		Set Preferences of panel "PPC Project" to {Project Type:library}
-		Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
-		Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
-		Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
-		Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
-		Add Files (ijg_folder as string) & "jcapimin.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcapistd.c" To Segment 1
-		Add Files (ijg_folder as string) & "jctrans.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcparam.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdatadst.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcinit.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcmaster.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcmarker.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcmainct.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcprepct.c" To Segment 1
-		Add Files (ijg_folder as string) & "jccoefct.c" To Segment 1
-		Add Files (ijg_folder as string) & "jccolor.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcsample.c" To Segment 1
-		Add Files (ijg_folder as string) & "jchuff.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcphuff.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcdctmgr.c" To Segment 1
-		Add Files (ijg_folder as string) & "jfdctfst.c" To Segment 1
-		Add Files (ijg_folder as string) & "jfdctflt.c" To Segment 1
-		Add Files (ijg_folder as string) & "jfdctint.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdapimin.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdapistd.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdtrans.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdatasrc.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdmaster.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdinput.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdmarker.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdhuff.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdphuff.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdmainct.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdcoefct.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdpostct.c" To Segment 1
-		Add Files (ijg_folder as string) & "jddctmgr.c" To Segment 1
-		Add Files (ijg_folder as string) & "jidctfst.c" To Segment 1
-		Add Files (ijg_folder as string) & "jidctflt.c" To Segment 1
-		Add Files (ijg_folder as string) & "jidctint.c" To Segment 1
-		Add Files (ijg_folder as string) & "jidctred.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdsample.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdcolor.c" To Segment 1
-		Add Files (ijg_folder as string) & "jquant1.c" To Segment 1
-		Add Files (ijg_folder as string) & "jquant2.c" To Segment 1
-		Add Files (ijg_folder as string) & "jdmerge.c" To Segment 1
-		Add Files (ijg_folder as string) & "jcomapi.c" To Segment 1
-		Add Files (ijg_folder as string) & "jutils.c" To Segment 1
-		Add Files (ijg_folder as string) & "jerror.c" To Segment 1
-		Add Files (ijg_folder as string) & "jmemmgr.c" To Segment 1
-		Add Files (ijg_folder as string) & "jmemmac.c" To Segment 1
-
-		-- compile and link the library
-
-		Make Project
-		Close Project
-
-		-- create cjpeg project
-
-		activate
-		Create Project (ijg_folder as string) & "cjpeg.proj"
-		Set Preferences of panel "Target Settings" to {Target Name:"cjpeg"}
-		Set Preferences of panel "PPC Project" to {File Name:"cjpeg"}
-		Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
-		Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
-		Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
-		Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
-		Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
-		Add Files (ijg_folder as string) & "cjpeg.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdppm.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdgif.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdtarga.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdrle.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdbmp.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdswitch.c" To Segment 1
-		Add Files (ijg_folder as string) & "cdjpeg.c" To Segment 1
-
-		Add Files (ijg_folder as string) & "libjpeg" To Segment 2
-
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL C.PPC.Lib" To Segment 3
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL SIOUX.PPC.Lib" To Segment 3
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:Runtime:Runtime PPC:MSL RuntimePPC.Lib" To Segment 3
-
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:InterfaceLib" To Segment 4
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:MathLib" To Segment 4
-
-		-- compile and link cjpeg
-
-		Make Project
-		Close Project
-
-		-- create djpeg project
-
-		activate
-		Create Project (ijg_folder as string) & "djpeg.proj"
-		Set Preferences of panel "Target Settings" to {Target Name:"djpeg"}
-		Set Preferences of panel "PPC Project" to {File Name:"djpeg"}
-		Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
-		Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
-		Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
-		Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
-		Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
-		Add Files (ijg_folder as string) & "djpeg.c" To Segment 1
-		Add Files (ijg_folder as string) & "wrppm.c" To Segment 1
-		Add Files (ijg_folder as string) & "wrgif.c" To Segment 1
-		Add Files (ijg_folder as string) & "wrtarga.c" To Segment 1
-		Add Files (ijg_folder as string) & "wrrle.c" To Segment 1
-		Add Files (ijg_folder as string) & "wrbmp.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdcolmap.c" To Segment 1
-		Add Files (ijg_folder as string) & "cdjpeg.c" To Segment 1
-
-		Add Files (ijg_folder as string) & "libjpeg" To Segment 2
-
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL C.PPC.Lib" To Segment 3
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL SIOUX.PPC.Lib" To Segment 3
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:Runtime:Runtime PPC:MSL RuntimePPC.Lib" To Segment 3
-
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:InterfaceLib" To Segment 4
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:MathLib" To Segment 4
-
-		-- compile and link djpeg
-
-		Make Project
-		Close Project
-
-		-- create jpegtran project
-
-		activate
-		Create Project (ijg_folder as string) & "jpegtran.proj"
-		Set Preferences of panel "Target Settings" to {Target Name:"jpegtran"}
-		Set Preferences of panel "PPC Project" to {File Name:"jpegtran"}
-		Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
-		Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
-		Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
-		Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
-		Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
-		Add Files (ijg_folder as string) & "jpegtran.c" To Segment 1
-		Add Files (ijg_folder as string) & "rdswitch.c" To Segment 1
-		Add Files (ijg_folder as string) & "cdjpeg.c" To Segment 1
-		Add Files (ijg_folder as string) & "transupp.c" To Segment 1
-
-		Add Files (ijg_folder as string) & "libjpeg" To Segment 2
-
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL C.PPC.Lib" To Segment 3
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL SIOUX.PPC.Lib" To Segment 3
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:Runtime:Runtime PPC:MSL RuntimePPC.Lib" To Segment 3
-
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:InterfaceLib" To Segment 4
-		Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:MathLib" To Segment 4
-
-		-- compile and link jpegtran
-
-		Make Project
-		Close Project
-
-		quit
-
-	  end timeout
-	end tell
-end run

diff --git a/makljpeg.st b/makljpeg.st
deleted file mode 100644
index 813493e..0000000
--- a/makljpeg.st
+++ /dev/null

@@ -1,70 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to libjpeg.prj.
-; Read installation instructions before trying to make the program!
-;
-;
-;      * * * Output file * * *
-libjpeg.lib
-;
-; * * * COMPILER OPTIONS * * *  
-.C[-P]        ; absolute calls
-.C[-M]        ; and no string merging, folks
-.C[-w-cln]    ; no "constant is long" warnings
-.C[-w-par]    ; no "parameter xxxx unused"
-.C[-w-rch]    ; no "unreachable code"
-.C[-wsig]     ; warn if significant digits may be lost
-.L[-J]        ; link new Obj-format (so we get a library)
-=
-; * * * * List of modules * * * * 
-jcapimin.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcapistd.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jccoefct.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jccolor.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcdctmgr.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jchuff.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jchuff.h)
-jcinit.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcmainct.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcmarker.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcmaster.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcomapi.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcparam.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcphuff.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jchuff.h)
-jcprepct.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcsample.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jctrans.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdapimin.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdapistd.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdatadst.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h)
-jdatasrc.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h)
-jdcoefct.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdcolor.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jddctmgr.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jdhuff.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdhuff.h)
-jdinput.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmainct.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmarker.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmaster.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmerge.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdphuff.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdhuff.h)
-jdpostct.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdsample.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdtrans.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jerror.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jversion.h,jerror.h)
-jfdctflt.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jfdctfst.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jfdctint.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctflt.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctfst.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctint.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctred.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jquant1.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jquant2.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jutils.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jmemmgr.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jmemsys.h)
-jmemansi.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jmemsys.h)

diff --git a/maktjpeg.st b/maktjpeg.st
deleted file mode 100644
index 31f4d16..0000000
--- a/maktjpeg.st
+++ /dev/null

@@ -1,32 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to jpegtran.prj.
-; If you are using Turbo C, change filenames beginning with "pc..." to "tc..."
-; Read installation instructions before trying to make the program!
-;
-;
-;      * * * Output file * * *
-jpegtran.ttp
-;
-; * * * COMPILER OPTIONS * * *  
-.C[-P]        ; absolute calls
-.C[-M]        ; and no string merging, folks
-.C[-w-cln]    ; no "constant is long" warnings
-.C[-w-par]    ; no "parameter xxxx unused"
-.C[-w-rch]    ; no "unreachable code"
-.C[-wsig]     ; warn if significant digits may be lost
-=
-; * * * * List of modules * * * * 
-pcstart.o
-jpegtran.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h,transupp.h,jversion.h)
-cdjpeg.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdswitch.c	(cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-transupp.c	(jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,transupp.h)
-libjpeg.lib        ; built by libjpeg.prj
-pcstdlib.lib       ; standard library
-pcextlib.lib       ; extended library

diff --git a/makvms.opt b/makvms.opt
deleted file mode 100644
index 675e8fe..0000000
--- a/makvms.opt
+++ /dev/null

@@ -1,4 +0,0 @@
-! A pointer to the VAX/VMS C Run-Time Shareable Library.
-! This file is needed by makefile.mms and makefile.vms,
-! but only for the older VAX C compiler.  DEC C does not need it.
-Sys$Library:VAXCRTL.EXE /Share

diff --git a/rrtimer.h b/rrtimer.h
new file mode 100644
index 0000000..4db5e37
--- /dev/null
+++ b/rrtimer.h

@@ -0,0 +1,114 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
+#ifndef __RRTIMER_H__
+#define __RRTIMER_H__
+
+#ifdef __cplusplus
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+class rrtimer
+{
+	public:
+
+		rrtimer(void) : t1(0.0)
+		{
+			#ifdef _WIN32
+			highres=false;  tick=0.001;
+			LARGE_INTEGER Frequency;
+			if(QueryPerformanceFrequency(&Frequency)!=0)
+			{
+				tick=(double)1.0/(double)(Frequency.QuadPart);
+				highres=true;
+			}
+			#endif
+		}
+
+		void start(void)
+		{
+			t1=time();
+		}
+
+		double time(void)
+		{
+			#ifdef _WIN32
+			if(highres)
+			{
+				LARGE_INTEGER Time;
+				QueryPerformanceCounter(&Time);
+				return((double)(Time.QuadPart)*tick);
+			}
+			else
+				return((double)GetTickCount()*tick);
+			#else
+			struct timeval __tv;
+			gettimeofday(&__tv, (struct timezone *)NULL);
+			return((double)(__tv.tv_sec)+(double)(__tv.tv_usec)*0.000001);
+			#endif
+		}
+
+		double elapsed(void)
+		{
+			return time()-t1;
+		}
+
+	private:
+
+		#ifdef _WIN32
+		bool highres;  double tick;
+		#endif
+		double t1;
+};
+
+#endif  // __cplusplus
+
+#ifdef _WIN32
+
+#include <windows.h>
+
+__inline double rrtime(void)
+{
+	LARGE_INTEGER Frequency, Time;
+	if(QueryPerformanceFrequency(&Frequency)!=0)
+	{
+		QueryPerformanceCounter(&Time);
+		return (double)Time.QuadPart/(double)Frequency.QuadPart;
+	}
+	else return (double)GetTickCount()*0.001;
+}
+
+#else
+
+#include <sys/time.h>
+
+#ifdef sun
+#define __inline inline
+#endif
+
+static __inline double rrtime(void)
+{
+	struct timeval __tv;
+	gettimeofday(&__tv, (struct timezone *)NULL);
+	return((double)__tv.tv_sec+(double)__tv.tv_usec*0.000001);
+}
+
+#endif
+
+#endif
+

diff --git a/rrutil.h b/rrutil.h
new file mode 100644
index 0000000..4918120
--- /dev/null
+++ b/rrutil.h

@@ -0,0 +1,81 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
+#ifndef __RRUTIL_H__
+#define __RRUTIL_H__
+
+#ifdef _WIN32
+	#include <windows.h>
+	#define sleep(t) Sleep((t)*1000)
+	#define usleep(t) Sleep((t)/1000)
+#else
+	#include <unistd.h>
+	#define stricmp strcasecmp
+	#define strnicmp strncasecmp
+#endif
+
+#ifndef min
+ #define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef max
+ #define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#define pow2(i) (1<<(i))
+#define isPow2(x) (((x)&(x-1))==0)
+
+#ifdef sgi
+#define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
+#endif
+
+#ifdef sun
+#define __inline inline
+#endif
+
+static __inline int numprocs(void)
+{
+	#ifdef _WIN32
+	DWORD ProcAff, SysAff, i;  int count=0;
+	if(!GetProcessAffinityMask(GetCurrentProcess(), &ProcAff, &SysAff)) return(1);
+	for(i=0; i<32; i++) if(ProcAff&(1<<i)) count++;
+	return(count);
+	#elif defined (__APPLE__)
+	return(1);
+	#else
+	long count=1;
+	if((count=sysconf(_SC_NPROCESSORS_CONF))!=-1) return((int)count);
+	else return(1);
+	#endif
+}
+
+#define byteswap(i) ( \
+	(((i) & 0xff000000) >> 24) | \
+	(((i) & 0x00ff0000) >>  8) | \
+	(((i) & 0x0000ff00) <<  8) | \
+	(((i) & 0x000000ff) << 24) )
+
+#define byteswap16(i) ( \
+	(((i) & 0xff00) >> 8) | \
+	(((i) & 0x00ff) << 8) )
+
+static __inline int littleendian(void)
+{
+	unsigned int value=1;
+	unsigned char *ptr=(unsigned char *)(&value);
+	if(ptr[0]==1 && ptr[3]==0) return 1;
+	else return 0;
+}
+
+#endif

diff --git a/simd/Makefile.am b/simd/Makefile.am
new file mode 100644
index 0000000..1df6143
--- /dev/null
+++ b/simd/Makefile.am

@@ -0,0 +1,57 @@
+noinst_LTLIBRARIES = libsimd.la
+
+BUILT_SOURCES = jsimdcfg.inc
+
+EXTRA_DIST = nasm_lt.sh jcclrmmx.asm jcclrss2.asm jdclrmmx.asm jdclrss2.asm \
+	jdmrgmmx.asm jdmrgss2.asm jcclrss2-64.asm jdclrss2-64.asm \
+	jdmrgss2-64.asm
+
+if SIMD_X86_64
+
+libsimd_la_SOURCES = jsimd_x86_64.c \
+	jsimd.h jsimdcfg.inc.h \
+	jsimdext.inc jcolsamp.inc jdct.inc \
+	jfsseflt-64.asm \
+	jccolss2-64.asm jdcolss2-64.asm \
+	jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
+	jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
+	jiss2red-64.asm jiss2int-64.asm jiss2fst-64.asm \
+	jcqnts2f-64.asm jiss2flt-64.asm
+
+jccolss2-64.lo: jcclrss2-64.asm
+jdcolss2-64.lo: jdclrss2-64.asm
+jdmerss2-64.lo: jdmrgss2-64.asm
+endif
+
+if SIMD_I386
+
+libsimd_la_SOURCES = jsimd_i386.c \
+	jsimd.h jsimdcfg.inc.h \
+	jsimdext.inc jcolsamp.inc jdct.inc \
+	jsimdcpu.asm \
+	jccolmmx.asm jdcolmmx.asm \
+	jcsammmx.asm jdsammmx.asm jdmermmx.asm \
+	jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \
+	jimmxred.asm jimmxint.asm jimmxfst.asm \
+	jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \
+	jcqntsse.asm jfsseflt.asm jisseflt.asm \
+	jccolss2.asm jdcolss2.asm \
+	jcsamss2.asm jdsamss2.asm jdmerss2.asm \
+	jcqnts2i.asm jfss2fst.asm jfss2int.asm \
+	jiss2red.asm jiss2int.asm jiss2fst.asm \
+	jcqnts2f.asm jiss2flt.asm
+
+jccolmmx.lo: jcclrmmx.asm
+jccolss2.lo: jcclrss2.asm
+jdcolmmx.lo: jdclrmmx.asm
+jdcolss2.lo: jdclrss2.asm
+jdmermmx.lo: jdmrgmmx.asm
+jdmerss2.lo: jdmrgss2.asm
+endif
+
+.asm.lo:
+	$(LIBTOOL) --mode=compile --tag NASM ./nasm_lt.sh $(NASM) $(NAFLAGS) $< -o $@
+
+jsimdcfg.inc: jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h
+	$(CPP) jsimdcfg.inc.h | $(EGREP) ^[\;%] | sed 's%_cpp_protection_%%' > $@
+

diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm
new file mode 100644
index 0000000..b6b8912
--- /dev/null
+++ b/simd/jcclrmmx.asm

@@ -0,0 +1,479 @@
+;
+; jcclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
+;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                           JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	xor	eax,eax
+	mov	al, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	xor	edx,edx
+	mov	dx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	mmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	mmG, DWORD [esi+ecx]
+	psllq	mmA, DWORD_BIT
+	por	mmA,mmG
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	movq	mmG,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	mov	ecx, SIZEOF_MMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld16:
+	test	cl, 2*SIZEOF_MMWORD
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 01 11 21 02 12)
+	; mmG=(22 03 13 23 04 14 24 05)
+	; mmF=(15 25 06 16 26 07 17 27)
+
+	movq      mmD,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
+	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
+
+	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
+	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
+
+	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
+	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
+
+	movq      mmE,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
+	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
+
+	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
+
+	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
+	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
+
+	pxor      mmH,mmH
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
+
+	movq      mmB,mmE
+	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
+	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
+
+	movq      mmF,mmD
+	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
+	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_MMWORD/8
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_MMWORD/8
+	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_MMWORD/4
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_MMWORD/4
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+	test	cl, SIZEOF_MMWORD/2
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmD,mmA
+	movq	mmC,mmF
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 30 01 11 21 31)
+	; mmF=(02 12 22 32 03 13 23 33)
+	; mmD=(04 14 24 34 05 15 25 35)
+	; mmC=(06 16 26 36 07 17 27 37)
+
+	movq      mmB,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
+	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
+
+	movq      mmG,mmD
+	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
+	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
+
+	movq      mmE,mmA
+	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
+
+	movq      mmH,mmB
+	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
+	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
+
+	pxor      mmF,mmF
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
+
+	movq      mmD,mmB
+	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
+	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
+
+	movq      mmG,mmE
+	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
+	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
+
+	punpcklbw mmF,mmH
+	punpckhbw mmH,mmH
+	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
+	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
+	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
+	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
+	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
+
+	movq      mm6,mm1
+	punpcklwd mm1,mm3
+	punpckhwd mm6,mm3
+	movq      mm7,mm1
+	movq      mm4,mm6
+	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      mm1,mm1
+	pxor      mm6,mm6
+	punpcklwd mm1,mm5		; mm1=BOL
+	punpckhwd mm6,mm5		; mm6=BOH
+	psrld     mm1,1			; mm1=BOL*FIX(0.500)
+	psrld     mm6,1			; mm6=BOH*FIX(0.500)
+
+	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm1
+	paddd     mm4,mm6
+	paddd     mm7,mm5
+	paddd     mm4,mm5
+	psrld     mm7,SCALEBITS		; mm7=CbOL
+	psrld     mm4,SCALEBITS		; mm4=CbOH
+	packssdw  mm7,mm4		; mm7=CbO
+
+	movq      mm1, MMWORD [wk(2)]	; mm1=BE
+
+	movq      mm6,mm0
+	punpcklwd mm0,mm2
+	punpckhwd mm6,mm2
+	movq      mm5,mm0
+	movq      mm4,mm6
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      mm0,mm0
+	pxor      mm6,mm6
+	punpcklwd mm0,mm1		; mm0=BEL
+	punpckhwd mm6,mm1		; mm6=BEH
+	psrld     mm0,1			; mm0=BEL*FIX(0.500)
+	psrld     mm6,1			; mm6=BEH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm5,mm0
+	paddd     mm4,mm6
+	paddd     mm5,mm1
+	paddd     mm4,mm1
+	psrld     mm5,SCALEBITS		; mm5=CbEL
+	psrld     mm4,SCALEBITS		; mm4=CbEH
+	packssdw  mm5,mm4		; mm5=CbE
+
+	psllw     mm7,BYTE_BIT
+	por       mm5,mm7		; mm5=Cb
+	movq      MMWORD [ebx], mm5	; Save Cb
+
+	movq      mm0, MMWORD [wk(3)]	; mm0=BO
+	movq      mm6, MMWORD [wk(2)]	; mm6=BE
+	movq      mm1, MMWORD [wk(1)]	; mm1=RO
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm3
+	punpckhwd mm4,mm3
+	movq      mm7,mm0
+	movq      mm5,mm4
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
+
+	paddd     mm0, MMWORD [wk(4)]
+	paddd     mm4, MMWORD [wk(5)]
+	paddd     mm0,mm3
+	paddd     mm4,mm3
+	psrld     mm0,SCALEBITS		; mm0=YOL
+	psrld     mm4,SCALEBITS		; mm4=YOH
+	packssdw  mm0,mm4		; mm0=YO
+
+	pxor      mm3,mm3
+	pxor      mm4,mm4
+	punpcklwd mm3,mm1		; mm3=ROL
+	punpckhwd mm4,mm1		; mm4=ROH
+	psrld     mm3,1			; mm3=ROL*FIX(0.500)
+	psrld     mm4,1			; mm4=ROH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm3
+	paddd     mm5,mm4
+	paddd     mm7,mm1
+	paddd     mm5,mm1
+	psrld     mm7,SCALEBITS		; mm7=CrOL
+	psrld     mm5,SCALEBITS		; mm5=CrOH
+	packssdw  mm7,mm5		; mm7=CrO
+
+	movq      mm3, MMWORD [wk(0)]	; mm3=RE
+
+	movq      mm4,mm6
+	punpcklwd mm6,mm2
+	punpckhwd mm4,mm2
+	movq      mm1,mm6
+	movq      mm5,mm4
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
+
+	paddd     mm6, MMWORD [wk(6)]
+	paddd     mm4, MMWORD [wk(7)]
+	paddd     mm6,mm2
+	paddd     mm4,mm2
+	psrld     mm6,SCALEBITS		; mm6=YEL
+	psrld     mm4,SCALEBITS		; mm4=YEH
+	packssdw  mm6,mm4		; mm6=YE
+
+	psllw     mm0,BYTE_BIT
+	por       mm6,mm0		; mm6=Y
+	movq      MMWORD [edi], mm6	; Save Y
+
+	pxor      mm2,mm2
+	pxor      mm4,mm4
+	punpcklwd mm2,mm3		; mm2=REL
+	punpckhwd mm4,mm3		; mm4=REH
+	psrld     mm2,1			; mm2=REL*FIX(0.500)
+	psrld     mm4,1			; mm4=REH*FIX(0.500)
+
+	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+	paddd     mm1,mm2
+	paddd     mm5,mm4
+	paddd     mm1,mm0
+	paddd     mm5,mm0
+	psrld     mm1,SCALEBITS		; mm1=CrEL
+	psrld     mm5,SCALEBITS		; mm5=CrEH
+	packssdw  mm1,mm5		; mm1=CrE
+
+	psllw     mm7,BYTE_BIT
+	por       mm1,mm7		; mm1=Cr
+	movq      MMWORD [edx], mm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_MMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
+	add	edi, byte SIZEOF_MMWORD			; outptr0
+	add	ebx, byte SIZEOF_MMWORD			; outptr1
+	add	edx, byte SIZEOF_MMWORD			; outptr2
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcclrss2-64.asm b/simd/jcclrss2-64.asm
new file mode 100644
index 0000000..31c5be6
--- /dev/null
+++ b/simd/jcclrss2-64.asm

@@ -0,0 +1,487 @@
+;
+; jcclrss2.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8
+
+	align	16
+
+	global	EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	push	rbx
+	collect_args
+
+	mov	rcx, r10
+	test	rcx,rcx
+	jz	near .return
+
+	push	rcx
+
+	mov rsi, r12
+	mov rcx, r13
+	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+	mov	rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+	mov	rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+	pop	rcx
+
+	mov rsi, r11
+	mov	rax, r14
+	test	rax,rax
+	jle	near .return
+.rowloop:
+	push	rdx
+	push	rbx
+	push	rdi
+	push	rsi
+	push	rcx			; col
+
+	mov	rsi, JSAMPROW [rsi]	; inptr
+	mov	rdi, JSAMPROW [rdi]	; outptr0
+	mov	rbx, JSAMPROW [rbx]	; outptr1
+	mov	rdx, JSAMPROW [rdx]	; outptr2
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	rax
+	push	rdx
+	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_BYTE
+	movzx	rax, BYTE [rsi+rcx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_WORD
+	movzx	rdx, WORD [rsi+rcx]
+	shl	rax, WORD_BIT
+	or	rax,rdx
+.column_ld4:
+	movd	xmmA,eax
+	pop	rdx
+	pop	rax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	rcx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	mov	rcx, SIZEOF_XMMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	rcx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA
+	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	rcx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [rbx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]
+	paddd     xmm4, XMMWORD [wk(5)]
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]
+	paddd     xmm4, XMMWORD [wk(7)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [rdi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [rdx], xmm1	; Save Cr
+
+	sub	rcx, byte SIZEOF_XMMWORD
+	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte SIZEOF_XMMWORD		; outptr0
+	add	rbx, byte SIZEOF_XMMWORD		; outptr1
+	add	rdx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	rcx,rcx
+	jnz	near .column_ld1
+
+	pop	rcx			; col
+	pop	rsi
+	pop	rdi
+	pop	rbx
+	pop	rdx
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
+	add	rdi, byte SIZEOF_JSAMPROW
+	add	rbx, byte SIZEOF_JSAMPROW
+	add	rdx, byte SIZEOF_JSAMPROW
+	dec	rax				; num_rows
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbx
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm
new file mode 100644
index 0000000..8def718
--- /dev/null
+++ b/simd/jcclrss2.asm

@@ -0,0 +1,505 @@
+;
+; jcclrss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+
+	global	EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	movzx	eax, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	movzx	edx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	xmmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	ecx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA
+	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [ebx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]
+	paddd     xmm4, XMMWORD [wk(5)]
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]
+	paddd     xmm4, XMMWORD [wk(7)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [edi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [edx], xmm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_XMMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	edi, byte SIZEOF_XMMWORD		; outptr0
+	add	ebx, byte SIZEOF_XMMWORD		; outptr1
+	add	edx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm
new file mode 100644
index 0000000..5e7f3be
--- /dev/null
+++ b/simd/jccolmmx.asm

@@ -0,0 +1,120 @@
+;
+; jccolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"

diff --git a/simd/jccolss2-64.asm b/simd/jccolss2-64.asm
new file mode 100644
index 0000000..a419d1b
--- /dev/null
+++ b/simd/jccolss2-64.asm

@@ -0,0 +1,117 @@
+;
+; jccolss2.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"

diff --git a/simd/jccolss2.asm b/simd/jccolss2.asm
new file mode 100644
index 0000000..8d1f734
--- /dev/null
+++ b/simd/jccolss2.asm

@@ -0,0 +1,117 @@
+;
+; jccolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2.asm"

diff --git a/simd/jcolsamp.inc b/simd/jcolsamp.inc
new file mode 100644
index 0000000..79751b7
--- /dev/null
+++ b/simd/jcolsamp.inc

@@ -0,0 +1,105 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define  mmA  mm0
+%define  mmB  mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%elif RGB_GREEN == 0
+%define  mmA  mm2
+%define  mmB  mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%elif RGB_BLUE == 0
+%define  mmA  mm4
+%define  mmB  mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%else
+%define  mmA  mm6
+%define  mmB  mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%endif
+
+%if RGB_RED == 1
+%define  mmC  mm0
+%define  mmD  mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%elif RGB_GREEN == 1
+%define  mmC  mm2
+%define  mmD  mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%elif RGB_BLUE == 1
+%define  mmC  mm4
+%define  mmD  mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%else
+%define  mmC  mm6
+%define  mmD  mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%endif
+
+%if RGB_RED == 2
+%define  mmE  mm0
+%define  mmF  mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%elif RGB_GREEN == 2
+%define  mmE  mm2
+%define  mmF  mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%elif RGB_BLUE == 2
+%define  mmE  mm4
+%define  mmF  mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%else
+%define  mmE  mm6
+%define  mmF  mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%endif
+
+%if RGB_RED == 3
+%define  mmG  mm0
+%define  mmH  mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%elif RGB_GREEN == 3
+%define  mmG  mm2
+%define  mmH  mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%elif RGB_BLUE == 3
+%define  mmG  mm4
+%define  mmH  mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%else
+%define  mmG  mm6
+%define  mmH  mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%endif
+
+; --------------------------------------------------------------------------

diff --git a/simd/jcqnt3dn.asm b/simd/jcqnt3dn.asm
new file mode 100644
index 0000000..182c869
--- /dev/null
+++ b/simd/jcqnt3dn.asm

@@ -0,0 +1,233 @@
+;
+; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                             FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	pi2fd	mm4,mm4
+	pi2fd	mm2,mm2
+	psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	pi2fd	mm5,mm5
+	pi2fd	mm0,mm0
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	pi2fd	mm6,mm6
+	pi2fd	mm3,mm3
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	pi2fd	mm4,mm4
+	pi2fd	mm1,mm1
+
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                             FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov       eax, 0x4B400000	; (float)0x00C00000 (rndint_magic)
+	movd      mm7,eax
+	punpckldq mm7,mm7		; mm7={12582912.0F 12582912.0F}
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm0,mm7			; mm0=(00 ** 01 **)
+	pfadd	mm1,mm7			; mm1=(02 ** 03 **)
+	pfadd	mm2,mm7			; mm0=(04 ** 05 **)
+	pfadd	mm3,mm7			; mm1=(06 ** 07 **)
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm1		; mm0=(00 02 ** **)
+	punpckhwd mm4,mm1		; mm4=(01 03 ** **)
+	movq      mm5,mm2
+	punpcklwd mm2,mm3		; mm2=(04 06 ** **)
+	punpckhwd mm5,mm3		; mm5=(05 07 ** **)
+
+	punpcklwd mm0,mm4		; mm0=(00 01 02 03)
+	punpcklwd mm2,mm5		; mm2=(04 05 06 07)
+
+	movq	mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm6,mm7			; mm0=(10 ** 11 **)
+	pfadd	mm1,mm7			; mm4=(12 ** 13 **)
+	pfadd	mm3,mm7			; mm0=(14 ** 15 **)
+	pfadd	mm4,mm7			; mm4=(16 ** 17 **)
+
+	movq      mm5,mm6
+	punpcklwd mm6,mm1		; mm6=(10 12 ** **)
+	punpckhwd mm5,mm1		; mm5=(11 13 ** **)
+	movq      mm1,mm3
+	punpcklwd mm3,mm4		; mm3=(14 16 ** **)
+	punpckhwd mm1,mm4		; mm1=(15 17 ** **)
+
+	punpcklwd mm6,mm5		; mm6=(10 11 12 13)
+	punpcklwd mm3,mm1		; mm3=(14 15 16 17)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcqntmmx.asm b/simd/jcqntmmx.asm
new file mode 100644
index 0000000..08b08b7
--- /dev/null
+++ b/simd/jcqntmmx.asm

@@ -0,0 +1,274 @@
+;
+; jcqntmmx.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	mm6,mm6			; mm6=(all 0's)
+	pcmpeqw	mm7,mm7
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
+	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
+
+	movq      mm4,mm0
+	punpcklbw mm0,mm6		; mm0=(0123)
+	punpckhbw mm4,mm6		; mm4=(4567)
+	movq      mm5,mm1
+	punpcklbw mm1,mm6		; mm1=(89AB)
+	punpckhbw mm5,mm6		; mm5=(CDEF)
+
+	paddw	mm0,mm7
+	paddw	mm4,mm7
+	paddw	mm1,mm7
+	paddw	mm5,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+	movq      mm0,mm2
+	punpcklbw mm2,mm6		; mm2=(GHIJ)
+	punpckhbw mm0,mm6		; mm0=(KLMN)
+	movq      mm4,mm3
+	punpcklbw mm3,mm6		; mm3=(OPQR)
+	punpckhbw mm4,mm6		; mm4=(STUV)
+
+	paddw	mm2,mm7
+	paddw	mm0,mm7
+	paddw	mm3,mm7
+	paddw	mm4,mm7
+
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+	add	esi, byte 4*SIZEOF_JSAMPROW
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
+;                     DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ah, 2
+	alignx	16,7
+.quantloop1:
+	mov	al, DCTSIZE2/8/2
+	alignx	16,7
+.quantloop2:
+	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+	movq	mm0,mm2
+	movq	mm1,mm3
+
+	psraw	mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
+	psraw	mm3,(WORD_BIT-1)
+
+	pxor	mm0,mm2   ; val = -val
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+
+	;
+	; MMX is an annoyingly crappy instruction set. It has two
+	; misfeatures that are causing problems here:
+	;
+	; - All multiplications are signed.
+	;
+	; - The second operand for the shifts is not treated as packed.
+	;
+	;
+	; We work around the first problem by implementing this algorithm:
+	;
+	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+	; {
+	;   enum { SHORT_BIT = 16 };
+	;   signed short sx = (signed short) x;
+	;   signed short sy = (signed short) y;
+	;   signed long sz;
+	; 
+	;   sz = (long) sx * (long) sy;     /* signed multiply */
+	; 
+	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
+	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
+	; 
+	;   return (unsigned long) sz;
+	; }
+	;
+	; (note that a negative sx adds _sy_ and vice versa)
+	;
+	; For the second problem, we replace the shift by a multiplication.
+	; Unfortunately that means we have to deal with the signed issue again.
+	;
+
+	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
+	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
+
+	movq	mm4,mm0   ; store current value for later
+	movq	mm5,mm1
+	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
+	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
+	paddw	mm0,mm4		; reciprocal is always negative (MSB=1),
+	paddw	mm1,mm5   ; so we always need to add the initial value
+	                ; (input value is never negative as we
+	                ; inverted it at the start of this routine)
+
+	; here it gets a bit tricky as both scale
+	; and mm0/mm1 can be negative
+	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
+	movq	mm7, MMWORD [SCALE(0,1,edx)]
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pmulhw	mm0,mm6
+	pmulhw	mm1,mm7
+
+	psraw	mm6,(WORD_BIT-1)    ; determine if scale is negative
+	psraw	mm7,(WORD_BIT-1)
+
+	pand	mm6,mm4             ; and add input if it is
+	pand	mm7,mm5
+	paddw	mm0,mm6
+	paddw	mm1,mm7
+
+	psraw	mm4,(WORD_BIT-1)    ; then check if negative input 
+	psraw	mm5,(WORD_BIT-1)
+
+	pand	mm4, MMWORD [SCALE(0,0,edx)]	; and add scale if it is
+	pand	mm5, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	pxor	mm0,mm2   ; val = -val
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+	add	esi, byte 8*SIZEOF_DCTELEM
+	add	edx, byte 8*SIZEOF_DCTELEM
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	al
+	jnz	near .quantloop2
+	dec	ah
+	jnz	near .quantloop1	; to avoid branch misprediction
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcqnts2f-64.asm b/simd/jcqnts2f-64.asm
new file mode 100644
index 0000000..e09387c
--- /dev/null
+++ b/simd/jcqnts2f-64.asm

@@ -0,0 +1,156 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT * workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+	push	rbp
+	mov	rbp,rsp
+	push	rbx
+	collect_args
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov rsi, r10
+	mov	rax, r11
+	mov rdi, r12
+	mov	rcx, DCTSIZE/2
+.convloop:
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	rsi, byte 2*SIZEOF_JSAMPROW
+	add	rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	rcx
+	jnz	short .convloop
+
+	uncollect_args
+	pop	rbx
+	pop	rbp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT * divisors
+; r12 = FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+	push	rbp
+	mov	rbp,rsp
+	collect_args
+
+	mov rsi, r12
+	mov rdx, r11
+	mov rdi, r10
+	mov	rax, DCTSIZE2/16
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+	add	rsi, byte 16*SIZEOF_FAST_FLOAT
+	add	rdx, byte 16*SIZEOF_FAST_FLOAT
+	add	rdi, byte 16*SIZEOF_JCOEF
+	dec	rax
+	jnz	short .quantloop
+
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcqnts2f.asm b/simd/jcqnts2f.asm
new file mode 100644
index 0000000..d80ae5d
--- /dev/null
+++ b/simd/jcqnts2f.asm

@@ -0,0 +1,171 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcqnts2i-64.asm b/simd/jcqnts2i-64.asm
new file mode 100644
index 0000000..4568dfc
--- /dev/null
+++ b/simd/jcqnts2i-64.asm

@@ -0,0 +1,185 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+	push	rbp
+	mov	rbp,rsp
+	push	rbx
+	collect_args
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov rsi, r10
+	mov rax, r11
+	mov rdi, r12
+	mov	rcx, DCTSIZE/4
+.convloop:
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7
+	paddw     xmm1,xmm7
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7
+	paddw     xmm3,xmm7
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+	add	rsi, byte 4*SIZEOF_JSAMPROW
+	add	rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	rcx
+	jnz	short .convloop
+
+	uncollect_args
+	pop	rbx
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM * divisors
+; r12 = DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+	push	rbp
+	mov	rbp,rsp
+	collect_args
+
+	mov rsi, r12
+	mov rdx, r11
+	mov rdi, r10
+	mov	rax, DCTSIZE2/32
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,rdx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,rdx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,rdx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+	add	rsi, byte 32*SIZEOF_DCTELEM
+	add	rdx, byte 32*SIZEOF_DCTELEM
+	add	rdi, byte 32*SIZEOF_JCOEF
+	dec	rax
+	jnz	near .quantloop
+
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcqnts2i.asm b/simd/jcqnts2i.asm
new file mode 100644
index 0000000..0864d6e
--- /dev/null
+++ b/simd/jcqnts2i.asm

@@ -0,0 +1,200 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7
+	paddw     xmm1,xmm7
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7
+	paddw     xmm3,xmm7
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 4*SIZEOF_JSAMPROW
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/32
+	alignx	16,7
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,edx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,edx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,edx)]
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,edx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,edx)]
+
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 32*SIZEOF_DCTELEM
+	add	edx, byte 32*SIZEOF_DCTELEM
+	add	edi, byte 32*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcqntsse.asm b/simd/jcqntsse.asm
new file mode 100644
index 0000000..3065eca
--- /dev/null
+++ b/simd/jcqntsse.asm

@@ -0,0 +1,211 @@
+;
+; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
+	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
+	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
+	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
+	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
+	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
+
+	movlhps   xmm0,xmm1			; xmm0=(0123)
+	movlhps   xmm2,xmm3			; xmm2=(4567)
+	movlhps   xmm4,xmm5			; xmm4=(89AB)
+	movlhps   xmm6,xmm7			; xmm6=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                           FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	movhlps  xmm4,xmm0
+	movhlps  xmm5,xmm1
+
+	cvtps2pi mm0,xmm0
+	cvtps2pi mm1,xmm1
+	cvtps2pi mm4,xmm4
+	cvtps2pi mm5,xmm5
+
+	movhlps  xmm6,xmm2
+	movhlps  xmm7,xmm3
+
+	cvtps2pi mm2,xmm2
+	cvtps2pi mm3,xmm3
+	cvtps2pi mm6,xmm6
+	cvtps2pi mm7,xmm7
+
+	packssdw mm0,mm4
+	packssdw mm1,mm5
+	packssdw mm2,mm6
+	packssdw mm3,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcsammmx.asm b/simd/jcsammmx.asm
new file mode 100644
index 0000000..c7126a0
--- /dev/null
+++ b/simd/jcsammmx.asm

@@ -0,0 +1,324 @@
+;
+; jcsammmx.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	short .return
+
+	mov       edx, 0x00010000	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mm2,mm0
+	movq	mm3,mm1
+
+	pand	mm0,mm6
+	psrlw	mm2,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm3,BYTE_BIT
+
+	paddw	mm0,mm2
+	paddw	mm1,mm3
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+	psrlw	mm0,1
+	psrlw	mm1,1
+
+	packuswb mm0,mm1
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	short .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov       edx, 0x00020001	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pand	mm0,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	movq	mm4,mm2
+	movq	mm5,mm3
+	pand	mm2,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm3,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm2,mm4
+	paddw	mm3,mm5
+
+	paddw	mm0,mm1
+	paddw	mm2,mm3
+	paddw	mm0,mm7
+	paddw	mm2,mm7
+	psrlw	mm0,2
+	psrlw	mm2,2
+
+	packuswb mm0,mm2
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcsamss2-64.asm b/simd/jcsamss2-64.asm
new file mode 100644
index 0000000..29c3f4f
--- /dev/null
+++ b/simd/jcsamss2-64.asm

@@ -0,0 +1,328 @@
+;
+; jcsamss2.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+	push	rbp
+	mov	rbp,rsp
+	collect_args
+
+	mov rcx, r13
+	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
+	jz	near .return
+
+	mov rdx, r10
+
+	; -- expand_right_edge
+
+	push	rcx
+	shl	rcx,1				; output_cols * 2
+	sub	rcx,rdx
+	jle	short .expand_end
+
+	mov	rax, r11
+	test	rax,rax
+	jle	short .expand_end
+
+	cld
+	mov	rsi, r14	; input_data
+.expandloop:
+	push	rax
+	push	rcx
+
+	mov	rdi, JSAMPROW [rsi]
+	add	rdi,rdx
+	mov	al, JSAMPLE [rdi-1]
+
+	rep stosb
+
+	pop	rcx
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	dec	rax
+	jg	short .expandloop
+
+.expand_end:
+	pop	rcx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	rax, r12	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	rdx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	rsi, r14	; input_data
+	mov	rdi, r15	; output_data
+.rowloop:
+	push	rcx
+	push	rdi
+	push	rsi
+
+	mov	rsi, JSAMPROW [rsi]		; inptr
+	mov rdi, JSAMPROW [rdi]		; outptr
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1
+	mov	rcx, SIZEOF_XMMWORD
+	jmp	short .downsample
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6
+	psrlw	xmm2,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	rcx, byte SIZEOF_XMMWORD	; outcol
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	rcx,rcx
+	jnz	short .columnloop_r8
+
+	pop	rsi
+	pop	rdi
+	pop	rcx
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte SIZEOF_JSAMPROW	; output_data
+	dec	rax				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+	push	rbp
+	mov	rbp,rsp
+	collect_args
+
+	mov	rcx, r13
+	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
+	jz	near .return
+
+	mov	rdx, r10
+
+	; -- expand_right_edge
+
+	push	rcx
+	shl	rcx,1				; output_cols * 2
+	sub	rcx,rdx
+	jle	short .expand_end
+
+	mov	rax, r11
+	test	rax,rax
+	jle	short .expand_end
+
+	cld
+	mov	rsi, r14	; input_data
+.expandloop:
+	push	rax
+	push	rcx
+
+	mov	rdi, JSAMPROW [rsi]
+	add	rdi,rdx
+	mov	al, JSAMPLE [rdi-1]
+
+	rep stosb
+
+	pop	rcx
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	dec	rax
+	jg	short .expandloop
+
+.expand_end:
+	pop	rcx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	rax, r12	; rowctr
+	test	rax,rax
+	jle	near .return
+
+	mov	rdx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	rsi, r14	; input_data
+	mov	rdi, r15	; output_data
+.rowloop:
+	push	rcx
+	push	rdi
+	push	rsi
+
+	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	rdi, JSAMPROW [rdi]			; outptr
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	mov	rcx, SIZEOF_XMMWORD
+	jmp	short .downsample
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4
+	paddw	xmm1,xmm5
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	rcx, byte SIZEOF_XMMWORD	; outcol
+	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	rcx,rcx
+	jnz	near .columnloop_r8
+
+	pop	rsi
+	pop	rdi
+	pop	rcx
+
+	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	rax				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jcsamss2.asm b/simd/jcsamss2.asm
new file mode 100644
index 0000000..818e911
--- /dev/null
+++ b/simd/jcsamss2.asm

@@ -0,0 +1,351 @@
+;
+; jcsamss2.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6
+	psrlw	xmm2,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	ecx,ecx
+	jnz	short .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4
+	paddw	xmm1,xmm5
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm
new file mode 100644
index 0000000..79772e0
--- /dev/null
+++ b/simd/jdclrmmx.asm

@@ -0,0 +1,407 @@
+;
+; jdclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
+;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                            JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b)	(b)+8			; JDIMENSION out_width
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
+	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
+
+	pcmpeqw	mm4,mm4
+	pcmpeqw	mm7,mm7
+	psrlw	mm4,BYTE_BIT
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	mm4,mm5			; mm4=Cb(0246)=CbE
+	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
+	pand	mm0,mm1			; mm0=Cr(0246)=CrE
+	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
+
+	paddw	mm4,mm7
+	paddw	mm5,mm7
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm2,mm4			; mm2=CbE
+	movq	mm3,mm5			; mm3=CbO
+	paddw	mm4,mm4			; mm4=2*CbE
+	paddw	mm5,mm5			; mm5=2*CbO
+	movq	mm6,mm0			; mm6=CrE
+	movq	mm7,mm1			; mm7=CrO
+	paddw	mm0,mm0			; mm0=2*CrE
+	paddw	mm1,mm1			; mm1=2*CrO
+
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
+	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
+	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
+
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	paddw	mm5,[GOTOFF(eax,PW_ONE)]
+	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
+	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	paddw	mm1,[GOTOFF(eax,PW_ONE)]
+	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
+	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
+
+	paddw	mm4,mm2
+	paddw	mm5,mm3
+	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
+	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
+
+	movq      mm4,mm2
+	movq      mm5,mm3
+	punpcklwd mm2,mm6
+	punpckhwd mm4,mm6
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm3,mm7
+	punpckhwd mm5,mm7
+	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm4,SCALEBITS
+	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm3,SCALEBITS
+	psrad     mm5,SCALEBITS
+
+	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
+
+	pcmpeqw   mm4,mm4
+	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      mm4,mm5		; mm4=Y(0246)=YE
+	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
+
+	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .nextrow
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .nextrow
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
new file mode 100644
index 0000000..ea9d2ac
--- /dev/null
+++ b/simd/jdclrss2-64.asm

@@ -0,0 +1,487 @@
+;
+; jdclrss2.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+; r10 = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14 = int num_rows
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	push	rbx
+	collect_args
+
+	mov	rcx, r10	; num_cols
+	test	rcx,rcx
+	jz	near .return
+
+	push	rcx
+
+	mov	rdi, r11
+	mov	rcx, r12
+	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+	pop	rcx
+
+	mov	rdi, r13
+	mov	rax, r14
+	test	rax,rax
+	jle	near .return
+.rowloop:
+	push	rax
+	push	rdi
+	push	rdx
+	push	rbx
+	push	rsi
+	push	rcx			; col
+
+	mov	rsi, JSAMPROW [rsi]	; inptr0
+	mov	rbx, JSAMPROW [rbx]	; inptr1
+	mov	rdx, JSAMPROW [rdx]	; inptr2
+	mov	rdi, JSAMPROW [rdi]	; outptr
+.columnloop:
+
+	movdqa	xmm5, XMMWORD [rbx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [rdx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4
+	pcmpeqw	xmm7,xmm7
+	psrlw	xmm4,BYTE_BIT
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7
+	paddw	xmm5,xmm7
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[rel PW_MF0228]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[rel PW_F0402]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[rel PW_ONE]
+	paddw	xmm5,[rel PW_ONE]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[rel PW_ONE]
+	paddw	xmm1,[rel PW_ONE]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm5,xmm3
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm4,xmm6
+	pmaddwd   xmm2,[rel PW_MF0344_F0285]
+	pmaddwd   xmm4,[rel PW_MF0344_F0285]
+	punpcklwd xmm3,xmm7
+	punpckhwd xmm5,xmm7
+	pmaddwd   xmm3,[rel PW_MF0344_F0285]
+	pmaddwd   xmm5,[rel PW_MF0344_F0285]
+
+	paddd     xmm2,[rel PD_ONEHALF]
+	paddd     xmm4,[rel PD_ONEHALF]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[rel PD_ONEHALF]
+	paddd     xmm5,[rel PD_ONEHALF]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [rsi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	rcx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	rcx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	rax,rcx
+	xor	rcx, byte 0x0F
+	shl	rcx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	rax,rcx
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,rcx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	rcx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	rcx, byte SIZEOF_XMMWORD/16
+	jb	near .nextrow
+	mov	rax,rcx
+	xor	rcx, byte 0x03
+	inc	rcx
+	shl	rcx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	rax, [rcx+rax*4]	; RGB_PIXELSIZE
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	rcx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+	pop	rcx
+	pop	rsi
+	pop	rbx
+	pop	rdx
+	pop	rdi
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	add	rbx, byte SIZEOF_JSAMPROW
+	add	rdx, byte SIZEOF_JSAMPROW
+	add	rdi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	rax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer
+
+.return:
+	uncollect_args
+	pop	rbx
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
new file mode 100644
index 0000000..865fa82
--- /dev/null
+++ b/simd/jdclrss2.asm

@@ -0,0 +1,505 @@
+;
+; jdclrss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b)	(b)+8			; JDIMENSION out_width
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movdqa	xmm5, XMMWORD [ebx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [edx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4
+	pcmpeqw	xmm7,xmm7
+	psrlw	xmm4,BYTE_BIT
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7
+	paddw	xmm5,xmm7
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[GOTOFF(eax,PW_MF0228)]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[GOTOFF(eax,PW_F0402)]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm5,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm1,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm5,xmm3
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm4,xmm6
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm3,xmm7
+	punpckhwd xmm5,xmm7
+	pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [esi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	eax,ecx
+	xor	ecx, byte 0x0F
+	shl	ecx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	eax,ecx
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16
+	jb	short .nextrow
+	mov	eax,ecx
+	xor	ecx, byte 0x03
+	inc	ecx
+	shl	ecx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm
new file mode 100644
index 0000000..58775e8
--- /dev/null
+++ b/simd/jdcolmmx.asm

@@ -0,0 +1,117 @@
+;
+; jdcolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdclrmmx.asm"

diff --git a/simd/jdcolss2-64.asm b/simd/jdcolss2-64.asm
new file mode 100644
index 0000000..5e8a322
--- /dev/null
+++ b/simd/jdcolss2-64.asm

@@ -0,0 +1,117 @@
+;
+; jdcolss2.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402	times 8 dw  F_0_402
+PW_MF0228	times 8 dw -F_0_228
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
+PW_ONE		times 8 dw  1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdclrss2-64.asm"

diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm
new file mode 100644
index 0000000..7ae985d
--- /dev/null
+++ b/simd/jdcolss2.asm

@@ -0,0 +1,117 @@
+;
+; jdcolss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402	times 8 dw  F_0_402
+PW_MF0228	times 8 dw -F_0_228
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
+PW_ONE		times 8 dw  1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdclrss2.asm"

diff --git a/simd/jdct.inc b/simd/jdct.inc
new file mode 100644
index 0000000..cc62704
--- /dev/null
+++ b/simd/jdct.inc

@@ -0,0 +1,28 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required.  We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
+
+%define ROW(n,b,s)		((b)+(n)*(s))
+%define COL(n,b,s)		((b)+(n)*(s)*DCTSIZE)
+
+%define DWBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
+%define MMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
+%define XMMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+
+; --------------------------------------------------------------------------

diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm
new file mode 100644
index 0000000..fd587fb
--- /dev/null
+++ b/simd/jdmermmx.asm

@@ -0,0 +1,123 @@
+;
+; jdmermmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"

diff --git a/simd/jdmerss2-64.asm b/simd/jdmerss2-64.asm
new file mode 100644
index 0000000..2f9c5c1
--- /dev/null
+++ b/simd/jdmerss2-64.asm

@@ -0,0 +1,123 @@
+;
+; jdmerss2.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402	times 8 dw  F_0_402
+PW_MF0228	times 8 dw -F_0_228
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
+PW_ONE		times 8 dw  1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgss2-64.asm"

diff --git a/simd/jdmerss2.asm b/simd/jdmerss2.asm
new file mode 100644
index 0000000..2294e0d
--- /dev/null
+++ b/simd/jdmerss2.asm

@@ -0,0 +1,123 @@
+;
+; jdmerss2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402	times 8 dw  F_0_402
+PW_MF0228	times 8 dw -F_0_228
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
+PW_ONE		times 8 dw  1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgss2.asm"

diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm
new file mode 100644
index 0000000..f5fa936
--- /dev/null
+++ b/simd/jdmrgmmx.asm

@@ -0,0 +1,466 @@
+;
+; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		3
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [output_width(eax)]	; col
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
+	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
+
+	pxor      mm1,mm1		; mm1=(all 0's)
+	pcmpeqw   mm3,mm3
+	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	movq      mm4,mm6
+	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
+	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
+	movq      mm0,mm7
+	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
+	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
+
+	paddw     mm6,mm3
+	paddw     mm4,mm3
+	paddw     mm7,mm3
+	paddw     mm0,mm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm5,mm6			; mm5=CbH
+	movq	mm2,mm4			; mm2=CbL
+	paddw	mm6,mm6			; mm6=2*CbH
+	paddw	mm4,mm4			; mm4=2*CbL
+	movq	mm1,mm7			; mm1=CrH
+	movq	mm3,mm0			; mm3=CrL
+	paddw	mm7,mm7			; mm7=2*CrH
+	paddw	mm0,mm0			; mm0=2*CrL
+
+	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
+	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
+
+	paddw	mm6,[GOTOFF(eax,PW_ONE)]
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
+	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
+	paddw	mm7,[GOTOFF(eax,PW_ONE)]
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
+	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
+
+	paddw	mm6,mm5
+	paddw	mm4,mm2
+	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
+	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
+
+	movq      mm6,mm5
+	movq      mm7,mm2
+	punpcklwd mm5,mm1
+	punpckhwd mm6,mm1
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm2,mm3
+	punpckhwd mm7,mm3
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm5,SCALEBITS
+	psrad     mm6,SCALEBITS
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm7,SCALEBITS
+
+	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st
+	alignx	16,7
+
+.Yloop_2nd:
+	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
+	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
+	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
+
+	pcmpeqw	mm6,mm6
+	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	mm6,mm7			; mm6=Y(0246)=YE
+	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
+
+	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
+	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
+	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
+
+	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .endcolumn
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .endcolumn
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, JDIMENSION [output_width(ebp)]
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; output_width
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+	add	esp, byte 7*SIZEOF_DWORD
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
new file mode 100644
index 0000000..3e54c7a
--- /dev/null
+++ b/simd/jdmrgss2-64.asm

@@ -0,0 +1,569 @@
+;
+; jdmrgss2.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		3
+
+	align	16
+	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	push	rbx
+	collect_args
+
+	mov	rcx, r10	; col
+	test	rcx,rcx
+	jz	near .return
+
+	push	rcx
+
+	mov	rdi, r11
+	mov	rcx, r12
+	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+	mov	rdi, r13
+	mov	rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]		; inptr0
+	mov	rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]		; inptr1
+	mov	rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]		; inptr2
+	mov	rdi, JSAMPROW [rdi]				; outptr
+
+	pop	rcx			; col
+
+.columnloop:
+
+	movdqa    xmm6, XMMWORD [rbx]	; xmm6=Cb(0123456789ABCDEF)
+	movdqa    xmm7, XMMWORD [rdx]	; xmm7=Cr(0123456789ABCDEF)
+
+	pxor      xmm1,xmm1		; xmm1=(all 0's)
+	pcmpeqw   xmm3,xmm3
+	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	movdqa    xmm4,xmm6
+	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
+	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
+	movdqa    xmm0,xmm7
+	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
+	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
+
+	paddw     xmm6,xmm3
+	paddw     xmm4,xmm3
+	paddw     xmm7,xmm3
+	paddw     xmm0,xmm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm5,xmm6		; xmm5=CbH
+	movdqa	xmm2,xmm4		; xmm2=CbL
+	paddw	xmm6,xmm6		; xmm6=2*CbH
+	paddw	xmm4,xmm4		; xmm4=2*CbL
+	movdqa	xmm1,xmm7		; xmm1=CrH
+	movdqa	xmm3,xmm0		; xmm3=CrL
+	paddw	xmm7,xmm7		; xmm7=2*CrH
+	paddw	xmm0,xmm0		; xmm0=2*CrL
+
+	pmulhw	xmm6,[rel PW_MF0228]	; xmm6=(2*CbH * -FIX(0.22800))
+	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbL * -FIX(0.22800))
+	pmulhw	xmm7,[rel PW_F0402]	; xmm7=(2*CrH * FIX(0.40200))
+	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrL * FIX(0.40200))
+
+	paddw	xmm6,[rel PW_ONE]
+	paddw	xmm4,[rel PW_ONE]
+	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
+	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
+	paddw	xmm7,[rel PW_ONE]
+	paddw	xmm0,[rel PW_ONE]
+	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
+	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
+
+	paddw	xmm6,xmm5
+	paddw	xmm4,xmm2
+	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
+
+	movdqa    xmm6,xmm5
+	movdqa    xmm7,xmm2
+	punpcklwd xmm5,xmm1
+	punpckhwd xmm6,xmm1
+	pmaddwd   xmm5,[rel PW_MF0344_F0285]
+	pmaddwd   xmm6,[rel PW_MF0344_F0285]
+	punpcklwd xmm2,xmm3
+	punpckhwd xmm7,xmm3
+	pmaddwd   xmm2,[rel PW_MF0344_F0285]
+	pmaddwd   xmm7,[rel PW_MF0344_F0285]
+
+	paddd     xmm5,[rel PD_ONEHALF]
+	paddd     xmm6,[rel PD_ONEHALF]
+	psrad     xmm5,SCALEBITS
+	psrad     xmm6,SCALEBITS
+	paddd     xmm2,[rel PD_ONEHALF]
+	paddd     xmm7,[rel PD_ONEHALF]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm7,SCALEBITS
+
+	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st
+
+.Yloop_2nd:
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
+
+.Yloop_1st:
+	movdqa	xmm7, XMMWORD [rsi]	; xmm7=Y(0123456789ABCDEF)
+
+	pcmpeqw	xmm6,xmm6
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
+	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
+
+	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
+	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
+	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
+
+	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	rcx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	rcx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	rax,rcx
+	xor	rcx, byte 0x0F
+	shl	rcx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	rax,rcx
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	rcx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	rcx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	rcx, byte SIZEOF_XMMWORD/16
+	jb	near .endcolumn
+	mov	rax,rcx
+	xor	rcx, byte 0x03
+	inc	rcx
+	shl	rcx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	rax, [rcx+rax*4]	; RGB_PIXELSIZE
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	rcx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	sfence		; flush the write buffer
+
+.return:
+	uncollect_args
+	pop	rbx
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+	push	rbp
+	mov	rbp,rsp
+	push	rbx
+	collect_args
+
+	mov	rax, r10
+
+	mov	rdi, r11
+	mov	rcx, r12
+	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+	mov	rdi, r13
+	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+	push	rdx			; inptr2
+	push	rbx			; inptr1
+	push	rsi			; inptr00
+	mov	rbx,rsp
+
+	push	rdi
+	push	rcx
+	push	rax
+
+	mov rdx, rcx
+	mov rcx, rdi
+	mov	rdi, rax
+	mov rsi, rbx
+
+	call	EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+	pop rax
+	pop rcx
+	pop rdi
+	pop rsi
+	pop rbx
+	pop rdx
+
+	add	rdi, byte SIZEOF_JSAMPROW	; outptr1
+	add	rsi, byte SIZEOF_JSAMPROW	; inptr01
+
+	push	rdx			; inptr2
+	push	rbx			; inptr1
+	push	rsi			; inptr00
+	mov	rbx,rsp
+
+	push	rdi
+	push	rcx
+	push	rax
+
+	mov rdx, rcx
+	mov rcx, rdi
+	mov	rdi, rax
+	mov rsi, rbx
+
+	call	EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+	pop rax
+	pop rcx
+	pop rdi
+	pop rsi
+	pop rbx
+	pop rdx
+
+	uncollect_args
+	pop	rbx
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
new file mode 100644
index 0000000..99b7eb9
--- /dev/null
+++ b/simd/jdmrgss2.asm

@@ -0,0 +1,564 @@
+;
+; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		3
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [output_width(eax)]	; col
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movdqa    xmm6, XMMWORD [ebx]	; xmm6=Cb(0123456789ABCDEF)
+	movdqa    xmm7, XMMWORD [edx]	; xmm7=Cr(0123456789ABCDEF)
+
+	pxor      xmm1,xmm1		; xmm1=(all 0's)
+	pcmpeqw   xmm3,xmm3
+	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	movdqa    xmm4,xmm6
+	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
+	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
+	movdqa    xmm0,xmm7
+	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
+	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
+
+	paddw     xmm6,xmm3
+	paddw     xmm4,xmm3
+	paddw     xmm7,xmm3
+	paddw     xmm0,xmm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm5,xmm6		; xmm5=CbH
+	movdqa	xmm2,xmm4		; xmm2=CbL
+	paddw	xmm6,xmm6		; xmm6=2*CbH
+	paddw	xmm4,xmm4		; xmm4=2*CbL
+	movdqa	xmm1,xmm7		; xmm1=CrH
+	movdqa	xmm3,xmm0		; xmm3=CrL
+	paddw	xmm7,xmm7		; xmm7=2*CrH
+	paddw	xmm0,xmm0		; xmm0=2*CrL
+
+	pmulhw	xmm6,[GOTOFF(eax,PW_MF0228)]	; xmm6=(2*CbH * -FIX(0.22800))
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbL * -FIX(0.22800))
+	pmulhw	xmm7,[GOTOFF(eax,PW_F0402)]	; xmm7=(2*CrH * FIX(0.40200))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrL * FIX(0.40200))
+
+	paddw	xmm6,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
+	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
+	paddw	xmm7,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
+	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
+
+	paddw	xmm6,xmm5
+	paddw	xmm4,xmm2
+	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
+
+	movdqa    xmm6,xmm5
+	movdqa    xmm7,xmm2
+	punpcklwd xmm5,xmm1
+	punpckhwd xmm6,xmm1
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm2,xmm3
+	punpckhwd xmm7,xmm3
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm5,SCALEBITS
+	psrad     xmm6,SCALEBITS
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm7,SCALEBITS
+
+	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st
+	alignx	16,7
+
+.Yloop_2nd:
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movdqa	xmm7, XMMWORD [esi]	; xmm7=Y(0123456789ABCDEF)
+
+	pcmpeqw	xmm6,xmm6
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
+	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
+
+	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
+	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
+	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
+
+	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	eax,ecx
+	xor	ecx, byte 0x0F
+	shl	ecx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	eax,ecx
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16
+	jb	short .endcolumn
+	mov	eax,ecx
+	xor	ecx, byte 0x03
+	inc	ecx
+	shl	ecx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	sfence		; flush the write buffer
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+%define output_width(b)	(b)+8			; JDIMENSION output_width
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, POINTER [output_width(ebp)]
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; output_width
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
+
+	call	near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+	add	esp, byte 7*SIZEOF_DWORD
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdsammmx.asm b/simd/jdsammmx.asm
new file mode 100644
index 0000000..c09e5b9
--- /dev/null
+++ b/simd/jdsammmx.asm

@@ -0,0 +1,737 @@
+;
+; jdsammmx.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE		times 4 dw  1
+PW_TWO		times 4 dw  2
+PW_THREE	times 4 dw  3
+PW_SEVEN	times 4 dw  7
+PW_EIGHT	times 4 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	mm0,mm0			; mm0=(all 0's)
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
+	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	mm6,mm6
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
+	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2,mm1
+	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
+	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
+	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
+
+	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
+	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
+
+	movq	mm7,mm1
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
+
+	movq      mm4,mm1
+	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
+	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
+	movq      mm5,mm2
+	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
+	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
+	movq      mm6,mm3
+	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
+	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
+
+	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	mm2,mm1
+	paddw	mm5,mm4
+	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
+	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
+	paddw	mm3,mm1
+	paddw	mm6,mm4
+	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
+	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
+
+	psllw	mm3,BYTE_BIT
+	psllw	mm6,BYTE_BIT
+	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(edx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip
+	push	edx
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
+	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
+	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+	pand	mm1,mm7			; mm1=( 0 - - -)
+	pand	mm2,mm7			; mm2=( 0 - - -)
+
+	movq	MMWORD [wk(0)], mm1
+	movq	MMWORD [wk(1)], mm2
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	mm1,mm1
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
+	movq	mm2,mm1
+
+	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
+	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
+	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
+	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+.upsample:
+	; -- process the upper row
+
+	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
+	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
+
+	movq	mm0,mm7
+	movq	mm4,mm3
+	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
+	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
+	movq	mm5,mm7
+	movq	mm6,mm3
+	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
+	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
+
+	por	mm0,mm4				; mm0=( 1 2 3 4)
+	por	mm5,mm6				; mm5=( 3 4 5 6)
+
+	movq	mm1,mm7
+	movq	mm2,mm3
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
+	movq	mm4,mm3
+	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
+
+	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
+	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
+
+	movq	MMWORD [wk(0)], mm4
+
+	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm7
+	paddw	mm5,mm3
+	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
+	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
+	paddw	mm0,mm7
+	paddw	mm2,mm3
+	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
+	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
+
+	psllw	mm0,BYTE_BIT
+	psllw	mm2,BYTE_BIT
+	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+	; -- process the lower row
+
+	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
+	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
+
+	movq	mm7,mm6
+	movq	mm3,mm4
+	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
+	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
+	movq	mm0,mm6
+	movq	mm2,mm4
+	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
+	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
+
+	por	mm7,mm3				; mm7=( 1 2 3 4)
+	por	mm0,mm2				; mm0=( 3 4 5 6)
+
+	movq	mm1,mm6
+	movq	mm5,mm4
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
+	movq	mm3,mm4
+	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
+
+	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
+	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
+
+	movq	MMWORD [wk(1)], mm3
+
+	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm6
+	paddw	mm0,mm4
+	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
+	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
+	paddw	mm7,mm6
+	paddw	mm5,mm4
+	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
+	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
+
+	psllw	mm7,BYTE_BIT
+	psllw	mm5,BYTE_BIT
+	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)
+	jz	short .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	eax,edx				; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0
+	punpckhbw mm1,mm1
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2
+	punpckhbw mm3,mm3
+
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
+;                          JDIMENSION output_width,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	eax,edx					; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0
+	punpckhbw mm1,mm1
+
+	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2
+	punpckhbw mm3,mm3
+
+	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdsamss2-64.asm b/simd/jdsamss2-64.asm
new file mode 100644
index 0000000..8521491
--- /dev/null
+++ b/simd/jdsamss2-64.asm

@@ -0,0 +1,668 @@
+;
+; jdsamss2.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE		times 8 dw  1
+PW_TWO		times 8 dw  2
+PW_THREE	times 8 dw  3
+PW_SEVEN	times 8 dw  7
+PW_EIGHT	times 8 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+	push	rbp
+	mov	rbp,rsp
+	collect_args
+
+	mov	rax, r11  ; colctr
+	test	rax,rax
+	jz	near .return
+
+	mov	rcx, r10	; rowctr
+	test	rcx,rcx
+	jz	near .return
+
+	mov	rsi, r12	; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rax			; colctr
+	push	rdi
+	push	rsi
+
+	mov	rsi, JSAMPROW [rsi]	; inptr
+	mov	rdi, JSAMPROW [rdi]	; outptr
+
+	test	rax, SIZEOF_XMMWORD-1
+	jz	short .skip
+	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)
+	pand	xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+	add	rax, byte SIZEOF_XMMWORD-1
+	and	rax, byte -SIZEOF_XMMWORD
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+
+.columnloop_last:
+	pcmpeqb	xmm6,xmm6
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)
+	pand	xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	jmp	short .upsample
+
+.columnloop:
+	movdqa	xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)
+
+.upsample:
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2,xmm1
+	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
+	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
+	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
+
+	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
+	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
+
+	movdqa	xmm7,xmm1
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
+	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
+	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
+
+	pmullw	xmm1,[rel PW_THREE]
+	pmullw	xmm4,[rel PW_THREE]
+	paddw	xmm2,[rel PW_ONE]
+	paddw	xmm5,[rel PW_ONE]
+	paddw	xmm3,[rel PW_TWO]
+	paddw	xmm6,[rel PW_TWO]
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm3,BYTE_BIT
+	psllw	xmm6,BYTE_BIT
+	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+	sub	rax, byte SIZEOF_XMMWORD
+	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	rsi
+	pop	rdi
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte SIZEOF_JSAMPROW	; output_data
+	dec	rcx				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		4
+
+	align	16
+	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	push	rbx
+	collect_args
+
+	mov	rax, r11  ; colctr
+	test	rax,rax
+	jz	near .return
+
+	mov	rcx, r10	; rowctr
+	test	rcx,rcx
+	jz	near .return
+
+	mov	rsi, r12	; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rax					; colctr
+	push	rcx
+	push	rdi
+	push	rsi
+
+	mov	rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	rax, SIZEOF_XMMWORD-1
+	jz	short .skip
+	push	rdx
+	mov	dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	rdx
+.skip:
+	; -- process the first column block
+
+	movdqa	xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
+	movdqa	xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
+	movdqa	xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[rel PW_THREE]
+	pmullw	xmm4,[rel PW_THREE]
+
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-2)
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
+	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
+
+	movdqa	XMMWORD [wk(0)], xmm1
+	movdqa	XMMWORD [wk(1)], xmm2
+
+	add	rax, byte SIZEOF_XMMWORD-1
+	and	rax, byte -SIZEOF_XMMWORD
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+
+.columnloop_last:
+	; -- process the last column block
+
+	pcmpeqb	xmm1,xmm1
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)
+	movdqa	xmm2,xmm1
+
+	pand	xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+	pand	xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
+	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
+
+	jmp	near .upsample
+
+.columnloop:
+	; -- process the next column block
+
+	movdqa	xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
+	movdqa	xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
+	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[rel PW_THREE]
+	pmullw	xmm4,[rel PW_THREE]
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
+	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
+
+	movdqa	XMMWORD [wk(2)], xmm1
+	movdqa	XMMWORD [wk(3)], xmm2
+
+.upsample:
+	; -- process the upper row
+
+	movdqa	xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
+	movdqa	xmm5,xmm7
+	movdqa	xmm6,xmm3
+	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
+
+	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
+	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm2,xmm3
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm4,xmm3
+	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(0)], xmm4
+
+	pmullw	xmm7,[rel PW_THREE]
+	pmullw	xmm3,[rel PW_THREE]
+	paddw	xmm1,[rel PW_EIGHT]
+	paddw	xmm5,[rel PW_EIGHT]
+	paddw	xmm0,[rel PW_SEVEN]
+	paddw	xmm2,[rel PW_SEVEN]
+
+	paddw	xmm1,xmm7
+	paddw	xmm5,xmm3
+	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm3
+	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm0,BYTE_BIT
+	psllw	xmm2,BYTE_BIT
+	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+	; -- process the lower row
+
+	movdqa	xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+	movdqa	xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
+	movdqa	xmm0,xmm6
+	movdqa	xmm2,xmm4
+	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
+
+	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
+	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm4
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm3,xmm4
+	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(1)], xmm3
+
+	pmullw	xmm6,[rel PW_THREE]
+	pmullw	xmm4,[rel PW_THREE]
+	paddw	xmm1,[rel PW_EIGHT]
+	paddw	xmm0,[rel PW_EIGHT]
+	paddw	xmm7,[rel PW_SEVEN]
+	paddw	xmm5,[rel PW_SEVEN]
+
+	paddw	xmm1,xmm6
+	paddw	xmm0,xmm4
+	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm7,xmm6
+	paddw	xmm5,xmm4
+	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm7,BYTE_BIT
+	psllw	xmm5,BYTE_BIT
+	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+	sub	rax, byte SIZEOF_XMMWORD
+	add	rcx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	rbx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	rsi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	rdx, byte 2*SIZEOF_XMMWORD	; outptr0
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr1
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	near .columnloop
+	test	rax,rax
+	jnz	near .columnloop_last
+
+	pop	rsi
+	pop	rdi
+	pop	rcx
+	pop	rax
+
+	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	rcx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbx
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+	push	rbp
+	mov	rbp,rsp
+	collect_args
+
+	mov	rdx, r11
+	add	rdx, byte (2*SIZEOF_XMMWORD)-1
+	and	rdx, byte -(2*SIZEOF_XMMWORD)
+	jz	near .return
+
+	mov	rcx, r10	; rowctr
+	test	rcx,rcx
+	jz	short .return
+
+	mov	rsi, r12 ; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rdi
+	push	rsi
+
+	mov	rsi, JSAMPROW [rsi]		; inptr
+	mov	rdi, JSAMPROW [rdi]		; outptr
+	mov	rax,rdx				; colctr
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0
+	punpckhbw xmm1,xmm1
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2
+	punpckhbw xmm3,xmm3
+
+	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr
+	jmp	short .columnloop
+
+.nextrow:
+	pop	rsi
+	pop	rdi
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte SIZEOF_JSAMPROW	; output_data
+	dec	rcx				; rowctr
+	jg	short .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+	push	rbp
+	mov	rbp,rsp
+	push	rbx
+	collect_args
+
+	mov	rdx, r11
+	add	rdx, byte (2*SIZEOF_XMMWORD)-1
+	and	rdx, byte -(2*SIZEOF_XMMWORD)
+	jz	near .return
+
+	mov	rcx, r10	; rowctr
+	test	rcx,rcx
+	jz	near .return
+
+	mov	rsi, r12	; input_data
+	mov	rdi, r13
+	mov	rdi, JSAMPARRAY [rdi]			; output_data
+.rowloop:
+	push	rdi
+	push	rsi
+
+	mov	rsi, JSAMPROW [rsi]			; inptr
+	mov	rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	rax,rdx					; colctr
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0
+	punpckhbw xmm1,xmm1
+
+	movdqa	XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2
+	punpckhbw xmm3,xmm3
+
+	movdqa	XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+	movdqa	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	rax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	rbx, byte 4*SIZEOF_XMMWORD	; outptr0
+	add	rdi, byte 4*SIZEOF_XMMWORD	; outptr1
+	jmp	short .columnloop
+
+.nextrow:
+	pop	rsi
+	pop	rdi
+
+	add	rsi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	rcx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbx
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jdsamss2.asm b/simd/jdsamss2.asm
new file mode 100644
index 0000000..b5c863b
--- /dev/null
+++ b/simd/jdsamss2.asm

@@ -0,0 +1,729 @@
+;
+; jdsamss2.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE		times 8 dw  1
+PW_TWO		times 8 dw  2
+PW_THREE	times 8 dw  3
+PW_SEVEN	times 8 dw  7
+PW_EIGHT	times 8 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_XMMWORD-1
+	jz	short .skip
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)
+	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	xmm6,xmm6
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)
+	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)
+
+.upsample:
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2,xmm1
+	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
+	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
+	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
+
+	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
+	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
+
+	movdqa	xmm7,xmm1
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
+	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
+	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
+
+	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm3,BYTE_BIT
+	psllw	xmm6,BYTE_BIT
+	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+	sub	eax, byte SIZEOF_XMMWORD
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(edx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_XMMWORD-1
+	jz	short .skip
+	push	edx
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
+	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
+	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-2)
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
+	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
+
+	movdqa	XMMWORD [wk(0)], xmm1
+	movdqa	XMMWORD [wk(1)], xmm2
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	xmm1,xmm1
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)
+	movdqa	xmm2,xmm1
+
+	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
+	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
+
+	jmp	near .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
+	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
+	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
+
+	movdqa	XMMWORD [wk(2)], xmm1
+	movdqa	XMMWORD [wk(3)], xmm2
+
+.upsample:
+	; -- process the upper row
+
+	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
+	movdqa	xmm5,xmm7
+	movdqa	xmm6,xmm3
+	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
+
+	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
+	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm2,xmm3
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm4,xmm3
+	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(0)], xmm4
+
+	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm7
+	paddw	xmm5,xmm3
+	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm3
+	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm0,BYTE_BIT
+	psllw	xmm2,BYTE_BIT
+	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+	; -- process the lower row
+
+	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
+	movdqa	xmm0,xmm6
+	movdqa	xmm2,xmm4
+	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
+
+	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
+	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm4
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm3,xmm4
+	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(1)], xmm3
+
+	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm6
+	paddw	xmm0,xmm4
+	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm7,xmm6
+	paddw	xmm5,xmm4
+	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm7,BYTE_BIT
+	psllw	xmm5,BYTE_BIT
+	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_XMMWORD
+	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1
+	and	edx, byte -(2*SIZEOF_XMMWORD)
+	jz	short .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	eax,edx				; colctr
+	alignx	16,7
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0
+	punpckhbw xmm1,xmm1
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2
+	punpckhbw xmm3,xmm3
+
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 4*SIZEOF_XMMWORD	; outptr
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	short .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b)		(b)+8			; int max_v_samp_factor
+%define output_width(b)	(b)+12		; JDIMENSION output_width
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, JDIMENSION [output_width(ebp)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1
+	and	edx, byte -(2*SIZEOF_XMMWORD)
+	jz	near .return
+
+	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	eax,edx					; colctr
+	alignx	16,7
+.columnloop:
+
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0
+	punpckhbw xmm1,xmm1
+
+	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2
+	punpckhbw xmm3,xmm3
+
+	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi
+	pop	edi
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	short .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jf3dnflt.asm b/simd/jf3dnflt.asm
new file mode 100644
index 0000000..542672d
--- /dev/null
+++ b/simd/jf3dnflt.asm

@@ -0,0 +1,320 @@
+;
+; jf3dnflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382	times 2 dd  0.382683432365089771728460
+PD_0_707	times 2 dd  0.707106781186547524400844
+PD_0_541	times 2 dd  0.541196100146196984399723
+PD_1_306	times 2 dd  1.306562964876376527856643
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
+;
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 10)=data0
+	punpckhdq mm4,mm1		; mm4=(01 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(06 16)=data6
+	punpckhdq mm5,mm3		; mm5=(07 17)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(02 12)=data2
+	punpckhdq mm4,mm3		; mm4=(03 13)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(04 14)=data4
+	punpckhdq mm0,mm5		; mm0=(05 15)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 01)=data0
+	punpckhdq mm4,mm1		; mm4=(10 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(60 61)=data6
+	punpckhdq mm5,mm3		; mm5=(70 71)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(20 21)=data2
+	punpckhdq mm4,mm3		; mm4=(30 31)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(40 41)=data4
+	punpckhdq mm0,mm5		; mm0=(50 51)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .columnloop
+
+	femms		; empty MMX/3DNow! state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfmmxfst.asm b/simd/jfmmxfst.asm
new file mode 100644
index 0000000..0647242
--- /dev/null
+++ b/simd/jfmmxfst.asm

@@ -0,0 +1,397 @@
+;
+; jfmmxfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707	times 4 dw  F_0_707 << CONST_SHIFT
+PW_F0382	times 4 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 4 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 4 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfmmxint.asm b/simd/jfmmxint.asm
new file mode 100644
index 0000000..a7e73f7
--- /dev/null
+++ b/simd/jfmmxint.asm

@@ -0,0 +1,622 @@
+;
+; jfmmxint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X	times 4 dw  1 << (PASS1_BITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	psllw	mm5,PASS1_BITS		; mm5=data0
+	psllw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm1,DESCALE_P1
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm2,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm3,DESCALE_P1
+	psrad	mm5,DESCALE_P1
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	paddw	mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	paddw	mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	mm5,PASS1_BITS		; mm5=data0
+	psraw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm6,DESCALE_P2
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm2,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm3,DESCALE_P2
+	psrad	mm5,DESCALE_P2
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfss2fst-64.asm b/simd/jfss2fst-64.asm
new file mode 100644
index 0000000..c99acc2
--- /dev/null
+++ b/simd/jfss2fst-64.asm

@@ -0,0 +1,392 @@
+;
+; jfss2fst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
+PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+;
+
+; r10 = DCTELEM * data
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+
+	; ---- Pass 1: process rows.
+
+	mov	rdx, r10	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	psubw	xmm3,xmm1		; xmm3=tmp13
+	psubw	xmm6,xmm7		; xmm6=tmp12
+	paddw	xmm4,xmm1		; xmm4=tmp10
+	paddw	xmm0,xmm7		; xmm0=tmp11
+
+	paddw	xmm6,xmm3
+	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm6,[rel PW_F0707] ; xmm6=z1
+
+	movdqa	xmm1,xmm4
+	movdqa	xmm7,xmm3
+	psubw	xmm4,xmm0		; xmm4=data4
+	psubw	xmm3,xmm6		; xmm3=data6
+	paddw	xmm1,xmm0		; xmm1=data0
+	paddw	xmm7,xmm6		; xmm7=data2
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
+
+	; -- Odd part
+
+	paddw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm5,xmm0		; xmm5=tmp11
+	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z3
+
+	movdqa	xmm4,xmm2		; xmm4=tmp10
+	psubw	xmm2,xmm0
+	pmulhw	xmm2,[rel PW_F0382] ; xmm2=z5
+	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm2		; xmm4=z2
+	paddw	xmm0,xmm2		; xmm0=z4
+
+	movdqa	xmm3,xmm6
+	psubw	xmm6,xmm5		; xmm6=z13
+	paddw	xmm3,xmm5		; xmm3=z11
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm5,xmm3
+	psubw	xmm6,xmm4		; xmm6=data3
+	psubw	xmm3,xmm0		; xmm3=data7
+	paddw	xmm2,xmm4		; xmm2=data5
+	paddw	xmm5,xmm0		; xmm5=data1
+
+	; ---- Pass 2: process columns.
+
+	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
+	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
+
+	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
+	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
+	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
+	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
+	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm3,xmm1
+	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
+	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
+	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
+	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
+
+	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm7,xmm6
+	movdqa	xmm0,xmm2
+	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
+	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
+	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
+	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm1,xmm5
+	psubw	xmm3,xmm6		; xmm3=tmp13
+	psubw	xmm5,xmm2		; xmm5=tmp12
+	paddw	xmm4,xmm6		; xmm4=tmp10
+	paddw	xmm1,xmm2		; xmm1=tmp11
+
+	paddw	xmm5,xmm3
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[rel PW_F0707] ; xmm5=z1
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm2,xmm3
+	psubw	xmm4,xmm1		; xmm4=data4
+	psubw	xmm3,xmm5		; xmm3=data6
+	paddw	xmm6,xmm1		; xmm6=data0
+	paddw	xmm2,xmm5		; xmm2=data2
+
+	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+	; -- Odd part
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	paddw	xmm7,xmm0		; xmm7=tmp10
+	paddw	xmm0,xmm1		; xmm0=tmp11
+	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
+
+	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm0,[rel PW_F0707] ; xmm0=z3
+
+	movdqa	xmm4,xmm7		; xmm4=tmp10
+	psubw	xmm7,xmm1
+	pmulhw	xmm7,[rel PW_F0382] ; xmm7=z5
+	pmulhw	xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm7		; xmm4=z2
+	paddw	xmm1,xmm7		; xmm1=z4
+
+	movdqa	xmm3,xmm5
+	psubw	xmm5,xmm0		; xmm5=z13
+	paddw	xmm3,xmm0		; xmm3=z11
+
+	movdqa	xmm6,xmm5
+	movdqa	xmm2,xmm3
+	psubw	xmm5,xmm4		; xmm5=data3
+	psubw	xmm3,xmm1		; xmm3=data7
+	paddw	xmm6,xmm4		; xmm6=data5
+	paddw	xmm2,xmm1		; xmm2=data1
+
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfss2fst.asm b/simd/jfss2fst.asm
new file mode 100644
index 0000000..73fc9e5
--- /dev/null
+++ b/simd/jfss2fst.asm

@@ -0,0 +1,404 @@
+;
+; jfss2fst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
+PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	psubw	xmm3,xmm1		; xmm3=tmp13
+	psubw	xmm6,xmm7		; xmm6=tmp12
+	paddw	xmm4,xmm1		; xmm4=tmp10
+	paddw	xmm0,xmm7		; xmm0=tmp11
+
+	paddw	xmm6,xmm3
+	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+	movdqa	xmm1,xmm4
+	movdqa	xmm7,xmm3
+	psubw	xmm4,xmm0		; xmm4=data4
+	psubw	xmm3,xmm6		; xmm3=data6
+	paddw	xmm1,xmm0		; xmm1=data0
+	paddw	xmm7,xmm6		; xmm7=data2
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
+
+	; -- Odd part
+
+	paddw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm5,xmm0		; xmm5=tmp11
+	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+	movdqa	xmm4,xmm2		; xmm4=tmp10
+	psubw	xmm2,xmm0
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm2		; xmm4=z2
+	paddw	xmm0,xmm2		; xmm0=z4
+
+	movdqa	xmm3,xmm6
+	psubw	xmm6,xmm5		; xmm6=z13
+	paddw	xmm3,xmm5		; xmm3=z11
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm5,xmm3
+	psubw	xmm6,xmm4		; xmm6=data3
+	psubw	xmm3,xmm0		; xmm3=data7
+	paddw	xmm2,xmm4		; xmm2=data5
+	paddw	xmm5,xmm0		; xmm5=data1
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
+	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
+
+	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
+	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
+	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
+	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
+	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm3,xmm1
+	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
+	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
+	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
+	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
+
+	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm7,xmm6
+	movdqa	xmm0,xmm2
+	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
+	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
+	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
+	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm1,xmm5
+	psubw	xmm3,xmm6		; xmm3=tmp13
+	psubw	xmm5,xmm2		; xmm5=tmp12
+	paddw	xmm4,xmm6		; xmm4=tmp10
+	paddw	xmm1,xmm2		; xmm1=tmp11
+
+	paddw	xmm5,xmm3
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm2,xmm3
+	psubw	xmm4,xmm1		; xmm4=data4
+	psubw	xmm3,xmm5		; xmm3=data6
+	paddw	xmm6,xmm1		; xmm6=data0
+	paddw	xmm2,xmm5		; xmm2=data2
+
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+	; -- Odd part
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	paddw	xmm7,xmm0		; xmm7=tmp10
+	paddw	xmm0,xmm1		; xmm0=tmp11
+	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
+
+	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+	movdqa	xmm4,xmm7		; xmm4=tmp10
+	psubw	xmm7,xmm1
+	pmulhw	xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm7		; xmm4=z2
+	paddw	xmm1,xmm7		; xmm1=z4
+
+	movdqa	xmm3,xmm5
+	psubw	xmm5,xmm0		; xmm5=z13
+	paddw	xmm3,xmm0		; xmm3=z11
+
+	movdqa	xmm6,xmm5
+	movdqa	xmm2,xmm3
+	psubw	xmm5,xmm4		; xmm5=data3
+	psubw	xmm3,xmm1		; xmm3=data7
+	paddw	xmm6,xmm4		; xmm6=data5
+	paddw	xmm2,xmm1		; xmm2=data1
+
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfss2int-64.asm b/simd/jfss2int-64.asm
new file mode 100644
index 0000000..b8ec4b5
--- /dev/null
+++ b/simd/jfss2int-64.asm

@@ -0,0 +1,622 @@
+;
+; jfss2int.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM * data)
+;
+
+; r10 = DCTELEM * data
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		6
+
+	align	16
+	global	EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+
+	; ---- Pass 1: process rows.
+
+	mov	rdx, r10	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	paddw	xmm3,xmm1		; xmm3=tmp10
+	paddw	xmm6,xmm7		; xmm6=tmp11
+	psubw	xmm4,xmm1		; xmm4=tmp13
+	psubw	xmm0,xmm7		; xmm0=tmp12
+
+	movdqa	xmm1,xmm3
+	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
+	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
+
+	psllw	xmm3,PASS1_BITS		; xmm3=data0
+	psllw	xmm1,PASS1_BITS		; xmm1=data4
+
+	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
+	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm7,xmm4		; xmm4=tmp13
+	movdqa    xmm6,xmm4
+	punpcklwd xmm7,xmm0		; xmm0=tmp12
+	punpckhwd xmm6,xmm0
+	movdqa    xmm4,xmm7
+	movdqa    xmm0,xmm6
+	pmaddwd   xmm7,[rel PW_F130_F054]	; xmm7=data2L
+	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=data2H
+	pmaddwd   xmm4,[rel PW_F054_MF130]	; xmm4=data6L
+	pmaddwd   xmm0,[rel PW_F054_MF130]	; xmm0=data6H
+
+	paddd	xmm7,[rel PD_DESCALE_P1]
+	paddd	xmm6,[rel PD_DESCALE_P1]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+	paddd	xmm4,[rel PD_DESCALE_P1]
+	paddd	xmm0,[rel PD_DESCALE_P1]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm7,xmm6		; xmm7=data2
+	packssdw  xmm4,xmm0		; xmm4=data6
+
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
+
+	; -- Odd part
+
+	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
+
+	movdqa	xmm6,xmm2		; xmm2=tmp4
+	movdqa	xmm0,xmm5		; xmm5=tmp5
+	paddw	xmm6,xmm3		; xmm6=z3
+	paddw	xmm0,xmm1		; xmm0=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm7,xmm6
+	movdqa    xmm4,xmm6
+	punpcklwd xmm7,xmm0
+	punpckhwd xmm4,xmm0
+	movdqa    xmm6,xmm7
+	movdqa    xmm0,xmm4
+	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3L
+	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3H
+	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4L
+	pmaddwd   xmm0,[rel PW_F117_F078]	; xmm0=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm7,xmm2
+	movdqa    xmm4,xmm2
+	punpcklwd xmm7,xmm1
+	punpckhwd xmm4,xmm1
+	movdqa    xmm2,xmm7
+	movdqa    xmm1,xmm4
+	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp4L
+	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4H
+	pmaddwd   xmm2,[rel PW_MF089_F060]	; xmm2=tmp7L
+	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp7H
+
+	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
+	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
+	paddd	xmm2,xmm6		; xmm2=data1L
+	paddd	xmm1,xmm0		; xmm1=data1H
+
+	paddd	xmm7,[rel PD_DESCALE_P1]
+	paddd	xmm4,[rel PD_DESCALE_P1]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm2,[rel PD_DESCALE_P1]
+	paddd	xmm1,[rel PD_DESCALE_P1]
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+
+	packssdw  xmm7,xmm4		; xmm7=data7
+	packssdw  xmm2,xmm1		; xmm2=data1
+
+	movdqa    xmm4,xmm5
+	movdqa    xmm1,xmm5
+	punpcklwd xmm4,xmm3
+	punpckhwd xmm1,xmm3
+	movdqa    xmm5,xmm4
+	movdqa    xmm3,xmm1
+	pmaddwd   xmm4,[rel PW_MF050_MF256]	; xmm4=tmp5L
+	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5H
+	pmaddwd   xmm5,[rel PW_MF256_F050]	; xmm5=tmp6L
+	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6H
+
+	paddd	xmm4,xmm6		; xmm4=data5L
+	paddd	xmm1,xmm0		; xmm1=data5H
+	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
+	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
+
+	paddd	xmm4,[rel PD_DESCALE_P1]
+	paddd	xmm1,[rel PD_DESCALE_P1]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+	paddd	xmm5,[rel PD_DESCALE_P1]
+	paddd	xmm3,[rel PD_DESCALE_P1]
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+
+	packssdw  xmm4,xmm1		; xmm4=data5
+	packssdw  xmm5,xmm3		; xmm5=data3
+
+	; ---- Pass 2: process columns.
+
+	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
+	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
+
+	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
+	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
+	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
+
+	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
+	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
+	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm7,xmm6
+	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
+	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
+	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
+
+	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
+	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
+	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
+	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm6,xmm2
+	paddw	xmm7,xmm5		; xmm7=tmp10
+	paddw	xmm2,xmm4		; xmm2=tmp11
+	psubw	xmm1,xmm5		; xmm1=tmp13
+	psubw	xmm6,xmm4		; xmm6=tmp12
+
+	movdqa	xmm5,xmm7
+	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
+	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
+
+	paddw	xmm7,[rel PW_DESCALE_P2X]
+	paddw	xmm5,[rel PW_DESCALE_P2X]
+	psraw	xmm7,PASS1_BITS		; xmm7=data0
+	psraw	xmm5,PASS1_BITS		; xmm5=data4
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+	movdqa	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm4,xmm1		; xmm1=tmp13
+	movdqa    xmm2,xmm1
+	punpcklwd xmm4,xmm6		; xmm6=tmp12
+	punpckhwd xmm2,xmm6
+	movdqa    xmm1,xmm4
+	movdqa    xmm6,xmm2
+	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=data2L
+	pmaddwd   xmm2,[rel PW_F130_F054]	; xmm2=data2H
+	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=data6L
+	pmaddwd   xmm6,[rel PW_F054_MF130]	; xmm6=data6H
+
+	paddd	xmm4,[rel PD_DESCALE_P2]
+	paddd	xmm2,[rel PD_DESCALE_P2]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm1,[rel PD_DESCALE_P2]
+	paddd	xmm6,[rel PD_DESCALE_P2]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm6,DESCALE_P2
+
+	packssdw  xmm4,xmm2		; xmm4=data2
+	packssdw  xmm1,xmm6		; xmm1=data6
+
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+	; -- Odd part
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	movdqa	xmm2,xmm0		; xmm0=tmp4
+	movdqa	xmm6,xmm3		; xmm3=tmp5
+	paddw	xmm2,xmm7		; xmm2=z3
+	paddw	xmm6,xmm5		; xmm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm1,xmm2
+	punpcklwd xmm4,xmm6
+	punpckhwd xmm1,xmm6
+	movdqa    xmm2,xmm4
+	movdqa    xmm6,xmm1
+	pmaddwd   xmm4,[rel PW_MF078_F117]	; xmm4=z3L
+	pmaddwd   xmm1,[rel PW_MF078_F117]	; xmm1=z3H
+	pmaddwd   xmm2,[rel PW_F117_F078]	; xmm2=z4L
+	pmaddwd   xmm6,[rel PW_F117_F078]	; xmm6=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm1,xmm0
+	punpcklwd xmm4,xmm5
+	punpckhwd xmm1,xmm5
+	movdqa    xmm0,xmm4
+	movdqa    xmm5,xmm1
+	pmaddwd   xmm4,[rel PW_MF060_MF089]	; xmm4=tmp4L
+	pmaddwd   xmm1,[rel PW_MF060_MF089]	; xmm1=tmp4H
+	pmaddwd   xmm0,[rel PW_MF089_F060]	; xmm0=tmp7L
+	pmaddwd   xmm5,[rel PW_MF089_F060]	; xmm5=tmp7H
+
+	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
+	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
+	paddd	xmm0,xmm2		; xmm0=data1L
+	paddd	xmm5,xmm6		; xmm5=data1H
+
+	paddd	xmm4,[rel PD_DESCALE_P2]
+	paddd	xmm1,[rel PD_DESCALE_P2]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm0,[rel PD_DESCALE_P2]
+	paddd	xmm5,[rel PD_DESCALE_P2]
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+
+	packssdw  xmm4,xmm1		; xmm4=data7
+	packssdw  xmm0,xmm5		; xmm0=data1
+
+	movdqa	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+	movdqa    xmm1,xmm3
+	movdqa    xmm5,xmm3
+	punpcklwd xmm1,xmm7
+	punpckhwd xmm5,xmm7
+	movdqa    xmm3,xmm1
+	movdqa    xmm7,xmm5
+	pmaddwd   xmm1,[rel PW_MF050_MF256]	; xmm1=tmp5L
+	pmaddwd   xmm5,[rel PW_MF050_MF256]	; xmm5=tmp5H
+	pmaddwd   xmm3,[rel PW_MF256_F050]	; xmm3=tmp6L
+	pmaddwd   xmm7,[rel PW_MF256_F050]	; xmm7=tmp6H
+
+	paddd	xmm1,xmm2		; xmm1=data5L
+	paddd	xmm5,xmm6		; xmm5=data5H
+	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
+	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
+
+	paddd	xmm1,[rel PD_DESCALE_P2]
+	paddd	xmm5,[rel PD_DESCALE_P2]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+	paddd	xmm3,[rel PD_DESCALE_P2]
+	paddd	xmm7,[rel PD_DESCALE_P2]
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm1,xmm5		; xmm1=data5
+	packssdw  xmm3,xmm7		; xmm3=data3
+
+	movdqa	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfss2int.asm b/simd/jfss2int.asm
new file mode 100644
index 0000000..5e3f2aa
--- /dev/null
+++ b/simd/jfss2int.asm

@@ -0,0 +1,634 @@
+;
+; jfss2int.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		6
+
+	align	16
+	global	EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+	; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	paddw	xmm3,xmm1		; xmm3=tmp10
+	paddw	xmm6,xmm7		; xmm6=tmp11
+	psubw	xmm4,xmm1		; xmm4=tmp13
+	psubw	xmm0,xmm7		; xmm0=tmp12
+
+	movdqa	xmm1,xmm3
+	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
+	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
+
+	psllw	xmm3,PASS1_BITS		; xmm3=data0
+	psllw	xmm1,PASS1_BITS		; xmm1=data4
+
+	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
+	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm7,xmm4		; xmm4=tmp13
+	movdqa    xmm6,xmm4
+	punpcklwd xmm7,xmm0		; xmm0=tmp12
+	punpckhwd xmm6,xmm0
+	movdqa    xmm4,xmm7
+	movdqa    xmm0,xmm6
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]	; xmm7=data2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=data2H
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]	; xmm4=data6L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]	; xmm0=data6H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm7,xmm6		; xmm7=data2
+	packssdw  xmm4,xmm0		; xmm4=data6
+
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
+
+	; -- Odd part
+
+	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
+
+	movdqa	xmm6,xmm2		; xmm2=tmp4
+	movdqa	xmm0,xmm5		; xmm5=tmp5
+	paddw	xmm6,xmm3		; xmm6=z3
+	paddw	xmm0,xmm1		; xmm0=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm7,xmm6
+	movdqa    xmm4,xmm6
+	punpcklwd xmm7,xmm0
+	punpckhwd xmm4,xmm0
+	movdqa    xmm6,xmm7
+	movdqa    xmm0,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3H
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]	; xmm0=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm7,xmm2
+	movdqa    xmm4,xmm2
+	punpcklwd xmm7,xmm1
+	punpckhwd xmm4,xmm1
+	movdqa    xmm2,xmm7
+	movdqa    xmm1,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]	; xmm2=tmp7L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp7H
+
+	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
+	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
+	paddd	xmm2,xmm6		; xmm2=data1L
+	paddd	xmm1,xmm0		; xmm1=data1H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+
+	packssdw  xmm7,xmm4		; xmm7=data7
+	packssdw  xmm2,xmm1		; xmm2=data1
+
+	movdqa    xmm4,xmm5
+	movdqa    xmm1,xmm5
+	punpcklwd xmm4,xmm3
+	punpckhwd xmm1,xmm3
+	movdqa    xmm5,xmm4
+	movdqa    xmm3,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm4=tmp5L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]	; xmm5=tmp6L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6H
+
+	paddd	xmm4,xmm6		; xmm4=data5L
+	paddd	xmm1,xmm0		; xmm1=data5H
+	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
+	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+
+	packssdw  xmm4,xmm1		; xmm4=data5
+	packssdw  xmm5,xmm3		; xmm5=data3
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
+	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
+
+	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
+	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
+	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
+
+	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
+	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
+	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm7,xmm6
+	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
+	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
+	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
+
+	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
+	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
+	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
+	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm6,xmm2
+	paddw	xmm7,xmm5		; xmm7=tmp10
+	paddw	xmm2,xmm4		; xmm2=tmp11
+	psubw	xmm1,xmm5		; xmm1=tmp13
+	psubw	xmm6,xmm4		; xmm6=tmp12
+
+	movdqa	xmm5,xmm7
+	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
+	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
+
+	paddw	xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	paddw	xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	xmm7,PASS1_BITS		; xmm7=data0
+	psraw	xmm5,PASS1_BITS		; xmm5=data4
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm4,xmm1		; xmm1=tmp13
+	movdqa    xmm2,xmm1
+	punpcklwd xmm4,xmm6		; xmm6=tmp12
+	punpckhwd xmm2,xmm6
+	movdqa    xmm1,xmm4
+	movdqa    xmm6,xmm2
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=data2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]	; xmm2=data2H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=data6L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]	; xmm6=data6H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm6,DESCALE_P2
+
+	packssdw  xmm4,xmm2		; xmm4=data2
+	packssdw  xmm1,xmm6		; xmm1=data6
+
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+	; -- Odd part
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	movdqa	xmm2,xmm0		; xmm0=tmp4
+	movdqa	xmm6,xmm3		; xmm3=tmp5
+	paddw	xmm2,xmm7		; xmm2=z3
+	paddw	xmm6,xmm5		; xmm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm1,xmm2
+	punpcklwd xmm4,xmm6
+	punpckhwd xmm1,xmm6
+	movdqa    xmm2,xmm4
+	movdqa    xmm6,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]	; xmm1=z3H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]	; xmm2=z4L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm1,xmm0
+	punpcklwd xmm4,xmm5
+	punpckhwd xmm1,xmm5
+	movdqa    xmm0,xmm4
+	movdqa    xmm5,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm1=tmp4H
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]	; xmm0=tmp7L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]	; xmm5=tmp7H
+
+	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
+	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
+	paddd	xmm0,xmm2		; xmm0=data1L
+	paddd	xmm5,xmm6		; xmm5=data1H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+
+	packssdw  xmm4,xmm1		; xmm4=data7
+	packssdw  xmm0,xmm5		; xmm0=data1
+
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+	movdqa    xmm1,xmm3
+	movdqa    xmm5,xmm3
+	punpcklwd xmm1,xmm7
+	punpckhwd xmm5,xmm7
+	movdqa    xmm3,xmm1
+	movdqa    xmm7,xmm5
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm5=tmp5H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]	; xmm7=tmp6H
+
+	paddd	xmm1,xmm2		; xmm1=data5L
+	paddd	xmm5,xmm6		; xmm5=data5H
+	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
+	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
+
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm1,xmm5		; xmm1=data5
+	packssdw  xmm3,xmm7		; xmm3=data3
+
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfsseflt-64.asm b/simd/jfsseflt-64.asm
new file mode 100644
index 0000000..0f3e21a
--- /dev/null
+++ b/simd/jfsseflt-64.asm

@@ -0,0 +1,358 @@
+;
+; jfsseflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382	times 4 dd  0.382683432365089771728460
+PD_0_707	times 4 dd  0.707106781186547524400844
+PD_0_541	times 4 dd  0.541196100146196984399723
+PD_1_306	times 4 dd  1.306562964876376527856643
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT * data)
+;
+
+; r10 = FAST_FLOAT * data
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+
+	; ---- Pass 1: process rows.
+
+	mov	rdx, r10	; (FAST_FLOAT *)
+	mov	rcx, DCTSIZE/4
+.rowloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
+	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
+	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
+	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
+	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	rcx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	rdx, r10	; (FAST_FLOAT *)
+	mov	rcx, DCTSIZE/4
+.columnloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
+	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
+	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
+	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
+	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[rel PD_0_707] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[rel PD_0_707] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[rel PD_0_382] ; xmm2=z5
+	mulps	xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	rdx, byte 4*SIZEOF_FAST_FLOAT
+	dec	rcx
+	jnz	near .columnloop
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jfsseflt.asm b/simd/jfsseflt.asm
new file mode 100644
index 0000000..bc54ccc
--- /dev/null
+++ b/simd/jfsseflt.asm

@@ -0,0 +1,370 @@
+;
+; jfsseflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382	times 4 dd  0.382683432365089771728460
+PD_0_707	times 4 dd  0.707106781186547524400844
+PD_0_541	times 4 dd  0.541196100146196984399723
+PD_1_306	times 4 dd  1.306562964876376527856643
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT * data)
+;
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.rowloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
+	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
+	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
+	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.columnloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
+	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
+	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
+	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
+	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	edx, byte 4*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .columnloop
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/ji3dnflt.asm b/simd/ji3dnflt.asm
new file mode 100644
index 0000000..dc2076f
--- /dev/null
+++ b/simd/ji3dnflt.asm

@@ -0,0 +1,452 @@
+;
+; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414	times 2 dd  1.414213562373095048801689
+PD_1_847	times 2 dd  1.847759065022573512256366
+PD_1_082	times 2 dd  1.082392200292393968799446
+PD_2_613	times 2 dd  2.613125929752753055713286
+PD_RNDINT_MAGIC	times 2 dd  100663296.0	; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
+;                         JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	pushpic	ebx		; save GOT address
+	mov	ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	mov	eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	or	eax,ebx
+	poppic	ebx		; restore GOT address
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm0,mm0
+	psrad     mm0,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm0,mm0
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0
+	punpckhdq mm1,mm1
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm0,mm0
+	punpcklwd mm1,mm1
+	psrad     mm0,(DWORD_BIT-WORD_BIT)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm0,mm0
+	pi2fd     mm1,mm1
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	punpcklwd mm5,mm5
+	punpcklwd mm1,mm1
+	psrad     mm5,(DWORD_BIT-WORD_BIT)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm5,mm5
+	pi2fd     mm1,mm1
+
+	pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 01)
+	pfadd	mm7,mm3			; mm7=data1=(10 11)
+	pfsub	mm5,mm1			; mm5=data7=(70 71)
+	pfsub	mm0,mm3			; mm0=data6=(60 61)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq      mm1,mm6		; transpose coefficients
+	punpckldq mm6,mm7		; mm6=(00 10)
+	punpckhdq mm1,mm7		; mm1=(01 11)
+	movq      mm3,mm0		; transpose coefficients
+	punpckldq mm0,mm5		; mm0=(60 70)
+	punpckhdq mm3,mm5		; mm3=(61 71)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm5, MMWORD [wk(1)]	; mm5=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm6,mm7
+	movq	mm1,mm5
+	pfadd	mm7,mm2			; mm7=data2=(20 21)
+	pfadd	mm5,mm4			; mm5=data4=(40 41)
+	pfsub	mm6,mm2			; mm6=data5=(50 51)
+	pfsub	mm1,mm4			; mm1=data3=(30 31)
+
+	movq      mm0,mm7		; transpose coefficients
+	punpckldq mm7,mm1		; mm7=(20 30)
+	punpckhdq mm0,mm1		; mm0=(21 31)
+	movq      mm3,mm5		; transpose coefficients
+	punpckldq mm5,mm6		; mm5=(40 50)
+	punpckhdq mm3,mm6		; mm3=(41 51)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+	add	esi, byte 2*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 2*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 10)
+	pfadd	mm7,mm3			; mm7=data1=(01 11)
+	pfsub	mm5,mm1			; mm5=data7=(07 17)
+	pfsub	mm0,mm3			; mm0=data6=(06 16)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq	mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm3,mm3
+	psrld	mm3,WORD_BIT		; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm6,mm1			; mm6=roundint(data0/8)=(00 ** 10 **)
+	pfadd	mm7,mm1			; mm7=roundint(data1/8)=(01 ** 11 **)
+	pfadd	mm0,mm1			; mm0=roundint(data6/8)=(06 ** 16 **)
+	pfadd	mm5,mm1			; mm5=roundint(data7/8)=(07 ** 17 **)
+
+	pand	mm6,mm3			; mm6=(00 -- 10 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 01 -- 11)
+	pand	mm0,mm3			; mm0=(06 -- 16 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 07 -- 17)
+	por	mm6,mm7			; mm6=(00 01 10 11)
+	por	mm0,mm5			; mm0=(06 07 16 17)
+
+	movq	mm1, MMWORD [wk(0)]	; mm1=tmp2
+	movq	mm3, MMWORD [wk(1)]	; mm3=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm7,mm1
+	movq	mm5,mm3
+	pfadd	mm1,mm2			; mm1=data2=(02 12)
+	pfadd	mm3,mm4			; mm3=data4=(04 14)
+	pfsub	mm7,mm2			; mm7=data5=(05 15)
+	pfsub	mm5,mm4			; mm5=data3=(03 13)
+
+	movq	mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm4,mm4
+	psrld	mm4,WORD_BIT		; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm3,mm2			; mm3=roundint(data4/8)=(04 ** 14 **)
+	pfadd	mm7,mm2			; mm7=roundint(data5/8)=(05 ** 15 **)
+	pfadd	mm1,mm2			; mm1=roundint(data2/8)=(02 ** 12 **)
+	pfadd	mm5,mm2			; mm5=roundint(data3/8)=(03 ** 13 **)
+
+	pand	mm3,mm4			; mm3=(04 -- 14 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 05 -- 15)
+	pand	mm1,mm4			; mm1=(02 -- 12 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 03 -- 13)
+	por	mm3,mm7			; mm3=(04 05 14 15)
+	por	mm1,mm5			; mm1=(02 03 12 13)
+
+	movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm2=[PB_CENTERJSAMP]
+
+	packsswb  mm6,mm3		; mm6=(00 01 10 11 04 05 14 15)
+	packsswb  mm1,mm0		; mm1=(02 03 12 13 06 07 16 17)
+	paddb     mm6,mm2
+	paddb     mm1,mm2
+
+	movq      mm4,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm1		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm4,mm1		; mm4=(04 05 06 07 14 15 16 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm4		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm4		; mm7=(10 11 12 13 14 15 16 17)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 2*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 2*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jimmxfst.asm b/simd/jimmxfst.asm
new file mode 100644
index 0000000..3b05572
--- /dev/null
+++ b/simd/jimmxfst.asm

@@ -0,0 +1,500 @@
+;
+; jimmxfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414	times 4 dw  F_1_414 << CONST_SHIFT
+PW_F1847	times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082	times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 01 02 03)
+	paddw	mm7,mm0			; mm7=data1=(10 11 12 13)
+	psubw	mm1,mm3			; mm1=data7=(70 71 72 73)
+	psubw	mm5,mm0			; mm5=data6=(60 61 62 63)
+	psubw	mm4,mm0			; mm4=tmp5
+
+	movq      mm3,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm3,mm7		; mm3=(02 12 03 13)
+	movq      mm0,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm1		; mm5=(60 70 61 71)
+	punpckhwd mm0,mm1		; mm0=(62 72 63 73)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm1, MMWORD [wk(1)]	; mm1=tmp3
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(60 70 61 71)
+	movq	MMWORD [wk(1)], mm0	; wk(1)=(62 72 63 73)
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm7
+	movq	mm0,mm1
+	paddw	mm7,mm4			; mm7=data2=(20 21 22 23)
+	paddw	mm1,mm2			; mm1=data4=(40 41 42 43)
+	psubw	mm5,mm4			; mm5=data5=(50 51 52 53)
+	psubw	mm0,mm2			; mm0=data3=(30 31 32 33)
+
+	movq      mm4,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm0		; mm7=(20 30 21 31)
+	punpckhwd mm4,mm0		; mm4=(22 32 23 33)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm5		; mm1=(40 50 41 51)
+	punpckhwd mm2,mm5		; mm2=(42 52 43 53)
+
+	movq      mm0,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(00 10 20 30)
+	punpckhdq mm0,mm7		; mm0=(01 11 21 31)
+	movq      mm5,mm3		; transpose coefficients(phase 2)
+	punpckldq mm3,mm4		; mm3=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(60 70 61 71)
+	movq	mm4, MMWORD [wk(1)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm7		; mm1=(40 50 60 70)
+	punpckhdq mm6,mm7		; mm6=(41 51 61 71)
+	movq      mm0,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(42 52 62 72)
+	punpckhdq mm0,mm4		; mm0=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_IFAST_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 10 20 30)
+	paddw	mm7,mm0			; mm7=data1=(01 11 21 31)
+	psraw	mm6,(PASS1_BITS+3)	; descale
+	psraw	mm7,(PASS1_BITS+3)	; descale
+	psubw	mm1,mm3			; mm1=data7=(07 17 27 37)
+	psubw	mm5,mm0			; mm5=data6=(06 16 26 36)
+	psraw	mm1,(PASS1_BITS+3)	; descale
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psubw	mm4,mm0			; mm4=tmp5
+
+	packsswb  mm6,mm5		; mm6=(00 10 20 30 06 16 26 36)
+	packsswb  mm7,mm1		; mm7=(01 11 21 31 07 17 27 37)
+
+	movq	mm3, MMWORD [wk(0)]	; mm3=tmp2
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp3
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm3
+	movq	mm1,mm0
+	paddw	mm3,mm4			; mm3=data2=(02 12 22 32)
+	paddw	mm0,mm2			; mm0=data4=(04 14 24 34)
+	psraw	mm3,(PASS1_BITS+3)	; descale
+	psraw	mm0,(PASS1_BITS+3)	; descale
+	psubw	mm5,mm4			; mm5=data5=(05 15 25 35)
+	psubw	mm1,mm2			; mm1=data3=(03 13 23 33)
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psraw	mm1,(PASS1_BITS+3)	; descale
+
+	movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm4=[PB_CENTERJSAMP]
+
+	packsswb  mm3,mm0		; mm3=(02 12 22 32 04 14 24 34)
+	packsswb  mm1,mm5		; mm1=(03 13 23 33 05 15 25 35)
+
+	paddb     mm6,mm4
+	paddb     mm7,mm4
+	paddb     mm3,mm4
+	paddb     mm1,mm4
+
+	movq      mm2,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm7		; mm6=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm7		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm0,mm3		; transpose coefficients(phase 1)
+	punpcklbw mm3,mm1		; mm3=(02 03 12 13 22 23 32 33)
+	punpckhbw mm0,mm1		; mm0=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm3		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm3		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm4,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(04 05 06 07 14 15 16 17)
+	punpckhwd mm4,mm2		; mm4=(24 25 26 27 34 35 36 37)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13 14 15 16 17)
+	movq      mm1,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm4		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm1,mm4		; mm1=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jimmxint.asm b/simd/jimmxint.asm
new file mode 100644
index 0000000..7b52fae
--- /dev/null
+++ b/simd/jimmxint.asm

@@ -0,0 +1,852 @@
+;
+; jimmxint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		12
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	mm0,PASS1_BITS
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm3=[PD_DESCALE_P1]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm5,mm7		; mm5=data0=(00 01 02 03)
+	packssdw  mm2,mm0		; mm2=data7=(70 71 72 73)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm1=[PD_DESCALE_P1]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P1
+	psrad	mm3,DESCALE_P1
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm4,mm3		; mm4=data1=(10 11 12 13)
+	packssdw  mm7,mm0		; mm7=data6=(60 61 62 63)
+
+	movq      mm6,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm4		; mm5=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm1,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm2		; mm7=(60 70 61 71)
+	punpckhwd mm1,mm2		; mm1=(62 72 63 73)
+
+	movq	mm3, MMWORD [wk(6)]	; mm3=tmp12L
+	movq	mm0, MMWORD [wk(7)]	; mm0=tmp12H
+	movq	mm4, MMWORD [wk(10)]	; mm4=tmp1L
+	movq	mm2, MMWORD [wk(11)]	; mm2=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 01 11)
+	movq	MMWORD [wk(1)], mm6	; wk(1)=(02 12 03 13)
+	movq	MMWORD [wk(4)], mm7	; wk(4)=(60 70 61 71)
+	movq	MMWORD [wk(5)], mm1	; wk(5)=(62 72 63 73)
+
+	movq	mm5,mm3
+	movq	mm6,mm0
+	paddd	mm3,mm4			; mm3=data2L
+	paddd	mm0,mm2			; mm0=data2H
+	psubd	mm5,mm4			; mm5=data5L
+	psubd	mm6,mm2			; mm6=data5H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm7=[PD_DESCALE_P1]
+
+	paddd	mm3,mm7
+	paddd	mm0,mm7
+	psrad	mm3,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm5,mm7
+	paddd	mm6,mm7
+	psrad	mm5,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm3,mm0		; mm3=data2=(20 21 22 23)
+	packssdw  mm5,mm6		; mm5=data5=(50 51 52 53)
+
+	movq	mm1, MMWORD [wk(2)]	; mm1=tmp13L
+	movq	mm4, MMWORD [wk(3)]	; mm4=tmp13H
+	movq	mm2, MMWORD [wk(8)]	; mm2=tmp0L
+	movq	mm7, MMWORD [wk(9)]	; mm7=tmp0H
+
+	movq	mm0,mm1
+	movq	mm6,mm4
+	paddd	mm1,mm2			; mm1=data3L
+	paddd	mm4,mm7			; mm4=data3H
+	psubd	mm0,mm2			; mm0=data4L
+	psubd	mm6,mm7			; mm6=data4H
+
+	movq	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm2=[PD_DESCALE_P1]
+
+	paddd	mm1,mm2
+	paddd	mm4,mm2
+	psrad	mm1,DESCALE_P1
+	psrad	mm4,DESCALE_P1
+	paddd	mm0,mm2
+	paddd	mm6,mm2
+	psrad	mm0,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm1,mm4		; mm1=data3=(30 31 32 33)
+	packssdw  mm0,mm6		; mm0=data4=(40 41 42 43)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(00 10 01 11)
+	movq	mm2, MMWORD [wk(1)]	; mm2=(02 12 03 13)
+
+	movq      mm4,mm3		; transpose coefficients(phase 1)
+	punpcklwd mm3,mm1		; mm3=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm6,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm5		; mm0=(40 50 41 51)
+	punpckhwd mm6,mm5		; mm6=(42 52 43 53)
+
+	movq      mm1,mm7		; transpose coefficients(phase 2)
+	punpckldq mm7,mm3		; mm7=(00 10 20 30)
+	punpckhdq mm1,mm3		; mm1=(01 11 21 31)
+	movq      mm5,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm3, MMWORD [wk(4)]	; mm3=(60 70 61 71)
+	movq	mm4, MMWORD [wk(5)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpckldq mm0,mm3		; mm0=(40 50 60 70)
+	punpckhdq mm7,mm3		; mm7=(41 51 61 71)
+	movq      mm1,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm4		; mm6=(42 52 62 72)
+	punpckhdq mm1,mm4		; mm1=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5
+	movq      mm0,mm5
+	punpcklwd mm2,mm7
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3
+	movq      mm0,mm3
+	punpcklwd mm2,mm4
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm3=[PD_DESCALE_P2]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm5,mm7		; mm5=data0=(00 10 20 30)
+	packssdw  mm2,mm0		; mm2=data7=(07 17 27 37)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm1=[PD_DESCALE_P2]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm4,mm3		; mm4=data1=(01 11 21 31)
+	packssdw  mm7,mm0		; mm7=data6=(06 16 26 36)
+
+	packsswb  mm5,mm7		; mm5=(00 10 20 30 06 16 26 36)
+	packsswb  mm4,mm2		; mm4=(01 11 21 31 07 17 27 37)
+
+	movq	mm6, MMWORD [wk(6)]	; mm6=tmp12L
+	movq	mm1, MMWORD [wk(7)]	; mm1=tmp12H
+	movq	mm3, MMWORD [wk(10)]	; mm3=tmp1L
+	movq	mm0, MMWORD [wk(11)]	; mm0=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 20 30 06 16 26 36)
+	movq	MMWORD [wk(1)], mm4	; wk(1)=(01 11 21 31 07 17 27 37)
+
+	movq	mm7,mm6
+	movq	mm2,mm1
+	paddd	mm6,mm3			; mm6=data2L
+	paddd	mm1,mm0			; mm1=data2H
+	psubd	mm7,mm3			; mm7=data5L
+	psubd	mm2,mm0			; mm2=data5H
+
+	movq	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm5=[PD_DESCALE_P2]
+
+	paddd	mm6,mm5
+	paddd	mm1,mm5
+	psrad	mm6,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm7,mm5
+	paddd	mm2,mm5
+	psrad	mm7,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	packssdw  mm6,mm1		; mm6=data2=(02 12 22 32)
+	packssdw  mm7,mm2		; mm7=data5=(05 15 25 35)
+
+	movq	mm4, MMWORD [wk(2)]	; mm4=tmp13L
+	movq	mm3, MMWORD [wk(3)]	; mm3=tmp13H
+	movq	mm0, MMWORD [wk(8)]	; mm0=tmp0L
+	movq	mm5, MMWORD [wk(9)]	; mm5=tmp0H
+
+	movq	mm1,mm4
+	movq	mm2,mm3
+	paddd	mm4,mm0			; mm4=data3L
+	paddd	mm3,mm5			; mm3=data3H
+	psubd	mm1,mm0			; mm1=data4L
+	psubd	mm2,mm5			; mm2=data4H
+
+	movq	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm0=[PD_DESCALE_P2]
+
+	paddd	mm4,mm0
+	paddd	mm3,mm0
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm1,mm0
+	paddd	mm2,mm0
+	psrad	mm1,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm5=[PB_CENTERJSAMP]
+
+	packssdw  mm4,mm3		; mm4=data3=(03 13 23 33)
+	packssdw  mm1,mm2		; mm1=data4=(04 14 24 34)
+
+	movq      mm0, MMWORD [wk(0)]	; mm0=(00 10 20 30 06 16 26 36)
+	movq      mm3, MMWORD [wk(1)]	; mm3=(01 11 21 31 07 17 27 37)
+
+	packsswb  mm6,mm1		; mm6=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm7		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm5
+	paddb     mm3,mm5
+	paddb     mm6,mm5
+	paddb     mm4,mm5
+
+	movq      mm2,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm3		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm3		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm1,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm4		; mm6=(02 03 12 13 22 23 32 33)
+	punpckhbw mm1,mm4		; mm1=(04 05 14 15 24 25 34 35)
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm6		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm7,mm6		; mm7=(20 21 22 23 30 31 32 33)
+	movq      mm5,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm2		; mm1=(04 05 06 07 14 15 16 17)
+	punpckhwd mm5,mm2		; mm5=(24 25 26 27 34 35 36 37)
+
+	movq      mm3,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm1		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm3,mm1		; mm3=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm7		; transpose coefficients(phase 3)
+	punpckldq mm7,mm5		; mm7=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm5		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jimmxred.asm b/simd/jimmxred.asm
new file mode 100644
index 0000000..a2b7103
--- /dev/null
+++ b/simd/jimmxred.asm

@@ -0,0 +1,706 @@
+;
+; jimmxred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076	times 2 dw  F_1_847,-F_0_765
+PW_F256_F089	times 2 dw  F_2_562, F_0_899
+PW_F106_MF217	times 2 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 2 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 2 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 2 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm0,mm1
+	packsswb mm0,mm0
+	movd	eax,mm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	mm0,PASS1_BITS
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
+
+	paddd	mm1,mm6
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P1_4
+	psrad	mm2,DESCALE_P1_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
+	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P1_4
+	psrad	mm0,DESCALE_P1_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
+	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
+
+	movq      mm6,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm7,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
+	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm2		; mm1=(00 10 20 30)
+	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
+	movq      mm3,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(02 12 22 32)
+	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
+
+	paddd	mm1,mm6
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P2_4
+	psrad	mm2,DESCALE_P2_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
+	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P2_4
+	psrad	mm0,DESCALE_P2_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
+	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
+	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
+	paddb     mm1,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm1		; transpose coefficients(phase 1)
+	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
+	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+	psrlq	mm1,4*BYTE_BIT
+	psrlq	mm0,4*BYTE_BIT
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [dct_table(ebp)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+	pcmpeqd   mm7,mm7
+	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+	movq      mm4,mm0		; mm4=(10 11 ** 13)
+	movq      mm5,mm2		; mm5=(50 51 ** 53)
+	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
+	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
+	pand	mm1,mm7			; mm1=(-- 31 -- 33)
+	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
+	pand	mm3,mm7			; mm3=(-- 71 -- 73)
+	por	mm0,mm1			; mm0=(11 31 13 33)
+	por	mm2,mm3			; mm2=(51 71 53 73)
+	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
+
+	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
+	pand	mm1,mm7			; mm1=(-- 35 -- 37)
+	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
+	pand	mm5,mm7			; mm5=(-- 75 -- 77)
+	por	mm6,mm1			; mm6=(15 35 17 37)
+	por	mm3,mm5			; mm3=(55 75 57 77)
+	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
+	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
+
+	; -- Even part
+
+	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+	movq	mm2,mm1				; mm2=(00 01 ** 03)
+	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
+	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
+
+	pand	mm2,mm7				; mm2=(-- 01 -- 03)
+	pand	mm5,mm7				; mm5=(-- 05 -- 07)
+	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
+	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
+
+	; -- Final output stage
+
+	movq      mm3,mm1
+	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
+	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
+	punpckldq mm1,mm3		; mm1=(A0 B0)
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
+
+	movq	mm4,mm2
+	movq	mm3,mm5
+	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
+	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
+	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
+	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
+
+	paddd	mm1,mm7
+	psrad	mm1,DESCALE_P1_2
+
+	paddd	mm2,mm7
+	paddd	mm5,mm7
+	psrad	mm2,DESCALE_P1_2
+	psrad	mm5,DESCALE_P1_2
+	paddd	mm4,mm7
+	paddd	mm3,mm7
+	psrad	mm4,DESCALE_P1_2
+	psrad	mm3,DESCALE_P1_2
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
+	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
+
+	; -- Even part
+
+	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
+
+	; -- Final output stage
+
+	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
+
+	movq      mm6,mm1
+	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
+	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
+
+	paddd     mm1,mm0
+	paddd     mm6,mm0
+	psrad     mm1,DESCALE_P2_2
+	psrad     mm6,DESCALE_P2_2
+
+	movq      mm7,mm1		; transpose coefficients
+	punpckldq mm1,mm6		; mm1=(C0 D0)
+	punpckhdq mm7,mm6		; mm7=(C1 D1)
+
+	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
+	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	movd	ecx,mm1
+	movd	ebx,mm1			; ebx=(C0 D0 C1 D1)
+	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2flt-64.asm b/simd/jiss2flt-64.asm
new file mode 100644
index 0000000..0e8522d
--- /dev/null
+++ b/simd/jiss2flt-64.asm

@@ -0,0 +1,483 @@
+;
+; jiss2flt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414	times 4 dd  1.414213562373095048801689
+PD_1_847	times 4 dd  1.847759065022573512256366
+PD_1_082	times 4 dd  1.082392200292393968799446
+PD_M2_613	times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp	rbp+0
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],eax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [workspace]
+	push	rbx
+	collect_args
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+	lea	rdi, [workspace]			; FAST_FLOAT * wsptr
+	mov	rcx, DCTSIZE/4				; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1,xmm2
+	por	xmm3,xmm4
+	por	xmm5,xmm6
+	por	xmm1,xmm3
+	por	xmm5,xmm7
+	por	xmm1,xmm5
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
+
+	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
+	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
+
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[rel PD_1_414]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
+	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
+
+	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
+	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
+	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
+	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
+	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	rsi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	rcx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	rax, [original_rbp]
+	lea	rsi, [workspace]			; FAST_FLOAT * wsptr
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	rax, r13
+	mov	rcx, DCTSIZE/4				; ctr
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[rel PD_1_414]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
+	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[rel PD_RNDINT_MAGIC]	; xmm1=[rel PD_RNDINT_MAGIC]
+	pcmpeqd	xmm3,xmm3
+	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
+	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
+	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
+	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
+
+	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm7,xmm1
+	movaps	xmm5,xmm3
+	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
+	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
+	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
+	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
+
+	movaps	xmm2,[rel PD_RNDINT_MAGIC]	; xmm2=[rel PD_RNDINT_MAGIC]
+	pcmpeqd	xmm4,xmm4
+	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
+	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
+	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
+	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
+
+	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+	paddb     xmm6,xmm2
+	paddb     xmm1,xmm2
+
+	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
+	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+	add	rsi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	rdi, byte 4*SIZEOF_JSAMPROW
+	dec	rcx				; ctr
+	jnz	near .rowloop
+
+	uncollect_args
+	pop	rbx
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2flt.asm b/simd/jiss2flt.asm
new file mode 100644
index 0000000..17bc363
--- /dev/null
+++ b/simd/jiss2flt.asm

@@ -0,0 +1,498 @@
+;
+; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414	times 4 dd  1.414213562373095048801689
+PD_1_847	times 4 dd  1.847759065022573512256366
+PD_1_082	times 4 dd  1.082392200292393968799446
+PD_M2_613	times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm2
+	por	xmm3,xmm4
+	por	xmm5,xmm6
+	por	xmm1,xmm3
+	por	xmm5,xmm7
+	por	xmm1,xmm5
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
+
+	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
+	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
+
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
+	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
+
+	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
+	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
+	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
+	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm3,xmm3
+	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
+	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
+	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
+	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
+
+	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm7,xmm1
+	movaps	xmm5,xmm3
+	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
+	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
+	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
+	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
+
+	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm4,xmm4
+	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
+	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
+	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
+	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+	paddb     xmm6,xmm2
+	paddb     xmm1,xmm2
+
+	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
+	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2fst-64.asm b/simd/jiss2fst-64.asm
new file mode 100644
index 0000000..8b664a6
--- /dev/null
+++ b/simd/jiss2fst-64.asm

@@ -0,0 +1,492 @@
+;
+; jiss2fst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/projecpt/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT
+PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info * compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp	rbp+0
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],eax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
+	jmp	near .column_end
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	psubw	xmm0,xmm2		; xmm0=tmp11
+	psubw	xmm1,xmm3
+	paddw	xmm4,xmm2		; xmm4=tmp10
+	paddw	xmm5,xmm3		; xmm5=tmp13
+
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm1,[rel PW_F1414]
+	psubw	xmm1,xmm5		; xmm1=tmp12
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm0
+	psubw	xmm4,xmm5		; xmm4=tmp3
+	psubw	xmm0,xmm1		; xmm0=tmp2
+	paddw	xmm6,xmm5		; xmm6=tmp0
+	paddw	xmm7,xmm1		; xmm7=tmp1
+
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm0,xmm5
+	psubw	xmm2,xmm1		; xmm2=z12
+	psubw	xmm5,xmm3		; xmm5=z10
+	paddw	xmm4,xmm1		; xmm4=z11
+	paddw	xmm0,xmm3		; xmm0=z13
+
+	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm3,xmm4
+	psubw	xmm4,xmm0
+	paddw	xmm3,xmm0		; xmm3=tmp7
+
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm4,[rel PW_F1414]	; xmm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm0,xmm5
+	paddw	xmm5,xmm2
+	pmulhw	xmm5,[rel PW_F1847]	; xmm5=z5
+	pmulhw	xmm0,[rel PW_MF1613]
+	pmulhw	xmm2,[rel PW_F1082]
+	psubw	xmm0,xmm1
+	psubw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm0,xmm5		; xmm0=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm0,xmm3		; xmm0=tmp6
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm7
+	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
+	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
+	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
+	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
+	psubw	xmm4,xmm0		; xmm4=tmp5
+
+	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
+	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
+
+	paddw	xmm2,xmm4		; xmm2=tmp4
+	movdqa	xmm5,xmm7
+	movdqa	xmm0,xmm1
+	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
+	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
+	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
+	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
+	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
+
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
+	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
+
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
+	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	rax, [original_rbp]
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	rax, r13
+
+	; -- Even part
+
+	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm0,xmm5
+	psubw	xmm6,xmm1		; xmm6=tmp11
+	psubw	xmm5,xmm3
+	paddw	xmm2,xmm1		; xmm2=tmp10
+	paddw	xmm0,xmm3		; xmm0=tmp13
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[rel PW_F1414]
+	psubw	xmm5,xmm0		; xmm5=tmp12
+
+	movdqa	xmm1,xmm2
+	movdqa	xmm3,xmm6
+	psubw	xmm2,xmm0		; xmm2=tmp3
+	psubw	xmm6,xmm5		; xmm6=tmp2
+	paddw	xmm1,xmm0		; xmm1=tmp0
+	paddw	xmm3,xmm5		; xmm3=tmp1
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
+
+	; -- Odd part
+
+	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	psubw	xmm0,xmm7		; xmm0=z12
+	psubw	xmm4,xmm5		; xmm4=z10
+	paddw	xmm2,xmm7		; xmm2=z11
+	paddw	xmm6,xmm5		; xmm6=z13
+
+	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm5,xmm2
+	psubw	xmm2,xmm6
+	paddw	xmm5,xmm6		; xmm5=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm2,[rel PW_F1414]	; xmm2=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm6,xmm4
+	paddw	xmm4,xmm0
+	pmulhw	xmm4,[rel PW_F1847]	; xmm4=z5
+	pmulhw	xmm6,[rel PW_MF1613]
+	pmulhw	xmm0,[rel PW_F1082]
+	psubw	xmm6,xmm7
+	psubw	xmm0,xmm4		; xmm0=tmp10
+	paddw	xmm6,xmm4		; xmm6=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm6,xmm5		; xmm6=tmp6
+	movdqa	xmm7,xmm1
+	movdqa	xmm4,xmm3
+	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
+	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	psraw	xmm1,(PASS1_BITS+3)	; descale
+	psraw	xmm3,(PASS1_BITS+3)	; descale
+	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
+	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psubw	xmm2,xmm6		; xmm2=tmp5
+
+	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
+
+	paddw	xmm0,xmm2		; xmm0=tmp4
+	movdqa	xmm4,xmm5
+	movdqa	xmm7,xmm6
+	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
+	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
+	psraw	xmm5,(PASS1_BITS+3)	; descale
+	psraw	xmm6,(PASS1_BITS+3)	; descale
+	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
+	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+
+	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
+
+	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm1,xmm2
+	paddb     xmm3,xmm2
+	paddb     xmm5,xmm2
+	paddb     xmm7,xmm2
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
+	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
+	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
+	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
+	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2fst.asm b/simd/jiss2fst.asm
new file mode 100644
index 0000000..b53664d
--- /dev/null
+++ b/simd/jiss2fst.asm

@@ -0,0 +1,502 @@
+;
+; jiss2fst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT
+PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	psubw	xmm0,xmm2		; xmm0=tmp11
+	psubw	xmm1,xmm3
+	paddw	xmm4,xmm2		; xmm4=tmp10
+	paddw	xmm5,xmm3		; xmm5=tmp13
+
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	xmm1,xmm5		; xmm1=tmp12
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm0
+	psubw	xmm4,xmm5		; xmm4=tmp3
+	psubw	xmm0,xmm1		; xmm0=tmp2
+	paddw	xmm6,xmm5		; xmm6=tmp0
+	paddw	xmm7,xmm1		; xmm7=tmp1
+
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm0,xmm5
+	psubw	xmm2,xmm1		; xmm2=z12
+	psubw	xmm5,xmm3		; xmm5=z10
+	paddw	xmm4,xmm1		; xmm4=z11
+	paddw	xmm0,xmm3		; xmm0=z13
+
+	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm3,xmm4
+	psubw	xmm4,xmm0
+	paddw	xmm3,xmm0		; xmm3=tmp7
+
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1414)]	; xmm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm0,xmm5
+	paddw	xmm5,xmm2
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1847)]	; xmm5=z5
+	pmulhw	xmm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	xmm0,xmm1
+	psubw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm0,xmm5		; xmm0=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm0,xmm3		; xmm0=tmp6
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm7
+	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
+	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
+	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
+	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
+	psubw	xmm4,xmm0		; xmm4=tmp5
+
+	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
+	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
+
+	paddw	xmm2,xmm4		; xmm2=tmp4
+	movdqa	xmm5,xmm7
+	movdqa	xmm0,xmm1
+	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
+	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
+	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
+	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
+	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
+
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
+	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
+
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
+	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm0,xmm5
+	psubw	xmm6,xmm1		; xmm6=tmp11
+	psubw	xmm5,xmm3
+	paddw	xmm2,xmm1		; xmm2=tmp10
+	paddw	xmm0,xmm3		; xmm0=tmp13
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1414)]
+	psubw	xmm5,xmm0		; xmm5=tmp12
+
+	movdqa	xmm1,xmm2
+	movdqa	xmm3,xmm6
+	psubw	xmm2,xmm0		; xmm2=tmp3
+	psubw	xmm6,xmm5		; xmm6=tmp2
+	paddw	xmm1,xmm0		; xmm1=tmp0
+	paddw	xmm3,xmm5		; xmm3=tmp1
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
+
+	; -- Odd part
+
+	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	psubw	xmm0,xmm7		; xmm0=z12
+	psubw	xmm4,xmm5		; xmm4=z10
+	paddw	xmm2,xmm7		; xmm2=z11
+	paddw	xmm6,xmm5		; xmm6=z13
+
+	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS
+
+	movdqa	xmm5,xmm2
+	psubw	xmm2,xmm6
+	paddw	xmm5,xmm6		; xmm5=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1414)]	; xmm2=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm6,xmm4
+	paddw	xmm4,xmm0
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1847)]	; xmm4=z5
+	pmulhw	xmm6,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1082)]
+	psubw	xmm6,xmm7
+	psubw	xmm0,xmm4		; xmm0=tmp10
+	paddw	xmm6,xmm4		; xmm6=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm6,xmm5		; xmm6=tmp6
+	movdqa	xmm7,xmm1
+	movdqa	xmm4,xmm3
+	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
+	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	psraw	xmm1,(PASS1_BITS+3)	; descale
+	psraw	xmm3,(PASS1_BITS+3)	; descale
+	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
+	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psubw	xmm2,xmm6		; xmm2=tmp5
+
+	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
+
+	paddw	xmm0,xmm2		; xmm0=tmp4
+	movdqa	xmm4,xmm5
+	movdqa	xmm7,xmm6
+	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
+	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
+	psraw	xmm5,(PASS1_BITS+3)	; descale
+	psraw	xmm6,(PASS1_BITS+3)	; descale
+	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
+	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm1,xmm2
+	paddb     xmm3,xmm2
+	paddb     xmm5,xmm2
+	paddb     xmm7,xmm2
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
+	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
+	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
+	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
+	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2int-64.asm b/simd/jiss2int-64.asm
new file mode 100644
index 0000000..82da0a7
--- /dev/null
+++ b/simd/jiss2int-64.asm

@@ -0,0 +1,848 @@
+;
+; jiss2int.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info * compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp	rbp+0
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		12
+
+	align	16
+	global	EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm5,PASS1_BITS
+
+	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+	jmp	near .column_end
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm4,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm4,xmm3		; xmm3=in6=z3
+	punpckhwd xmm5,xmm3
+	movdqa    xmm1,xmm4
+	movdqa    xmm3,xmm5
+	pmaddwd   xmm4,[rel PW_F130_F054]	; xmm4=tmp3L
+	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
+	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
+	pmaddwd   xmm3,[rel PW_F054_MF130]	; xmm3=tmp2H
+
+	movdqa    xmm6,xmm0
+	paddw     xmm0,xmm2		; xmm0=in0+in4
+	psubw     xmm6,xmm2		; xmm6=in0-in4
+
+	pxor      xmm7,xmm7
+	pxor      xmm2,xmm2
+	punpcklwd xmm7,xmm0		; xmm7=tmp0L
+	punpckhwd xmm2,xmm0		; xmm2=tmp0H
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm4		; xmm7=tmp10L
+	psubd	xmm0,xmm4		; xmm0=tmp13L
+	movdqa	xmm4,xmm2
+	paddd	xmm2,xmm5		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm7,xmm7
+	punpcklwd xmm5,xmm6		; xmm5=tmp1L
+	punpckhwd xmm7,xmm6		; xmm7=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+	movdqa	xmm2,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm2,xmm1		; xmm2=tmp12L
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm3		; xmm7=tmp11H
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm7,xmm4
+	paddw	xmm5,xmm3		; xmm5=z3
+	paddw	xmm7,xmm1		; xmm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm2,xmm5
+	movdqa    xmm0,xmm5
+	punpcklwd xmm2,xmm7
+	punpckhwd xmm0,xmm7
+	movdqa    xmm5,xmm2
+	movdqa    xmm7,xmm0
+	pmaddwd   xmm2,[rel PW_MF078_F117]	; xmm2=z3L
+	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3H
+	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
+	pmaddwd   xmm7,[rel PW_F117_F078]	; xmm7=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm2,xmm3
+	movdqa    xmm0,xmm3
+	punpcklwd xmm2,xmm4
+	punpckhwd xmm0,xmm4
+	movdqa    xmm3,xmm2
+	movdqa    xmm4,xmm0
+	pmaddwd   xmm2,[rel PW_MF060_MF089]	; xmm2=tmp0L
+	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0H
+	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3L
+	pmaddwd   xmm4,[rel PW_MF089_F060]	; xmm4=tmp3H
+
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
+	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
+	paddd	xmm3,xmm5		; xmm3=tmp3L
+	paddd	xmm4,xmm7		; xmm4=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
+
+	movdqa    xmm2,xmm1
+	movdqa    xmm0,xmm1
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm0,xmm6
+	movdqa    xmm1,xmm2
+	movdqa    xmm6,xmm0
+	pmaddwd   xmm2,[rel PW_MF050_MF256]	; xmm2=tmp1L
+	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1H
+	pmaddwd   xmm1,[rel PW_MF256_F050]	; xmm1=tmp2L
+	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
+
+	paddd	xmm2,xmm5		; xmm2=tmp1L
+	paddd	xmm0,xmm7		; xmm0=tmp1H
+	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm0,xmm7
+	paddd	xmm5,xmm3		; xmm5=data0L
+	paddd	xmm7,xmm4		; xmm7=data0H
+	psubd	xmm2,xmm3		; xmm2=data7L
+	psubd	xmm0,xmm4		; xmm0=data7H
+
+	movdqa	xmm3,[rel PD_DESCALE_P1]	; xmm3=[rel PD_DESCALE_P1]
+
+	paddd	xmm5,xmm3
+	paddd	xmm7,xmm3
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm7,DESCALE_P1
+	paddd	xmm2,xmm3
+	paddd	xmm0,xmm3
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
+	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
+
+	movdqa	xmm7,xmm4
+	movdqa	xmm0,xmm3
+	paddd	xmm4,xmm1		; xmm4=data1L
+	paddd	xmm3,xmm6		; xmm3=data1H
+	psubd	xmm7,xmm1		; xmm7=data6L
+	psubd	xmm0,xmm6		; xmm0=data6H
+
+	movdqa	xmm1,[rel PD_DESCALE_P1]	; xmm1=[rel PD_DESCALE_P1]
+
+	paddd	xmm4,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+	paddd	xmm7,xmm1
+	paddd	xmm0,xmm1
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
+	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
+	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
+	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm5,xmm3
+	movdqa	xmm6,xmm0
+	paddd	xmm3,xmm4		; xmm3=data2L
+	paddd	xmm0,xmm2		; xmm0=data2H
+	psubd	xmm5,xmm4		; xmm5=data5L
+	psubd	xmm6,xmm2		; xmm6=data5H
+
+	movdqa	xmm7,[rel PD_DESCALE_P1]	; xmm7=[rel PD_DESCALE_P1]
+
+	paddd	xmm3,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm3,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
+	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
+	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
+	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
+	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
+
+	movdqa	xmm0,xmm1
+	movdqa	xmm6,xmm4
+	paddd	xmm1,xmm2		; xmm1=data3L
+	paddd	xmm4,xmm7		; xmm4=data3H
+	psubd	xmm0,xmm2		; xmm0=data4L
+	psubd	xmm6,xmm7		; xmm6=data4H
+
+	movdqa	xmm2,[rel PD_DESCALE_P1]	; xmm2=[rel PD_DESCALE_P1]
+
+	paddd	xmm1,xmm2
+	paddd	xmm4,xmm2
+	psrad	xmm1,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm0,xmm2
+	paddd	xmm6,xmm2
+	psrad	xmm0,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
+	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
+	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
+
+	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
+	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
+	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
+
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	rax, [original_rbp]
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	rax, r13
+
+	; -- Even part
+
+	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm6,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm6,xmm2		; xmm2=in6=z3
+	punpckhwd xmm5,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm2,xmm5
+	pmaddwd   xmm6,[rel PW_F130_F054]	; xmm6=tmp3L
+	pmaddwd   xmm5,[rel PW_F130_F054]	; xmm5=tmp3H
+	pmaddwd   xmm1,[rel PW_F054_MF130]	; xmm1=tmp2L
+	pmaddwd   xmm2,[rel PW_F054_MF130]	; xmm2=tmp2H
+
+	movdqa    xmm3,xmm7
+	paddw     xmm7,xmm0		; xmm7=in0+in4
+	psubw     xmm3,xmm0		; xmm3=in0-in4
+
+	pxor      xmm4,xmm4
+	pxor      xmm0,xmm0
+	punpcklwd xmm4,xmm7		; xmm4=tmp0L
+	punpckhwd xmm0,xmm7		; xmm0=tmp0H
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm6		; xmm4=tmp10L
+	psubd	xmm7,xmm6		; xmm7=tmp13L
+	movdqa	xmm6,xmm0
+	paddd	xmm0,xmm5		; xmm0=tmp10H
+	psubd	xmm6,xmm5		; xmm6=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm4,xmm4
+	punpcklwd xmm5,xmm3		; xmm5=tmp1L
+	punpckhwd xmm4,xmm3		; xmm4=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+	movdqa	xmm0,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm0,xmm1		; xmm0=tmp12L
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm2		; xmm4=tmp11H
+	psubd	xmm7,xmm2		; xmm7=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
+	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
+	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
+	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm4,xmm3
+	paddw	xmm5,xmm1		; xmm5=z3
+	paddw	xmm4,xmm2		; xmm4=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm0,xmm5
+	movdqa    xmm7,xmm5
+	punpcklwd xmm0,xmm4
+	punpckhwd xmm7,xmm4
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm7
+	pmaddwd   xmm0,[rel PW_MF078_F117]	; xmm0=z3L
+	pmaddwd   xmm7,[rel PW_MF078_F117]	; xmm7=z3H
+	pmaddwd   xmm5,[rel PW_F117_F078]	; xmm5=z4L
+	pmaddwd   xmm4,[rel PW_F117_F078]	; xmm4=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm0,xmm1
+	movdqa    xmm7,xmm1
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm1,xmm0
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm0,[rel PW_MF060_MF089]	; xmm0=tmp0L
+	pmaddwd   xmm7,[rel PW_MF060_MF089]	; xmm7=tmp0H
+	pmaddwd   xmm1,[rel PW_MF089_F060]	; xmm1=tmp3L
+	pmaddwd   xmm3,[rel PW_MF089_F060]	; xmm3=tmp3H
+
+	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
+	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
+	paddd	xmm1,xmm5		; xmm1=tmp3L
+	paddd	xmm3,xmm4		; xmm3=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
+
+	movdqa    xmm0,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm0,xmm6
+	punpckhwd xmm7,xmm6
+	movdqa    xmm2,xmm0
+	movdqa    xmm6,xmm7
+	pmaddwd   xmm0,[rel PW_MF050_MF256]	; xmm0=tmp1L
+	pmaddwd   xmm7,[rel PW_MF050_MF256]	; xmm7=tmp1H
+	pmaddwd   xmm2,[rel PW_MF256_F050]	; xmm2=tmp2L
+	pmaddwd   xmm6,[rel PW_MF256_F050]	; xmm6=tmp2H
+
+	paddd	xmm0,xmm5		; xmm0=tmp1L
+	paddd	xmm7,xmm4		; xmm7=tmp1H
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm7,xmm4
+	paddd	xmm5,xmm1		; xmm5=data0L
+	paddd	xmm4,xmm3		; xmm4=data0H
+	psubd	xmm0,xmm1		; xmm0=data7L
+	psubd	xmm7,xmm3		; xmm7=data7H
+
+	movdqa	xmm1,[rel PD_DESCALE_P2]	; xmm1=[rel PD_DESCALE_P2]
+
+	paddd	xmm5,xmm1
+	paddd	xmm4,xmm1
+	psrad	xmm5,DESCALE_P2
+	psrad	xmm4,DESCALE_P2
+	paddd	xmm0,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
+	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
+	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm7,xmm1
+	paddd	xmm3,xmm2		; xmm3=data1L
+	paddd	xmm1,xmm6		; xmm1=data1H
+	psubd	xmm4,xmm2		; xmm4=data6L
+	psubd	xmm7,xmm6		; xmm7=data6H
+
+	movdqa	xmm2,[rel PD_DESCALE_P2]	; xmm2=[rel PD_DESCALE_P2]
+
+	paddd	xmm3,xmm2
+	paddd	xmm1,xmm2
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm4,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
+	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
+	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm4,xmm6
+	movdqa	xmm0,xmm2
+	paddd	xmm6,xmm1		; xmm6=data2L
+	paddd	xmm2,xmm7		; xmm2=data2H
+	psubd	xmm4,xmm1		; xmm4=data5L
+	psubd	xmm0,xmm7		; xmm0=data5H
+
+	movdqa	xmm5,[rel PD_DESCALE_P2]	; xmm5=[rel PD_DESCALE_P2]
+
+	paddd	xmm6,xmm5
+	paddd	xmm2,xmm5
+	psrad	xmm6,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm4,xmm5
+	paddd	xmm0,xmm5
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
+	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
+	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
+	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
+	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
+
+	movdqa	xmm2,xmm3
+	movdqa	xmm0,xmm1
+	paddd	xmm3,xmm7		; xmm3=data3L
+	paddd	xmm1,xmm5		; xmm1=data3H
+	psubd	xmm2,xmm7		; xmm2=data4L
+	psubd	xmm0,xmm5		; xmm0=data4H
+
+	movdqa	xmm7,[rel PD_DESCALE_P2]	; xmm7=[rel PD_DESCALE_P2]
+
+	paddd	xmm3,xmm7
+	paddd	xmm1,xmm7
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm2,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm2,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	movdqa    xmm5,[rel PB_CENTERJSAMP]	; xmm5=[rel PB_CENTERJSAMP]
+
+	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
+	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm7,xmm5
+	paddb     xmm1,xmm5
+	paddb     xmm6,xmm5
+	paddb     xmm3,xmm5
+
+	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
+	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
+	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
+	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
+	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
+	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+	mov	rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+	mov	rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+	movq	XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2int.asm b/simd/jiss2int.asm
new file mode 100644
index 0000000..adf39fb
--- /dev/null
+++ b/simd/jiss2int.asm

@@ -0,0 +1,859 @@
+;
+; jiss2int.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; jpeg_component_info * compptr
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		12
+
+	align	16
+	global	EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0
+	packsswb xmm1,xmm1
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm5,PASS1_BITS
+
+	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm4,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm4,xmm3		; xmm3=in6=z3
+	punpckhwd xmm5,xmm3
+	movdqa    xmm1,xmm4
+	movdqa    xmm3,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]	; xmm3=tmp2H
+
+	movdqa    xmm6,xmm0
+	paddw     xmm0,xmm2		; xmm0=in0+in4
+	psubw     xmm6,xmm2		; xmm6=in0-in4
+
+	pxor      xmm7,xmm7
+	pxor      xmm2,xmm2
+	punpcklwd xmm7,xmm0		; xmm7=tmp0L
+	punpckhwd xmm2,xmm0		; xmm2=tmp0H
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm4		; xmm7=tmp10L
+	psubd	xmm0,xmm4		; xmm0=tmp13L
+	movdqa	xmm4,xmm2
+	paddd	xmm2,xmm5		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm7,xmm7
+	punpcklwd xmm5,xmm6		; xmm5=tmp1L
+	punpckhwd xmm7,xmm6		; xmm7=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+	movdqa	xmm2,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm2,xmm1		; xmm2=tmp12L
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm3		; xmm7=tmp11H
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm7,xmm4
+	paddw	xmm5,xmm3		; xmm5=z3
+	paddw	xmm7,xmm1		; xmm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm2,xmm5
+	movdqa    xmm0,xmm5
+	punpcklwd xmm2,xmm7
+	punpckhwd xmm0,xmm7
+	movdqa    xmm5,xmm2
+	movdqa    xmm7,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]	; xmm2=z3L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]	; xmm7=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm2,xmm3
+	movdqa    xmm0,xmm3
+	punpcklwd xmm2,xmm4
+	punpckhwd xmm0,xmm4
+	movdqa    xmm3,xmm2
+	movdqa    xmm4,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm2=tmp0L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]	; xmm4=tmp3H
+
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
+	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
+	paddd	xmm3,xmm5		; xmm3=tmp3L
+	paddd	xmm4,xmm7		; xmm4=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
+
+	movdqa    xmm2,xmm1
+	movdqa    xmm0,xmm1
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm0,xmm6
+	movdqa    xmm1,xmm2
+	movdqa    xmm6,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm2=tmp1L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]	; xmm1=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm2,xmm5		; xmm2=tmp1L
+	paddd	xmm0,xmm7		; xmm0=tmp1H
+	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm0,xmm7
+	paddd	xmm5,xmm3		; xmm5=data0L
+	paddd	xmm7,xmm4		; xmm7=data0H
+	psubd	xmm2,xmm3		; xmm2=data7L
+	psubd	xmm0,xmm4		; xmm0=data7H
+
+	movdqa	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm3=[PD_DESCALE_P1]
+
+	paddd	xmm5,xmm3
+	paddd	xmm7,xmm3
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm7,DESCALE_P1
+	paddd	xmm2,xmm3
+	paddd	xmm0,xmm3
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
+	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
+
+	movdqa	xmm7,xmm4
+	movdqa	xmm0,xmm3
+	paddd	xmm4,xmm1		; xmm4=data1L
+	paddd	xmm3,xmm6		; xmm3=data1H
+	psubd	xmm7,xmm1		; xmm7=data6L
+	psubd	xmm0,xmm6		; xmm0=data6H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm1=[PD_DESCALE_P1]
+
+	paddd	xmm4,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+	paddd	xmm7,xmm1
+	paddd	xmm0,xmm1
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
+	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
+	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
+	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm5,xmm3
+	movdqa	xmm6,xmm0
+	paddd	xmm3,xmm4		; xmm3=data2L
+	paddd	xmm0,xmm2		; xmm0=data2H
+	psubd	xmm5,xmm4		; xmm5=data5L
+	psubd	xmm6,xmm2		; xmm6=data5H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm7=[PD_DESCALE_P1]
+
+	paddd	xmm3,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm3,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
+	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
+	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
+	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
+	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
+
+	movdqa	xmm0,xmm1
+	movdqa	xmm6,xmm4
+	paddd	xmm1,xmm2		; xmm1=data3L
+	paddd	xmm4,xmm7		; xmm4=data3H
+	psubd	xmm0,xmm2		; xmm0=data4L
+	psubd	xmm6,xmm7		; xmm6=data4H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm2=[PD_DESCALE_P1]
+
+	paddd	xmm1,xmm2
+	paddd	xmm4,xmm2
+	psrad	xmm1,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm0,xmm2
+	paddd	xmm6,xmm2
+	psrad	xmm0,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
+	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
+	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
+
+	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
+	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
+	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
+
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm6,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm6,xmm2		; xmm2=in6=z3
+	punpckhwd xmm5,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm2,xmm5
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]	; xmm2=tmp2H
+
+	movdqa    xmm3,xmm7
+	paddw     xmm7,xmm0		; xmm7=in0+in4
+	psubw     xmm3,xmm0		; xmm3=in0-in4
+
+	pxor      xmm4,xmm4
+	pxor      xmm0,xmm0
+	punpcklwd xmm4,xmm7		; xmm4=tmp0L
+	punpckhwd xmm0,xmm7		; xmm0=tmp0H
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm6		; xmm4=tmp10L
+	psubd	xmm7,xmm6		; xmm7=tmp13L
+	movdqa	xmm6,xmm0
+	paddd	xmm0,xmm5		; xmm0=tmp10H
+	psubd	xmm6,xmm5		; xmm6=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm4,xmm4
+	punpcklwd xmm5,xmm3		; xmm5=tmp1L
+	punpckhwd xmm4,xmm3		; xmm4=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+	movdqa	xmm0,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm0,xmm1		; xmm0=tmp12L
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm2		; xmm4=tmp11H
+	psubd	xmm7,xmm2		; xmm7=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
+	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
+	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
+	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm4,xmm3
+	paddw	xmm5,xmm1		; xmm5=z3
+	paddw	xmm4,xmm2		; xmm4=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm0,xmm5
+	movdqa    xmm7,xmm5
+	punpcklwd xmm0,xmm4
+	punpckhwd xmm7,xmm4
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]	; xmm4=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm0,xmm1
+	movdqa    xmm7,xmm1
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm1,xmm0
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp0H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp3L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3H
+
+	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
+	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
+	paddd	xmm1,xmm5		; xmm1=tmp3L
+	paddd	xmm3,xmm4		; xmm3=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
+
+	movdqa    xmm0,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm0,xmm6
+	punpckhwd xmm7,xmm6
+	movdqa    xmm2,xmm0
+	movdqa    xmm6,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm7=tmp1H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]	; xmm2=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm0,xmm5		; xmm0=tmp1L
+	paddd	xmm7,xmm4		; xmm7=tmp1H
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm7,xmm4
+	paddd	xmm5,xmm1		; xmm5=data0L
+	paddd	xmm4,xmm3		; xmm4=data0H
+	psubd	xmm0,xmm1		; xmm0=data7L
+	psubd	xmm7,xmm3		; xmm7=data7H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm1=[PD_DESCALE_P2]
+
+	paddd	xmm5,xmm1
+	paddd	xmm4,xmm1
+	psrad	xmm5,DESCALE_P2
+	psrad	xmm4,DESCALE_P2
+	paddd	xmm0,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
+	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
+	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm7,xmm1
+	paddd	xmm3,xmm2		; xmm3=data1L
+	paddd	xmm1,xmm6		; xmm1=data1H
+	psubd	xmm4,xmm2		; xmm4=data6L
+	psubd	xmm7,xmm6		; xmm7=data6H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm2=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm2
+	paddd	xmm1,xmm2
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm4,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
+	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
+	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm4,xmm6
+	movdqa	xmm0,xmm2
+	paddd	xmm6,xmm1		; xmm6=data2L
+	paddd	xmm2,xmm7		; xmm2=data2H
+	psubd	xmm4,xmm1		; xmm4=data5L
+	psubd	xmm0,xmm7		; xmm0=data5H
+
+	movdqa	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm5=[PD_DESCALE_P2]
+
+	paddd	xmm6,xmm5
+	paddd	xmm2,xmm5
+	psrad	xmm6,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm4,xmm5
+	paddd	xmm0,xmm5
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
+	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
+	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
+	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
+	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
+
+	movdqa	xmm2,xmm3
+	movdqa	xmm0,xmm1
+	paddd	xmm3,xmm7		; xmm3=data3L
+	paddd	xmm1,xmm5		; xmm1=data3H
+	psubd	xmm2,xmm7		; xmm2=data4L
+	psubd	xmm0,xmm5		; xmm0=data4H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm7=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm7
+	paddd	xmm1,xmm7
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm2,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm2,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm5=[PB_CENTERJSAMP]
+
+	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
+	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm7,xmm5
+	paddb     xmm1,xmm5
+	paddb     xmm6,xmm5
+	paddb     xmm3,xmm5
+
+	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
+	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
+	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
+	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
+	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
+	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+	movq	XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2red-64.asm b/simd/jiss2red-64.asm
new file mode 100644
index 0000000..85ba941
--- /dev/null
+++ b/simd/jiss2red-64.asm

@@ -0,0 +1,575 @@
+;
+; jiss2red.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
+PW_F256_F089	times 4 dw  F_2_562, F_0_899
+PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp	rbp+0
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],eax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	por	xmm0,xmm1
+	packsswb xmm0,xmm0
+	packsswb xmm0,xmm0
+	movd	eax,xmm0
+	test	rax,rax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm0,PASS1_BITS
+
+	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+	jmp	near .column_end
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm5,xmm0
+	punpcklwd xmm4,xmm1
+	punpckhwd xmm5,xmm1
+	movdqa    xmm0,xmm4
+	movdqa    xmm1,xmm5
+	pmaddwd   xmm4,[rel PW_F256_F089]	; xmm4=(tmp2L)
+	pmaddwd   xmm5,[rel PW_F256_F089]	; xmm5=(tmp2H)
+	pmaddwd   xmm0,[rel PW_F106_MF217]	; xmm0=(tmp0L)
+	pmaddwd   xmm1,[rel PW_F106_MF217]	; xmm1=(tmp0H)
+
+	movdqa    xmm6,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm6,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm2,xmm6
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2L)
+	pmaddwd   xmm7,[rel PW_MF060_MF050]	; xmm7=(tmp2H)
+	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0L)
+	pmaddwd   xmm3,[rel PW_F145_MF021]	; xmm3=(tmp0H)
+
+	paddd	xmm6,xmm4		; xmm6=tmp2L
+	paddd	xmm7,xmm5		; xmm7=tmp2H
+	paddd	xmm2,xmm0		; xmm2=tmp0L
+	paddd	xmm3,xmm1		; xmm3=tmp0H
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      xmm1,xmm1
+	pxor      xmm2,xmm2
+	punpcklwd xmm1,xmm4		; xmm1=tmp0L
+	punpckhwd xmm2,xmm4		; xmm2=tmp0H
+	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+	movdqa    xmm3,xmm5		; xmm5=in2=z2
+	punpcklwd xmm5,xmm0		; xmm0=in6=z3
+	punpckhwd xmm3,xmm0
+	pmaddwd   xmm5,[rel PW_F184_MF076]	; xmm5=tmp2L
+	pmaddwd   xmm3,[rel PW_F184_MF076]	; xmm3=tmp2H
+
+	movdqa	xmm4,xmm1
+	movdqa	xmm0,xmm2
+	paddd	xmm1,xmm5		; xmm1=tmp10L
+	paddd	xmm2,xmm3		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp12L
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	; -- Final output stage
+
+	movdqa	xmm5,xmm1
+	movdqa	xmm3,xmm2
+	paddd	xmm1,xmm6		; xmm1=data0L
+	paddd	xmm2,xmm7		; xmm2=data0H
+	psubd	xmm5,xmm6		; xmm5=data3L
+	psubd	xmm3,xmm7		; xmm3=data3H
+
+	movdqa	xmm6,[rel PD_DESCALE_P1_4]	; xmm6=[rel PD_DESCALE_P1_4]
+
+	paddd	xmm1,xmm6
+	paddd	xmm2,xmm6
+	psrad	xmm1,DESCALE_P1_4
+	psrad	xmm2,DESCALE_P1_4
+	paddd	xmm5,xmm6
+	paddd	xmm3,xmm6
+	psrad	xmm5,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
+
+	movdqa	xmm2,xmm4
+	movdqa	xmm3,xmm0
+	paddd	xmm4,xmm7		; xmm4=data1L
+	paddd	xmm0,xmm6		; xmm0=data1H
+	psubd	xmm2,xmm7		; xmm2=data2L
+	psubd	xmm3,xmm6		; xmm3=data2H
+
+	movdqa	xmm7,[rel PD_DESCALE_P1_4]	; xmm7=[rel PD_DESCALE_P1_4]
+
+	paddd	xmm4,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm4,DESCALE_P1_4
+	psrad	xmm0,DESCALE_P1_4
+	paddd	xmm2,xmm7
+	paddd	xmm3,xmm7
+	psrad	xmm2,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	rax, [original_rbp]
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	rax, r13
+
+	; -- Even part
+
+	pxor      xmm4,xmm4
+	punpcklwd xmm4,xmm1		; xmm4=tmp0
+	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+	; -- Odd part
+
+	punpckhwd xmm1,xmm0
+	punpckhwd xmm6,xmm3
+	movdqa    xmm5,xmm1
+	movdqa    xmm2,xmm6
+	pmaddwd   xmm1,[rel PW_F256_F089]	; xmm1=(tmp2)
+	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2)
+	pmaddwd   xmm5,[rel PW_F106_MF217]	; xmm5=(tmp0)
+	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0)
+
+	paddd     xmm6,xmm1		; xmm6=tmp2
+	paddd     xmm2,xmm5		; xmm2=tmp0
+
+	; -- Even part
+
+	punpcklwd xmm0,xmm3
+	pmaddwd   xmm0,[rel PW_F184_MF076]	; xmm0=tmp2
+
+	movdqa    xmm7,xmm4
+	paddd     xmm4,xmm0		; xmm4=tmp10
+	psubd     xmm7,xmm0		; xmm7=tmp12
+
+	; -- Final output stage
+
+	movdqa	xmm1,[rel PD_DESCALE_P2_4]	; xmm1=[rel PD_DESCALE_P2_4]
+
+	movdqa	xmm5,xmm4
+	movdqa	xmm3,xmm7
+	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
+	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
+	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
+	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+
+	paddd	xmm4,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm4,DESCALE_P2_4
+	psrad	xmm7,DESCALE_P2_4
+	paddd	xmm5,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm5,DESCALE_P2_4
+	psrad	xmm3,DESCALE_P2_4
+
+	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
+	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
+
+	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
+	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
+
+	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+	paddb     xmm4,[rel PB_CENTERJSAMP]
+
+	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+	mov	rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+	align	16
+	global	EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+	push	rbp
+	mov	rbp,rsp
+	push	rbx
+	collect_args
+
+	; ---- Pass 1: process columns from input.
+
+	mov	rdx, r10	; quantptr
+	mov	rsi, r11		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+	pcmpeqd   xmm7,xmm7
+	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
+	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
+	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
+	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
+	pmaddwd   xmm4,[rel PW_F362_MF127]
+	pmaddwd   xmm5,[rel PW_F085_MF072]
+
+	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
+	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
+	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
+	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
+	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
+	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
+	pmaddwd	xmm0,[rel PW_F362_MF127]
+	pmaddwd	xmm2,[rel PW_F085_MF072]
+
+	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
+	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
+
+	; -- Even part
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
+	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
+	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
+	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+	; -- Final output stage
+
+	movdqa	xmm3,xmm6
+	movdqa	xmm5,xmm1
+	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+	movdqa	xmm2,[rel PD_DESCALE_P1_2]	; xmm2=[rel PD_DESCALE_P1_2]
+
+	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
+
+	movdqa     xmm7,xmm1
+	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
+	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
+
+	paddd	xmm6,xmm2
+	psrad	xmm6,DESCALE_P1_2
+
+	paddd	xmm1,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm1,DESCALE_P1_2
+	psrad	xmm7,DESCALE_P1_2
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	rdi, r12	; (JSAMPROW *)
+	mov	rax, r13
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+	pmaddwd   xmm1,[rel PW_F362_MF127]
+	pmaddwd   xmm7,[rel PW_F085_MF072]
+
+	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
+
+	; -- Even part
+
+	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
+
+	; -- Final output stage
+
+	movdqa    xmm4,xmm6
+	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
+
+	paddd     xmm6,[rel PD_DESCALE_P2_2]
+	psrad     xmm6,DESCALE_P2_2
+
+	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+	paddb     xmm6,[rel PB_CENTERJSAMP]
+
+	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
+	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
+
+	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+	mov	WORD [rdx+rax*SIZEOF_JSAMPLE], bx
+	mov	WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+
+	uncollect_args
+	pop	rbx
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jiss2red.asm b/simd/jiss2red.asm
new file mode 100644
index 0000000..238c61d
--- /dev/null
+++ b/simd/jiss2red.asm

@@ -0,0 +1,594 @@
+;
+; jiss2red.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13
+%define PASS1_BITS	2
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076	times 4 dw  F_1_847,-F_0_765
+PW_F256_F089	times 4 dw  F_2_562, F_0_899
+PW_F106_MF217	times 4 dw  F_1_061,-F_2_172
+PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021	times 4 dw  F_1_451,-F_0_211
+PW_F362_MF127	times 4 dw  F_3_624,-F_1_272
+PW_F085_MF072	times 4 dw  F_0_850,-F_0_720
+PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm0,xmm1
+	packsswb xmm0,xmm0
+	packsswb xmm0,xmm0
+	movd	eax,xmm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm0,PASS1_BITS
+
+	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm5,xmm0
+	punpcklwd xmm4,xmm1
+	punpckhwd xmm5,xmm1
+	movdqa    xmm0,xmm4
+	movdqa    xmm1,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]	; xmm4=(tmp2L)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]	; xmm5=(tmp2H)
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]	; xmm0=(tmp0L)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]	; xmm1=(tmp0H)
+
+	movdqa    xmm6,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm6,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm2,xmm6
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2L)
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm7=(tmp2H)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0L)
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]	; xmm3=(tmp0H)
+
+	paddd	xmm6,xmm4		; xmm6=tmp2L
+	paddd	xmm7,xmm5		; xmm7=tmp2H
+	paddd	xmm2,xmm0		; xmm2=tmp0L
+	paddd	xmm3,xmm1		; xmm3=tmp0H
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	pxor      xmm1,xmm1
+	pxor      xmm2,xmm2
+	punpcklwd xmm1,xmm4		; xmm1=tmp0L
+	punpckhwd xmm2,xmm4		; xmm2=tmp0H
+	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+	movdqa    xmm3,xmm5		; xmm5=in2=z2
+	punpcklwd xmm5,xmm0		; xmm0=in6=z3
+	punpckhwd xmm3,xmm0
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]	; xmm5=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]	; xmm3=tmp2H
+
+	movdqa	xmm4,xmm1
+	movdqa	xmm0,xmm2
+	paddd	xmm1,xmm5		; xmm1=tmp10L
+	paddd	xmm2,xmm3		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp12L
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	; -- Final output stage
+
+	movdqa	xmm5,xmm1
+	movdqa	xmm3,xmm2
+	paddd	xmm1,xmm6		; xmm1=data0L
+	paddd	xmm2,xmm7		; xmm2=data0H
+	psubd	xmm5,xmm6		; xmm5=data3L
+	psubd	xmm3,xmm7		; xmm3=data3H
+
+	movdqa	xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm6=[PD_DESCALE_P1_4]
+
+	paddd	xmm1,xmm6
+	paddd	xmm2,xmm6
+	psrad	xmm1,DESCALE_P1_4
+	psrad	xmm2,DESCALE_P1_4
+	paddd	xmm5,xmm6
+	paddd	xmm3,xmm6
+	psrad	xmm5,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
+
+	movdqa	xmm2,xmm4
+	movdqa	xmm3,xmm0
+	paddd	xmm4,xmm7		; xmm4=data1L
+	paddd	xmm0,xmm6		; xmm0=data1H
+	psubd	xmm2,xmm7		; xmm2=data2L
+	psubd	xmm3,xmm6		; xmm3=data2H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm7=[PD_DESCALE_P1_4]
+
+	paddd	xmm4,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm4,DESCALE_P1_4
+	psrad	xmm0,DESCALE_P1_4
+	paddd	xmm2,xmm7
+	paddd	xmm3,xmm7
+	psrad	xmm2,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	eax, [original_ebp]
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	pxor      xmm4,xmm4
+	punpcklwd xmm4,xmm1		; xmm4=tmp0
+	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+	; -- Odd part
+
+	punpckhwd xmm1,xmm0
+	punpckhwd xmm6,xmm3
+	movdqa    xmm5,xmm1
+	movdqa    xmm2,xmm6
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]	; xmm1=(tmp2)
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]	; xmm5=(tmp0)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0)
+
+	paddd     xmm6,xmm1		; xmm6=tmp2
+	paddd     xmm2,xmm5		; xmm2=tmp0
+
+	; -- Even part
+
+	punpcklwd xmm0,xmm3
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]	; xmm0=tmp2
+
+	movdqa    xmm7,xmm4
+	paddd     xmm4,xmm0		; xmm4=tmp10
+	psubd     xmm7,xmm0		; xmm7=tmp12
+
+	; -- Final output stage
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; xmm1=[PD_DESCALE_P2_4]
+
+	movdqa	xmm5,xmm4
+	movdqa	xmm3,xmm7
+	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
+	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
+	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
+	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+
+	paddd	xmm4,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm4,DESCALE_P2_4
+	psrad	xmm7,DESCALE_P2_4
+	paddd	xmm5,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm5,DESCALE_P2_4
+	psrad	xmm3,DESCALE_P2_4
+
+	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
+	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
+
+	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
+	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
+
+	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+	paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+	movd	XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [dct_table(ebp)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+	pcmpeqd   xmm7,xmm7
+	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
+	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
+	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
+	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
+	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
+	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
+	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
+	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
+	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
+	pmaddwd	xmm0,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	xmm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
+	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
+
+	; -- Even part
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
+	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
+	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
+	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+	; -- Final output stage
+
+	movdqa	xmm3,xmm6
+	movdqa	xmm5,xmm1
+	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; xmm2=[PD_DESCALE_P1_2]
+
+	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
+
+	movdqa     xmm7,xmm1
+	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
+	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
+
+	paddd	xmm6,xmm2
+	psrad	xmm6,DESCALE_P1_2
+
+	paddd	xmm1,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm1,DESCALE_P1_2
+	psrad	xmm7,DESCALE_P1_2
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
+
+	; -- Even part
+
+	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
+
+	; -- Final output stage
+
+	movdqa    xmm4,xmm6
+	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
+
+	paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
+	psrad     xmm6,DESCALE_P2_2
+
+	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+	paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
+	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jisseflt.asm b/simd/jisseflt.asm
new file mode 100644
index 0000000..d6147c1
--- /dev/null
+++ b/simd/jisseflt.asm

@@ -0,0 +1,572 @@
+;
+; jisseflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414	times 4 dd  1.414213562373095048801689
+PD_1_847	times 4 dd  1.847759065022573512256366
+PD_1_082	times 4 dd  1.082392200292393968799446
+PD_M2_613	times 4 dd -2.613125929752753055713286
+PD_0_125	times 4 dd  0.125	; 1/8
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)	(b)+8			; void * dct_table
+%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+20		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [dct_table(eax)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
+	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
+
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
+	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
+
+	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
+	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
+	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
+	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
+	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
+	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
+	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
+	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
+
+	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
+	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
+	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
+	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
+	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
+	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
+	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
+	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
+	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
+	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
+
+	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
+	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
+	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
+	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
+
+	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
+	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
+
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
+	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
+	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
+	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
+	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
+
+	mulps	xmm6,xmm1		; descale(1/8)
+	mulps	xmm7,xmm1		; descale(1/8)
+	mulps	xmm5,xmm1		; descale(1/8)
+	mulps	xmm0,xmm1		; descale(1/8)
+
+	movhlps   xmm3,xmm6
+	movhlps   xmm1,xmm7
+	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
+	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
+	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
+	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
+
+	movhlps   xmm6,xmm5
+	movhlps   xmm7,xmm0
+	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
+	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
+	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
+	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
+	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
+	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
+
+	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
+	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
+
+	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
+	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm5,xmm3
+	movaps	xmm0,xmm1
+	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
+	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
+	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
+
+	mulps	xmm3,xmm6		; descale(1/8)
+	mulps	xmm1,xmm6		; descale(1/8)
+	mulps	xmm5,xmm6		; descale(1/8)
+	mulps	xmm0,xmm6		; descale(1/8)
+
+	movhlps   xmm7,xmm3
+	movhlps   xmm2,xmm1
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
+	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
+	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
+	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
+	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
+
+	movhlps   xmm4,xmm5
+	movhlps   xmm6,xmm0
+	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
+	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
+	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
+	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
+	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
+	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm6
+	paddb     mm1,mm6
+	paddb     mm2,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
+	movq      mm3,mm2		; transpose coefficients(phase 1)
+	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
+	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm6,mm3		; transpose coefficients(phase 2)
+	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
+	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
+
+	movq      mm1,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jsimd.h b/simd/jsimd.h
new file mode 100644
index 0000000..c21cf29
--- /dev/null
+++ b/simd/jsimd.h

@@ -0,0 +1,503 @@
+/*
+ * simd/jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ */
+
+/* Bitmask for supported acceleration methods */
+
+#define JSIMD_NONE    0x00
+#define JSIMD_MMX     0x01
+#define JSIMD_3DNOW   0x02
+#define JSIMD_SSE     0x04
+#define JSIMD_SSE2    0x08
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_simd_cpu_support                 jSiCpuSupport
+#define jsimd_rgb_ycc_convert_mmx             jSRGBYCCM
+#define jsimd_extrgb_ycc_convert_mmx          jSEXTRGBYCCM
+#define jsimd_extrgbx_ycc_convert_mmx         jSEXTRGBXYCCM
+#define jsimd_extbgr_ycc_convert_mmx          jSEXTBGRYCCM
+#define jsimd_extbgrx_ycc_convert_mmx         jSEXTBGRXYCCM
+#define jsimd_extxbgr_ycc_convert_mmx         jSEXTXBGRYCCM
+#define jsimd_extxrgb_ycc_convert_mmx         jSEXTXRGBYCCM
+#define jsimd_ycc_rgb_convert_mmx             jSYCCRGBM
+#define jsimd_ycc_extrgb_convert_mmx          jSYCCEXTRGBM
+#define jsimd_ycc_extrgbx_convert_mmx         jSYCCEXTRGBXM
+#define jsimd_ycc_extbgr_convert_mmx          jSYCCEXTBGRM
+#define jsimd_ycc_extbgrx_convert_mmx         jSYCCEXTBGRXM
+#define jsimd_ycc_extxbgr_convert_mmx         jSYCCEXTXBGRM
+#define jsimd_ycc_extxrgb_convert_mmx         jSYCCEXTXRGBM
+#define jconst_rgb_ycc_convert_sse2           jSCRGBYCCS2
+#define jsimd_rgb_ycc_convert_sse2            jSRGBYCCS2
+#define jsimd_extrgb_ycc_convert_sse2         jSEXTRGBYCCS2
+#define jsimd_extrgbx_ycc_convert_sse2        jSEXTRGBXYCCS2
+#define jsimd_extbgr_ycc_convert_sse2         jSEXTBGRYCCS2
+#define jsimd_extbgrx_ycc_convert_sse2        jSEXTBGRXYCCS2
+#define jsimd_extxbgr_ycc_convert_sse2        jSEXTXBGRYCCS2
+#define jsimd_extxrgb_ycc_convert_sse2        jSEXTXRGBYCCS2
+#define jconst_ycc_rgb_convert_sse2           jSCYCCRGBS2
+#define jsimd_ycc_rgb_convert_sse2            jSYCCRGBS2
+#define jsimd_ycc_extrgb_convert_sse2         jSYCCEXTRGBS2
+#define jsimd_ycc_extrgbx_convert_sse2        jSYCCEXTRGBXS2
+#define jsimd_ycc_extbgr_convert_sse2         jSYCCEXTBGRS2
+#define jsimd_ycc_extbgrx_convert_sse2        jSYCCEXTBGRXS2
+#define jsimd_ycc_extxbgr_convert_sse2        jSYCCEXTXBGRS2
+#define jsimd_ycc_extxrgb_convert_sse2        jSYCCEXTXRGBS2
+#define jsimd_h2v2_downsample_mmx             jSDnH2V2M
+#define jsimd_h2v1_downsample_mmx             jSDnH2V1M
+#define jsimd_h2v2_downsample_sse2            jSDnH2V2S2
+#define jsimd_h2v1_downsample_sse2            jSDnH2V1S2
+#define jsimd_h2v2_upsample_mmx               jSUpH2V2M
+#define jsimd_h2v1_upsample_mmx               jSUpH2V1M
+#define jsimd_h2v2_fancy_upsample_mmx         jSFUpH2V2M
+#define jsimd_h2v1_fancy_upsample_mmx         jSFUpH2V1M
+#define jsimd_h2v2_merged_upsample_mmx        jSMUpH2V2M
+#define jsimd_h2v2_extrgb_merged_upsample_mmx jSMUpH2V2EXTRGBM
+#define jsimd_h2v2_extrgbx_merged_upsample_mmx jSMUpH2V2EXTRGBXM
+#define jsimd_h2v2_extbgr_merged_upsample_mmx jSMUpH2V2EXTBGRM
+#define jsimd_h2v2_extbgrx_merged_upsample_mmx jSMUpH2V2EXTBGRXM
+#define jsimd_h2v2_extxbgr_merged_upsample_mmx jSMUpH2V2EXTXBGRM
+#define jsimd_h2v2_extxrgb_merged_upsample_mmx jSMUpH2V2EXTXRGBM
+#define jsimd_h2v1_merged_upsample_mmx        jSMUpH2V1M
+#define jsimd_h2v1_extrgb_merged_upsample_mmx jSMUpH2V1EXTRGBM
+#define jsimd_h2v1_extrgbx_merged_upsample_mmx jSMUpH2V1EXTRGBXM
+#define jsimd_h2v1_extbgr_merged_upsample_mmx jSMUpH2V1EXTBGRM
+#define jsimd_h2v1_extbgrx_merged_upsample_mmx jSMUpH2V1EXTBGRXM
+#define jsimd_h2v1_extxbgr_merged_upsample_mmx jSMUpH2V1EXTXBGRM
+#define jsimd_h2v1_extxrgb_merged_upsample_mmx jSMUpH2V1EXTXRGBM
+#define jsimd_h2v2_upsample_sse2              jSUpH2V2S2
+#define jsimd_h2v1_upsample_sse2              jSUpH2V1S2
+#define jconst_fancy_upsample_sse2            jSCFUpS2
+#define jsimd_h2v2_fancy_upsample_sse2        jSFUpH2V2S2
+#define jsimd_h2v1_fancy_upsample_sse2        jSFUpH2V1S2
+#define jconst_merged_upsample_sse2           jSCMUpS2
+#define jsimd_h2v2_merged_upsample_sse2       jSMUpH2V2S2
+#define jsimd_h2v2_extrgb_merged_upsample_sse2 jSMUpH2V2EXTRGBS2
+#define jsimd_h2v2_extrgbx_merged_upsample_sse2 jSMUpH2V2EXTRGBXS2
+#define jsimd_h2v2_extbgr_merged_upsample_sse2 jSMUpH2V2EXTBGRS2
+#define jsimd_h2v2_extbgrx_merged_upsample_sse2 jSMUpH2V2EXTBGRXS2
+#define jsimd_h2v2_extxbgr_merged_upsample_sse2 jSMUpH2V2EXTXBGRS2
+#define jsimd_h2v2_extxrgb_merged_upsample_sse2 jSMUpH2V2EXTXRGBS2
+#define jsimd_h2v1_merged_upsample_sse2       jSMUpH2V1S2
+#define jsimd_h2v1_extrgb_merged_upsample_sse2 jSMUpH2V1EXTRGBS2
+#define jsimd_h2v1_extrgbx_merged_upsample_sse2 jSMUpH2V1EXTRGBXS2
+#define jsimd_h2v1_extbgr_merged_upsample_sse2 jSMUpH2V1EXTBGRS2
+#define jsimd_h2v1_extbgrx_merged_upsample_sse2 jSMUpH2V1EXTBGRXS2
+#define jsimd_h2v1_extxbgr_merged_upsample_sse2 jSMUpH2V1EXTXBGRS2
+#define jsimd_h2v1_extxrgb_merged_upsample_sse2 jSMUpH2V1EXTXRGBS2
+#define jsimd_convsamp_mmx                    jSConvM
+#define jsimd_convsamp_sse2                   jSConvS2
+#define jsimd_convsamp_float_3dnow            jSConvF3D
+#define jsimd_convsamp_float_sse              jSConvFS
+#define jsimd_convsamp_float_sse2             jSConvFS2
+#define jsimd_fdct_islow_mmx                  jSFDMIS
+#define jsimd_fdct_ifast_mmx                  jSFDMIF
+#define jconst_fdct_islow_sse2                jSCFDS2IS
+#define jsimd_fdct_islow_sse2                 jSFDS2IS
+#define jconst_fdct_ifast_sse2                jSCFDS2IF
+#define jsimd_fdct_ifast_sse2                 jSFDS2IF
+#define jsimd_fdct_float_3dnow                jSFD3DF
+#define jconst_fdct_float_sse                 jSCFDSF
+#define jsimd_fdct_float_sse                  jSFDSF
+#define jsimd_quantize_mmx                    jSQuantM
+#define jsimd_quantize_sse2                   jSQuantS2
+#define jsimd_quantize_float_3dnow            jSQuantF3D
+#define jsimd_quantize_float_sse              jSQuantFS
+#define jsimd_quantize_float_sse2             jSQuantFS2
+#define jsimd_idct_2x2_mmx                    jSIDM22
+#define jsimd_idct_4x4_mmx                    jSIDM44
+#define jconst_idct_red_sse2                  jSCIDS2R
+#define jsimd_idct_2x2_sse2                   jSIDS222
+#define jsimd_idct_4x4_sse2                   jSIDS244
+#define jsimd_idct_islow_mmx                  jSIDMIS
+#define jsimd_idct_ifast_mmx                  jSIDMIF
+#define jconst_idct_islow_sse2                jSCIDS2IS
+#define jsimd_idct_islow_sse2                 jSIDS2IS
+#define jconst_idct_ifast_sse2                jSCIDS2IF
+#define jsimd_idct_ifast_sse2                 jSIDS2IF
+#define jsimd_idct_float_3dnow                jSID3DF
+#define jconst_fdct_float_sse                 jSCIDSF
+#define jsimd_idct_float_sse                  jSIDSF
+#define jconst_fdct_float_sse2                jSCIDS2F
+#define jsimd_idct_float_sse2                 jSIDS2F
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+
+/* SIMD Color Space Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+/* SIMD Downsample */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(void) jsimd_h2v2_downsample_sse2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_sse2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+/* SIMD Upsample */
+EXTERN(void) jsimd_h2v2_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+EXTERN(void) jsimd_h2v2_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+/* SIMD Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
+                                     JDIMENSION start_col,
+                                     DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data,
+                                      JDIMENSION start_col,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
+                                             JDIMENSION start_col,
+                                             FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data,
+                                           JDIMENSION start_col,
+                                           FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data,
+                                            JDIMENSION start_col,
+                                            FAST_FLOAT * workspace));
+
+/* SIMD Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+
+extern const int jconst_fdct_ifast_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data));
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
+
+EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * data));
+
+/* SIMD Quantization */
+EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
+                                     DCTELEM * divisors,
+                                     DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
+                                      DCTELEM * divisors,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
+                                             FAST_FLOAT * divisors,
+                                             FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block,
+                                           FAST_FLOAT * divisors,
+                                           FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
+                                            FAST_FLOAT * divisors,
+                                            FAST_FLOAT * workspace));
+
+/* SIMD Reduced Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+
+/* SIMD Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
+                                         JCOEFPTR coef_block,
+                                         JSAMPARRAY output_buf,
+                                         JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+

diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
new file mode 100644
index 0000000..f5aec18
--- /dev/null
+++ b/simd/jsimd_i386.c

@@ -0,0 +1,956 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+
+static unsigned int simd_support = ~0;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEMMX");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_MMX;
+  env = getenv("JSIMD_FORCE3DNOW");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_3DNOW;
+  env = getenv("JSIMD_FORCESSE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_SSE;
+  env = getenv("JSIMD_FORCESSE2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_SSE2;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_extrgb_ycc_convert_sse2;
+      mmxfct=jsimd_extrgb_ycc_convert_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
+      mmxfct=jsimd_extrgbx_ycc_convert_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_extbgr_ycc_convert_sse2;
+      mmxfct=jsimd_extbgr_ycc_convert_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
+      mmxfct=jsimd_extbgrx_ycc_convert_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
+      mmxfct=jsimd_extxbgr_ycc_convert_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
+      mmxfct=jsimd_extxrgb_ycc_convert_mmx;
+      break;
+    default:
+      sse2fct=jsimd_rgb_ycc_convert_sse2;
+      mmxfct=jsimd_rgb_ycc_convert_mmx;
+      break;
+  }
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    sse2fct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_MMX)
+    mmxfct(cinfo->image_width, input_buf,
+        output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_ycc_extrgb_convert_sse2;
+      mmxfct=jsimd_ycc_extrgb_convert_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
+      mmxfct=jsimd_ycc_extrgbx_convert_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_ycc_extbgr_convert_sse2;
+      mmxfct=jsimd_ycc_extbgr_convert_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
+      mmxfct=jsimd_ycc_extbgrx_convert_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
+      mmxfct=jsimd_ycc_extxbgr_convert_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
+      mmxfct=jsimd_ycc_extxrgb_convert_mmx;
+      break;
+    default:
+      sse2fct=jsimd_ycc_rgb_convert_sse2;
+      mmxfct=jsimd_ycc_rgb_convert_mmx;
+      break;
+  }
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    sse2fct(cinfo->output_width, input_buf,
+        input_row, output_buf, num_rows);
+  else if (simd_support & JSIMD_MMX)
+    mmxfct(cinfo->output_width, input_buf,
+        input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, compptr->width_in_blocks,
+        input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
+        cinfo->output_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
+      break;
+    default:
+      sse2fct=jsimd_h2v2_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_merged_upsample_mmx;
+      break;
+  }
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_MMX)
+    mmxfct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
+      break;
+    default:
+      sse2fct=jsimd_h2v1_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_merged_upsample_mmx;
+      break;
+  }
+
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_MMX)
+    mmxfct(cinfo->output_width, input_buf,
+        in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_islow_sse2(data);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_fdct_islow_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_fdct_ifast_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    jsimd_fdct_float_sse(data);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block,
+        output_buf, output_col);
+  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    jsimd_idct_float_sse(compptr->dct_table, coef_block,
+        output_buf, output_col);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
+        output_buf, output_col);
+}
+

diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
new file mode 100644
index 0000000..0da564c
--- /dev/null
+++ b/simd/jsimd_x86_64.c

@@ -0,0 +1,680 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * x86_64 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "simd/jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order) (((unsigned long)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_extrgb_ycc_convert_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_extbgr_ycc_convert_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
+      break;
+    default:
+      sse2fct=jsimd_rgb_ycc_convert_sse2;
+      break;
+  }
+
+  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_ycc_extrgb_convert_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_ycc_extbgr_convert_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
+      break;
+    default:
+      sse2fct=jsimd_ycc_rgb_convert_sse2;
+      break;
+  }
+
+  sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_sse2(cinfo->image_width,
+                             cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor,
+                             compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_sse2(cinfo->image_width,
+                             cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor,
+                             compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
+                           cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
+                           cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width,
+                                 input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width,
+                                 input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
+      break;
+    default:
+      sse2fct=jsimd_h2v2_merged_upsample_sse2;
+      break;
+  }
+
+  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
+      break;
+    default:
+      sse2fct=jsimd_h2v1_merged_upsample_sse2;
+      break;
+  }
+
+  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+  jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+  jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+  jsimd_fdct_ifast_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+  jsimd_fdct_float_sse(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+  jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if (!IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block,
+                        output_buf, output_col);
+}
+

diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
new file mode 100644
index 0000000..4876038
--- /dev/null
+++ b/simd/jsimdcfg.inc.h

@@ -0,0 +1,168 @@
+// This file generates the include file for the assembly
+// implementations by abusing the C preprocessor.
+//
+// Note: Some things are manually defined as they need to
+// be mapped to NASM types.
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+#define define(var) %define _cpp_protection_##var
+#define definev(var) %define _cpp_protection_##var var
+
+;
+; -- jpeglib.h
+;
+
+definev(DCTSIZE)
+definev(DCTSIZE2)
+
+;
+; -- jmorecfg.h
+;
+
+definev(RGB_RED)
+definev(RGB_GREEN)
+definev(RGB_BLUE)
+
+definev(RGB_PIXELSIZE)
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE                 byte          ; unsigned char
+%define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
+
+definev(CENTERJSAMPLE)
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF                   word          ; short
+%define SIZEOF_JCOEF            SIZEOF_WORD   ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION              dword         ; unsigned int
+%define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
+
+%define JSAMPROW                POINTER       ; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR                POINTER       ; JCOEF FAR *   (jpeglib.h)
+%define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR         SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM                 word          ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD   ; sizeof(DCTELEM)
+
+%define FAST_FLOAT              FP32            ; float
+%define SIZEOF_FAST_FLOAT       SIZEOF_FP32     ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE         word          ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD   ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE         word          ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD   ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2             ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE         FP32          ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32   ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+definev(JSIMD_NONE)
+definev(JSIMD_MMX)
+definev(JSIMD_3DNOW)
+definev(JSIMD_SSE)
+definev(JSIMD_SSE2)
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+definev(jpeg_simd_cpu_support)
+definev(jsimd_rgb_ycc_convert_mmx)
+definev(jsimd_ycc_rgb_convert_mmx)
+definev(jconst_rgb_ycc_convert_sse2)
+definev(jsimd_rgb_ycc_convert_sse2)
+definev(jconst_ycc_rgb_convert_sse2)
+definev(jsimd_ycc_rgb_convert_sse2)
+definev(jsimd_h2v2_downsample_mmx)
+definev(jsimd_h2v1_downsample_mmx)
+definev(jsimd_h2v2_downsample_sse2)
+definev(jsimd_h2v1_downsample_sse2)
+definev(jsimd_h2v2_upsample_mmx)
+definev(jsimd_h2v1_upsample_mmx)
+definev(jsimd_h2v1_fancy_upsample_mmx)
+definev(jsimd_h2v2_fancy_upsample_mmx)
+definev(jsimd_h2v1_merged_upsample_mmx)
+definev(jsimd_h2v2_merged_upsample_mmx)
+definev(jsimd_h2v2_upsample_sse2)
+definev(jsimd_h2v1_upsample_sse2)
+definev(jconst_fancy_upsample_sse2)
+definev(jsimd_h2v1_fancy_upsample_sse2)
+definev(jsimd_h2v2_fancy_upsample_sse2)
+definev(jconst_merged_upsample_sse2)
+definev(jsimd_h2v1_merged_upsample_sse2)
+definev(jsimd_h2v2_merged_upsample_sse2)
+definev(jsimd_convsamp_mmx)
+definev(jsimd_convsamp_sse2)
+definev(jsimd_convsamp_float_3dnow)
+definev(jsimd_convsamp_float_sse)
+definev(jsimd_convsamp_float_sse2)
+definev(jsimd_fdct_islow_mmx)
+definev(jsimd_fdct_ifast_mmx)
+definev(jconst_fdct_islow_sse2)
+definev(jsimd_fdct_islow_sse2)
+definev(jconst_fdct_ifast_sse2)
+definev(jsimd_fdct_ifast_sse2)
+definev(jsimd_fdct_float_3dnow)
+definev(jconst_fdct_float_sse)
+definev(jsimd_fdct_float_sse)
+definev(jsimd_quantize_mmx)
+definev(jsimd_quantize_sse2)
+definev(jsimd_quantize_float_3dnow)
+definev(jsimd_quantize_float_sse)
+definev(jsimd_quantize_float_sse2)
+definev(jsimd_idct_2x2_mmx)
+definev(jsimd_idct_4x4_mmx)
+definev(jconst_idct_red_sse2)
+definev(jsimd_idct_2x2_sse2)
+definev(jsimd_idct_4x4_sse2)
+definev(jsimd_idct_islow_mmx)
+definev(jsimd_idct_ifast_mmx)
+definev(jconst_idct_islow_sse2)
+definev(jsimd_idct_islow_sse2)
+definev(jconst_idct_ifast_sse2)
+definev(jsimd_idct_ifast_sse2)
+definev(jsimd_idct_float_3dnow)
+definev(jconst_idct_float_sse)
+definev(jsimd_idct_float_sse)
+definev(jconst_idct_float_sse2)
+definev(jsimd_idct_float_sse2)
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+

diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm
new file mode 100644
index 0000000..bdbcc23
--- /dev/null
+++ b/simd/jsimdcpu.asm

@@ -0,0 +1,105 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+
+	align	16
+	global	EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+	push	edi
+
+	xor	edi,edi			; simd support flag
+
+	pushfd
+	pop	eax
+	mov	edx,eax
+	xor	eax, 1<<21		; flip ID bit in EFLAGS
+	push	eax
+	popfd
+	pushfd
+	pop	eax
+	xor	eax,edx
+	jz	short .return		; CPUID is not supported
+
+	; Check for MMX instruction support
+	xor	eax,eax
+	cpuid
+	test	eax,eax
+	jz	short .return
+
+	xor	eax,eax
+	inc	eax
+	cpuid
+	mov	eax,edx			; eax = Standard feature flags
+
+	test	eax, 1<<23		; bit23:MMX
+	jz	short .no_mmx
+	or	edi, byte JSIMD_MMX
+.no_mmx:
+	test	eax, 1<<25		; bit25:SSE
+	jz	short .no_sse
+	or	edi, byte JSIMD_SSE
+.no_sse:
+	test	eax, 1<<26		; bit26:SSE2
+	jz	short .no_sse2
+	or	edi, byte JSIMD_SSE2
+.no_sse2:
+
+	; Check for 3DNow! instruction support
+	mov	eax, 0x80000000
+	cpuid
+	cmp	eax, 0x80000000
+	jbe	short .return
+
+	mov	eax, 0x80000001
+	cpuid
+	mov	eax,edx			; eax = Extended feature flags
+
+	test	eax, 1<<31		; bit31:3DNow!(vendor independent)
+	jz	short .no_3dnow
+	or	edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+	mov	eax,edi
+
+	pop	edi
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16

diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
new file mode 100644
index 0000000..4695360
--- /dev/null
+++ b/simd/jsimdext.inc

@@ -0,0 +1,317 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+;
+; [TAB8]
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32	; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .rdata align=16 public use32 class=CONST
+
+%elifdef OBJ32	; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .data  align=16 public use32 class=DATA
+
+%elifdef ELF	; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT    .text   progbits align=16
+%define SEG_CONST   .rodata progbits align=16
+%else
+%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
+%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC
+%define EXTN(name)  name			; foo() -> foo
+
+%elifdef AOUT	; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_	; BSD-style a.out supports PIC
+
+%elifdef MACHO	; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  ;align=16	; nasm doesn't accept align=16. why?
+%define SEG_CONST   .rodata align=16
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_		; Mach-O style code-relative addressing
+
+%else		; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+%endif	; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+;  Common types
+;
+%ifdef __x86_64__
+%define POINTER                 qword           ; general pointer type
+%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
+%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%else
+%define POINTER                 dword           ; general pointer type
+%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
+%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%endif
+
+%define INT                     dword           ; signed integer type
+%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
+%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT
+
+%define FP32                    dword           ; IEEE754 single
+%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
+%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD                  qword           ; int64  (MMX register)
+%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
+%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD                                 ; int128 (SSE register)
+%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
+%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE             1               ; sizeof(BYTE)
+%define SIZEOF_WORD             2               ; sizeof(WORD)
+%define SIZEOF_DWORD            4               ; sizeof(DWORD)
+%define SIZEOF_QWORD            8               ; sizeof(QWORD)
+%define SIZEOF_OWORD            16              ; sizeof(OWORD)
+
+%define BYTE_BIT                8               ; CHAR_BIT in C
+%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
+%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)   _ %+ name		; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+	SECTION	SEG_CONST
+const_base:
+
+%define GOTOFF(got,sym) (got) + (sym) - const_base
+
+%imacro get_GOT	1
+	; NOTE: this macro destroys ecx resister.
+	call	%%geteip
+	add	ecx, byte (%%ref - $)
+	jmp	short %%adjust
+%%geteip:
+	mov	ecx, POINTER [esp]
+	ret
+%%adjust:
+	push	ebp
+	xor	ebp,ebp		; ebp = 0
+%ifidni %1,ebx	; (%1 == ebx)
+	; db 0x8D,0x9C + jmp near const_base =
+	;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+	db	0x8D,0x9C		; 8D,9C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+	; db 0x8D,0x8C + jmp near const_base =
+	;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+	db	0x8D,0x8C		; 8D,8C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:	mov	%1, ecx
+%endif ; (%1 == ebx)
+	pop	ebp
+%endmacro
+
+%else	; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT	1
+	extern	GOT_SYMBOL
+	call	%%geteip
+	add	%1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+	jmp	short %%done
+%%geteip:
+	mov	%1, POINTER [esp]
+	ret
+%%done:
+%endmacro
+
+%endif	; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic	1.nolist
+	push	%1
+%endmacro
+%imacro poppic	1.nolist
+	pop	%1
+%endmacro
+%imacro movpic	2.nolist
+	mov	%1,%2
+%endmacro
+
+%else	; !PIC -----------------------------------------
+
+%define GOTOFF(got,sym) (sym)
+
+%imacro get_GOT	1.nolist
+%endmacro
+%imacro pushpic	1.nolist
+%endmacro
+%imacro poppic	1.nolist
+%endmacro
+%imacro movpic	2.nolist
+%endmacro
+
+%endif	;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary.
+;  ".balign n,,m" in GNU as
+;
+%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b,n)  (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs:	times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
+	       db 0x90                               ; nop
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
+	       db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
+	       db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
+	       db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
+	       db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
+	       db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
+	       db 0x8B,0xED                          ; mov ebp,ebp
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
+	       db 0x90                               ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+	align %1, db 0		; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+%imacro collect_args 0
+	push r10
+	push r11
+	push r12
+	push r13
+	push r14
+	push r15
+	mov r10, rdi
+	mov r11, rsi
+	mov r12, rdx
+	mov r13, rcx
+	mov r14, r8
+	mov r15, r9
+%endmacro
+
+%imacro uncollect_args 0
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+%endmacro
+
+%endif
+
+; --------------------------------------------------------------------------
+;  Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------

diff --git a/simd/nasm_lt.sh b/simd/nasm_lt.sh
new file mode 100755
index 0000000..b112862
--- /dev/null
+++ b/simd/nasm_lt.sh

@@ -0,0 +1,57 @@
+#! /bin/sh
+command=""
+infile=""
+o_opt=no
+pic=no
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -DPIC|-fPIC|-fpic)
+            if [ "$pic" != "yes" ] ; then
+                command="$command -DPIC"
+                pic=yes
+            fi
+            ;;
+        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
+        -fobj|-fwin32|-frdf|-fieee|-fmacho|-fmacho64)
+            # it's a file format specifier for nasm.
+            command="$command $1"
+            ;;
+        -f*)
+            # maybe a code-generation flag for gcc.
+            ;;
+        -[Ii]*)
+            incdir=`echo "$1" | sed 's/^-[Ii]//'`
+            if [ "x$incdir" = x -a "x$2" != x ] ; then
+                case "$2" in
+                    -*) ;;
+                    *) incdir="$2"; shift;;
+                esac
+            fi
+            if [ "x$incdir" != x ] ; then
+                # In the case of NASM, the trailing slash is necessary.
+                incdir=`echo "$incdir" | sed 's%/*$%/%'`
+                command="$command -I$incdir"
+            fi
+            ;;
+        -o*)
+            o_opt=yes
+            command="$command $1"
+            ;;
+        *.asm)
+            infile=$1
+            command="$command $1"
+            ;;
+        *)
+            command="$command $1"
+            ;;
+    esac
+    shift
+done
+if [ "$o_opt" != yes ] ; then
+    # By default, NASM creates an output file
+    # in the same directory as the input file.
+    outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
+    command="$command $outfile"
+fi
+echo $command
+exec $command

diff --git a/testimg.bmp b/testimg.bmp
index 8603d15..d67e025 100644
--- a/testimg.bmp
+++ b/testimg.bmp
Binary files differ

diff --git a/testimg.jpg b/testimg.jpg
index b34ca5d..76d223f 100644
--- a/testimg.jpg
+++ b/testimg.jpg
Binary files differ

diff --git a/testimg.ppm b/testimg.ppm
index 9d81ce2..b391684 100644
--- a/testimg.ppm
+++ b/testimg.ppm
Binary files differ

diff --git a/testimgp.jpg b/testimgp.jpg
index 8cbb658..b78f6d0 100644
--- a/testimgp.jpg
+++ b/testimgp.jpg
Binary files differ

diff --git a/turbojpeg.h b/turbojpeg.h
new file mode 100644
index 0000000..808e2f3
--- /dev/null
+++ b/turbojpeg.h

@@ -0,0 +1,231 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005, 2006 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
+#if (defined(_MSC_VER) || defined(__CYGWIN__) || defined(__MINGW32__)) && defined(_WIN32) && defined(DLLDEFINE)
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
+#define DLLCALL
+
+/* Subsampling */
+#define NUMSUBOPT 4
+
+enum {TJ_444=0, TJ_422, TJ_420, TJ_GRAYSCALE};
+
+/* Flags */
+#define TJ_BGR       1
+#define TJ_BOTTOMUP  2
+#define TJ_FORCEMMX  8   /* Force IPP to use MMX code even if SSE available */
+#define TJ_FORCESSE  16  /* Force IPP to use SSE1 code even if SSE2 available */
+#define TJ_FORCESSE2 32  /* Force IPP to use SSE2 code (useful if auto-detect is not working properly) */
+#define TJ_ALPHAFIRST 64 /* BGR buffer is ABGR and RGB buffer is ARGB */
+#define TJ_FORCESSE3 128 /* Force IPP to use SSE3 code (useful if auto-detect is not working properly) */
+#define TJ_FASTUPSAMPLE 256 /* Use fast, inaccurate 4:2:2 and 4:2:0 YUV upsampling routines in libjpeg decompressor */
+
+typedef void* tjhandle;
+
+#define TJPAD(p) (((p)+3)&(~3))
+#ifndef max
+ #define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* API follows */
+
+
+/*
+  tjhandle tjInitCompress(void)
+
+  Creates a new JPEG compressor instance, allocates memory for the structures,
+  and returns a handle to the instance.  Most applications will only
+  need to call this once at the beginning of the program or once for each
+  concurrent thread.  Don't try to create a new instance every time you
+  compress an image, because this will cause performance to suffer.
+
+  RETURNS: NULL on error
+*/
+DLLEXPORT tjhandle DLLCALL tjInitCompress(void);
+
+
+/*
+  int tjCompress(tjhandle j,
+     unsigned char *srcbuf, int width, int pitch, int height, int pixelsize,
+     unsigned char *dstbuf, unsigned long *size,
+     int jpegsubsamp, int jpegqual, int flags)
+
+  [INPUT] j = instance handle previously returned from a call to
+     tjInitCompress()
+  [INPUT] srcbuf = pointer to user-allocated image buffer containing pixels in
+     RGB(A) or BGR(A) form
+  [INPUT] width =  width (in pixels) of the source image
+  [INPUT] pitch = bytes per line of the source image (width*pixelsize if the
+     bitmap is unpadded, else TJPAD(width*pixelsize) if each line of the bitmap
+     is padded to the nearest 32-bit boundary, such as is the case for Windows
+     bitmaps.  You can also be clever and use this parameter to skip lines, etc.,
+     as long as the pitch is greater than 0.)
+  [INPUT] height = height (in pixels) of the source image
+  [INPUT] pixelsize = size (in bytes) of each pixel in the source image
+     RGBA and BGRA: 4, RGB and BGR: 3
+  [INPUT] dstbuf = pointer to user-allocated image buffer which will receive
+     the JPEG image.  Use the macro TJBUFSIZE(width, height) to determine
+     the appropriate size for this buffer based on the image width and height.
+  [OUTPUT] size = pointer to unsigned long which receives the size (in bytes)
+     of the compressed image
+  [INPUT] jpegsubsamp = Specifies either 4:2:0, 4:2:2, or 4:4:4 subsampling.
+     When the image is converted from the RGB to YCbCr colorspace as part of the
+     JPEG compression process, every other Cb and Cr (chrominance) pixel can be
+     discarded to produce a smaller image with little perceptible loss of
+     image clarity (the human eye is more sensitive to small changes in
+     brightness than small changes in color.)
+
+     TJ_420: 4:2:0 subsampling.  Discards every other Cb, Cr pixel in both
+        horizontal and vertical directions.
+     TJ_422: 4:2:2 subsampling.  Discards every other Cb, Cr pixel only in
+        the horizontal direction.
+     TJ_444: no subsampling.
+     TJ_GRAYSCALE: Generate grayscale JPEG image
+
+  [INPUT] jpegqual = JPEG quality (an integer between 0 and 100 inclusive.)
+  [INPUT] flags = the bitwise OR of one or more of the following
+
+     TJ_BGR: The components of each pixel in the source image are stored in
+        B,G,R order, not R,G,B
+     TJ_BOTTOMUP: The source image is stored in bottom-up (Windows) order,
+        not top-down
+     TJ_FORCEMMX: Valid only for the Intel Performance Primitives implementation
+        of this codec-- force IPP to use MMX code (bypass CPU auto-detection)
+     TJ_FORCESSE: Valid only for the Intel Performance Primitives implementation
+        of this codec-- force IPP to use SSE code (bypass CPU auto-detection)
+     TJ_FORCESSE2: Valid only for the Intel Performance Primitives implementation
+        of this codec-- force IPP to use SSE2 code (bypass CPU auto-detection)
+     TJ_FORCESSE3: Valid only for the Intel Performance Primitives implementation
+        of this codec-- force IPP to use SSE3 code (bypass CPU auto-detection)
+
+  RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjCompress(tjhandle j,
+	unsigned char *srcbuf, int width, int pitch, int height, int pixelsize,
+	unsigned char *dstbuf, unsigned long *size,
+	int jpegsubsamp, int jpegqual, int flags);
+
+DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height);
+
+/*
+  tjhandle tjInitDecompress(void)
+
+  Creates a new JPEG decompressor instance, allocates memory for the
+  structures, and returns a handle to the instance.  Most applications will
+  only need to call this once at the beginning of the program or once for each
+  concurrent thread.  Don't try to create a new instance every time you
+  decompress an image, because this will cause performance to suffer.
+
+  RETURNS: NULL on error
+*/
+DLLEXPORT tjhandle DLLCALL tjInitDecompress(void);
+
+
+/*
+  int tjDecompressHeader(tjhandle j,
+     unsigned char *srcbuf, unsigned long size,
+     int *width, int *height)
+
+  [INPUT] j = instance handle previously returned from a call to
+     tjInitDecompress()
+  [INPUT] srcbuf = pointer to a user-allocated buffer containing the JPEG image
+     to decompress
+  [INPUT] size = size of the JPEG image buffer (in bytes)
+  [OUTPUT] width = width (in pixels) of the JPEG image
+  [OUTPUT] height = height (in pixels) of the JPEG image
+
+  RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle j,
+	unsigned char *srcbuf, unsigned long size,
+	int *width, int *height);
+
+
+/*
+  int tjDecompress(tjhandle j,
+     unsigned char *srcbuf, unsigned long size,
+     unsigned char *dstbuf, int width, int pitch, int height, int pixelsize,
+     int flags)
+
+  [INPUT] j = instance handle previously returned from a call to
+     tjInitDecompress()
+  [INPUT] srcbuf = pointer to a user-allocated buffer containing the JPEG image
+     to decompress
+  [INPUT] size = size of the JPEG image buffer (in bytes)
+  [INPUT] dstbuf = pointer to user-allocated image buffer which will receive
+     the bitmap image.  This buffer should normally be pitch*height
+     bytes in size, although this pointer may also be used to decompress into
+     a specific region of a larger buffer.
+  [INPUT] width =  width (in pixels) of the destination image
+  [INPUT] pitch = bytes per line of the destination image (width*pixelsize if the
+     bitmap is unpadded, else TJPAD(width*pixelsize) if each line of the bitmap
+     is padded to the nearest 32-bit boundary, such as is the case for Windows
+     bitmaps.  You can also be clever and use this parameter to skip lines, etc.,
+     as long as the pitch is greater than 0.)
+  [INPUT] height = height (in pixels) of the destination image
+  [INPUT] pixelsize = size (in bytes) of each pixel in the destination image
+     RGBA/RGBx and BGRA/BGRx: 4, RGB and BGR: 3
+  [INPUT] flags = the bitwise OR of one or more of the following
+
+     TJ_BGR: The components of each pixel in the destination image should be
+        written in B,G,R order, not R,G,B
+     TJ_BOTTOMUP: The destination image should be stored in bottom-up
+        (Windows) order, not top-down
+     TJ_FORCEMMX: Valid only for the Intel Performance Primitives implementation
+        of this codec-- force IPP to use MMX code (bypass CPU auto-detection)
+     TJ_FORCESSE: Valid only for the Intel Performance Primitives implementation
+        of this codec-- force IPP to use SSE code (bypass CPU auto-detection)
+     TJ_FORCESSE2: Valid only for the Intel Performance Primitives implementation
+        of this codec-- force IPP to use SSE2 code (bypass CPU auto-detection)
+
+  RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjDecompress(tjhandle j,
+	unsigned char *srcbuf, unsigned long size,
+	unsigned char *dstbuf, int width, int pitch, int height, int pixelsize,
+	int flags);
+
+
+/*
+  int tjDestroy(tjhandle h)
+
+  Frees structures associated with a compression or decompression instance
+  
+  [INPUT] h = instance handle (returned from a previous call to
+     tjInitCompress() or tjInitDecompress()
+
+  RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjDestroy(tjhandle h);
+
+
+/*
+  char *tjGetErrorStr(void)
+  
+  Returns a descriptive error message explaining why the last command failed
+*/
+DLLEXPORT char* DLLCALL tjGetErrorStr(void);
+
+#ifdef __cplusplus
+}
+#endif

diff --git a/turbojpegl.c b/turbojpegl.c
new file mode 100644
index 0000000..82acf23
--- /dev/null
+++ b/turbojpegl.c

@@ -0,0 +1,354 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version.  The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * wxWindows Library License for more details.
+ */
+
+// This implements a JPEG compressor/decompressor using the libjpeg API
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <jpeglib.h>
+#include <jerror.h>
+#include <setjmp.h>
+#include "./turbojpeg.h"
+
+
+// Error handling
+
+static char lasterror[JMSG_LENGTH_MAX]="No error";
+
+typedef struct _error_mgr
+{
+	struct jpeg_error_mgr pub;
+	jmp_buf jb;
+} error_mgr;
+
+static void my_error_exit(j_common_ptr cinfo)
+{
+	error_mgr *myerr = (error_mgr *)cinfo->err;
+	(*cinfo->err->output_message)(cinfo);
+	longjmp(myerr->jb, 1);
+}
+
+static void my_output_message(j_common_ptr cinfo)
+{
+	(*cinfo->err->format_message)(cinfo, lasterror);
+}
+
+
+// Global structures, macros, etc.
+
+typedef struct _jpgstruct
+{
+	struct jpeg_compress_struct cinfo;
+	struct jpeg_decompress_struct dinfo;
+	struct jpeg_destination_mgr jdms;
+	struct jpeg_source_mgr jsms;
+	error_mgr jerr;
+	int initc, initd;
+} jpgstruct;
+
+static const int hsampfactor[NUMSUBOPT]={1, 2, 2, 1};
+static const int vsampfactor[NUMSUBOPT]={1, 1, 2, 1};
+
+#define _throw(c) {sprintf(lasterror, "%s", c);  return -1;}
+#define _catch(f) {if((f)==-1) return -1;}
+#define checkhandle(h) jpgstruct *j=(jpgstruct *)h; \
+	if(!j) _throw("Invalid handle");
+
+
+// CO
+
+static boolean empty_output_buffer(struct jpeg_compress_struct *cinfo)
+{
+	ERREXIT(cinfo, JERR_BUFFER_SIZE);
+	return TRUE;
+}
+
+static void destination_noop(struct jpeg_compress_struct *cinfo)
+{
+}
+
+DLLEXPORT tjhandle DLLCALL tjInitCompress(void)
+{
+	jpgstruct *j=NULL;
+	if((j=(jpgstruct *)malloc(sizeof(jpgstruct)))==NULL)
+		{sprintf(lasterror, "Memory allocation failure");  return NULL;}
+	memset(j, 0, sizeof(jpgstruct));
+	j->cinfo.err=jpeg_std_error(&j->jerr.pub);
+	j->jerr.pub.error_exit=my_error_exit;
+	j->jerr.pub.output_message=my_output_message;
+
+	if(setjmp(j->jerr.jb))
+	{ // this will execute if LIBJPEG has an error
+		if(j) free(j);  return NULL;
+  }
+
+	jpeg_create_compress(&j->cinfo);
+	j->cinfo.dest=&j->jdms;
+	j->jdms.init_destination=destination_noop;
+	j->jdms.empty_output_buffer=empty_output_buffer;
+	j->jdms.term_destination=destination_noop;
+
+	j->initc=1;
+	return (tjhandle)j;
+}
+
+DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height)
+{
+	// This allows enough room in case the image doesn't compress
+	return ((width+15)&(~15)) * ((height+15)&(~15)) * 6 + 2048;
+}
+
+DLLEXPORT int DLLCALL tjCompress(tjhandle h,
+	unsigned char *srcbuf, int width, int pitch, int height, int ps,
+	unsigned char *dstbuf, unsigned long *size,
+	int jpegsub, int qual, int flags)
+{
+	int i;  JSAMPROW *row_pointer=NULL;
+
+	checkhandle(h);
+
+	if(srcbuf==NULL || width<=0 || pitch<0 || height<=0
+		|| dstbuf==NULL || size==NULL
+		|| jpegsub<0 || jpegsub>=NUMSUBOPT || qual<0 || qual>100)
+		_throw("Invalid argument in tjCompress()");
+	if(ps!=3 && ps!=4) _throw("This compressor can only take 24-bit or 32-bit RGB input");
+	if(!j->initc) _throw("Instance has not been initialized for compression");
+
+	if(pitch==0) pitch=width*ps;
+
+	j->cinfo.image_width = width;
+	j->cinfo.image_height = height;
+	j->cinfo.input_components = ps;
+
+	#if JCS_EXTENSIONS==1
+	j->cinfo.in_color_space = JCS_EXT_RGB;
+	if(ps==3 && (flags&TJ_BGR))
+		j->cinfo.in_color_space = JCS_EXT_BGR;
+	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_RGBX;
+	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_BGRX;
+	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_XBGR;
+	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_XRGB;
+	#else
+	#error "TurboJPEG requires JPEG colorspace extensions"
+	#endif
+
+	if(setjmp(j->jerr.jb))
+	{  // this will execute if LIBJPEG has an error
+		if(row_pointer) free(row_pointer);
+		return -1;
+  }
+
+	jpeg_set_defaults(&j->cinfo);
+
+	jpeg_set_quality(&j->cinfo, qual, TRUE);
+	if(jpegsub==TJ_GRAYSCALE)
+		jpeg_set_colorspace(&j->cinfo, JCS_GRAYSCALE);
+	else
+		jpeg_set_colorspace(&j->cinfo, JCS_YCbCr);
+	j->cinfo.dct_method = JDCT_FASTEST;
+
+	j->cinfo.comp_info[0].h_samp_factor=hsampfactor[jpegsub];
+	j->cinfo.comp_info[1].h_samp_factor=1;
+	j->cinfo.comp_info[2].h_samp_factor=1;
+	j->cinfo.comp_info[0].v_samp_factor=vsampfactor[jpegsub];
+	j->cinfo.comp_info[1].v_samp_factor=1;
+	j->cinfo.comp_info[2].v_samp_factor=1;
+
+	j->jdms.next_output_byte = dstbuf;
+	j->jdms.free_in_buffer = TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height);
+
+	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
+		_throw("Memory allocation failed in tjInitCompress()");
+	for(i=0; i<height; i++)
+	{
+		if(flags&TJ_BOTTOMUP) row_pointer[i]= &srcbuf[(height-i-1)*pitch];
+		else row_pointer[i]= &srcbuf[i*pitch];
+	}
+	jpeg_start_compress(&j->cinfo, TRUE);
+	while(j->cinfo.next_scanline<j->cinfo.image_height)
+	{
+		jpeg_write_scanlines(&j->cinfo, &row_pointer[j->cinfo.next_scanline],
+			j->cinfo.image_height-j->cinfo.next_scanline);
+	}
+	jpeg_finish_compress(&j->cinfo);
+	*size=TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height)-(j->jdms.free_in_buffer);
+
+	if(row_pointer) free(row_pointer);
+	return 0;
+}
+
+
+// DEC
+
+static boolean fill_input_buffer (struct jpeg_decompress_struct *dinfo)
+{
+	ERREXIT(dinfo, JERR_BUFFER_SIZE);
+	return TRUE;
+}
+
+static void skip_input_data (struct jpeg_decompress_struct *dinfo, long num_bytes)
+{
+	dinfo->src->next_input_byte += (size_t) num_bytes;
+	dinfo->src->bytes_in_buffer -= (size_t) num_bytes;
+}
+
+static void source_noop (struct jpeg_decompress_struct *dinfo)
+{
+}
+
+DLLEXPORT tjhandle DLLCALL tjInitDecompress(void)
+{
+	jpgstruct *j;
+	if((j=(jpgstruct *)malloc(sizeof(jpgstruct)))==NULL)
+		{sprintf(lasterror, "Memory allocation failure");  return NULL;}
+	memset(j, 0, sizeof(jpgstruct));
+	j->dinfo.err=jpeg_std_error(&j->jerr.pub);
+	j->jerr.pub.error_exit=my_error_exit;
+	j->jerr.pub.output_message=my_output_message;
+
+	if(setjmp(j->jerr.jb))
+	{ // this will execute if LIBJPEG has an error
+		free(j);  return NULL;
+  }
+
+	jpeg_create_decompress(&j->dinfo);
+	j->dinfo.src=&j->jsms;
+	j->jsms.init_source=source_noop;
+	j->jsms.fill_input_buffer = fill_input_buffer;
+	j->jsms.skip_input_data = skip_input_data;
+	j->jsms.resync_to_restart = jpeg_resync_to_restart;
+	j->jsms.term_source = source_noop;
+
+	j->initd=1;
+	return (tjhandle)j;
+}
+
+
+DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle h,
+	unsigned char *srcbuf, unsigned long size,
+	int *width, int *height)
+{
+	checkhandle(h);
+
+	if(srcbuf==NULL || size<=0 || width==NULL || height==NULL)
+		_throw("Invalid argument in tjDecompressHeader()");
+	if(!j->initd) _throw("Instance has not been initialized for decompression");
+
+	if(setjmp(j->jerr.jb))
+	{  // this will execute if LIBJPEG has an error
+		return -1;
+	}
+
+	j->jsms.bytes_in_buffer = size;
+	j->jsms.next_input_byte = srcbuf;
+
+	jpeg_read_header(&j->dinfo, TRUE);
+
+	*width=j->dinfo.image_width;  *height=j->dinfo.image_height;
+
+	jpeg_abort_decompress(&j->dinfo);
+
+	if(*width<1 || *height<1) _throw("Invalid data returned in header");
+	return 0;
+}
+
+
+DLLEXPORT int DLLCALL tjDecompress(tjhandle h,
+	unsigned char *srcbuf, unsigned long size,
+	unsigned char *dstbuf, int width, int pitch, int height, int ps,
+	int flags)
+{
+	int i;  JSAMPROW *row_pointer=NULL;
+
+	checkhandle(h);
+
+	if(srcbuf==NULL || size<=0
+		|| dstbuf==NULL || width<=0 || pitch<0 || height<=0)
+		_throw("Invalid argument in tjDecompress()");
+	if(ps!=3 && ps!=4) _throw("This compressor can only take 24-bit or 32-bit RGB input");
+	if(!j->initd) _throw("Instance has not been initialized for decompression");
+
+	if(pitch==0) pitch=width*ps;
+
+	if(setjmp(j->jerr.jb))
+	{  // this will execute if LIBJPEG has an error
+		if(row_pointer) free(row_pointer);
+		return -1;
+  }
+
+	j->jsms.bytes_in_buffer = size;
+	j->jsms.next_input_byte = srcbuf;
+
+	jpeg_read_header(&j->dinfo, TRUE);
+
+	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
+		_throw("Memory allocation failed in tjInitDecompress()");
+	for(i=0; i<height; i++)
+	{
+		if(flags&TJ_BOTTOMUP) row_pointer[i]= &dstbuf[(height-i-1)*pitch];
+		else row_pointer[i]= &dstbuf[i*pitch];
+	}
+
+	#if JCS_EXTENSIONS==1
+	j->dinfo.out_color_space = JCS_EXT_RGB;
+	if(ps==3 && (flags&TJ_BGR))
+		j->dinfo.out_color_space = JCS_EXT_BGR;
+	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_RGBX;
+	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_BGRX;
+	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_XBGR;
+	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_XRGB;
+	#else
+	#error "TurboJPEG requires JPEG colorspace extensions"
+	#endif
+	if(flags&TJ_FASTUPSAMPLE) j->dinfo.do_fancy_upsampling=FALSE;
+
+	jpeg_start_decompress(&j->dinfo);
+	while(j->dinfo.output_scanline<j->dinfo.output_height)
+	{
+		jpeg_read_scanlines(&j->dinfo, &row_pointer[j->dinfo.output_scanline],
+			j->dinfo.output_height-j->dinfo.output_scanline);
+	}
+	jpeg_finish_decompress(&j->dinfo);
+
+	if(row_pointer) free(row_pointer);
+	return 0;
+}
+
+
+// General
+
+DLLEXPORT char* DLLCALL tjGetErrorStr(void)
+{
+	return lasterror;
+}
+
+DLLEXPORT int DLLCALL tjDestroy(tjhandle h)
+{
+	checkhandle(h);
+	if(setjmp(j->jerr.jb)) return -1;
+	if(j->initc) jpeg_destroy_compress(&j->cinfo);
+	if(j->initd) jpeg_destroy_decompress(&j->dinfo);
+	free(j);
+	return 0;
+}

diff --git a/jconfig.vc b/win/jconfig.h
similarity index 97%
rename from jconfig.vc
rename to win/jconfig.h
index 7e291c7..7987ba6 100644
--- a/jconfig.vc
+++ b/win/jconfig.h

@@ -21,6 +21,7 @@
 #endif
 #define HAVE_BOOLEAN		/* prevent jmorecfg.h from redefining it */
 
+#define inline __inline
 
 #ifdef JPEG_INTERNALS
commit	60fa0600c0b32383de6880dd9bf2218e0eb1b537	[log] [tgz]
author	DRC <dcommander@users.sourceforge.net>	Fri Feb 12 06:01:49 2010 +0000
committer	DRC <dcommander@users.sourceforge.net>	Fri Feb 12 06:01:49 2010 +0000
tree	bc592d8107cd9fab120112e3daf33a062289e56b
parent	60cddeb849b803274914dbe1706cfc801f914319 [diff]
parent	5ead57a34a398aa798f35bd7a6abad19b2e453e2 [diff]