Re-add relevant files and tests from libjpeg v6b
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@81 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..43b5f98
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,86 @@
+lib_LTLIBRARIES = libjpeg.la libturbojpeg.la
+libjpeg_la_LDFLAGS = -version-number 62:0:0 -no-undefined
+libturbojpeg_la_LDFLAGS = -avoid-version -no-undefined
+include_HEADERS = jconfig.h jerror.h jmorecfg.h jpeglib.h turbojpeg.h
+
+HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+ jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h
+
+libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
+ jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
+ jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \
+ jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
+ jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \
+ jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c \
+ jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
+ jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
+
+libturbojpeg_la_SOURCES = $(libjpeg_la_SOURCES) turbojpegl.c turbojpeg.h
+
+if WITH_SIMD
+
+SUBDIRS = simd
+libjpeg_la_LIBADD = simd/libsimd.la
+libturbojpeg_la_LIBADD = simd/libsimd.la
+
+else
+
+libjpeg_la_SOURCES += jsimd_none.c
+
+endif
+
+TSTHDRS = rrutil.h rrtimer.h
+
+noinst_PROGRAMS = jpgtest jpegut cjpeg djpeg jpegtran rdjpgcom wrjpgcom
+
+jpgtest_SOURCES = $(TSTHDRS) jpgtest.cxx bmp.h bmp.c
+# Link against the libtool libraries built in this directory (build tree), not the source tree.
+jpgtest_LDADD = libturbojpeg.la
+
+jpegut_SOURCES = $(TSTHDRS) jpegut.c bmp.h bmp.c
+
+jpegut_LDADD = libturbojpeg.la
+
+cjpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c cjpeg.c rdbmp.c rdgif.c \
+ rdppm.c rdswitch.c rdtarga.c
+
+cjpeg_LDADD = libjpeg.la
+
+cjpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
+ -DTARGA_SUPPORTED
+
+djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
+ wrbmp.c wrgif.c wrppm.c wrtarga.c
+
+djpeg_LDADD = libjpeg.la
+
+djpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
+ -DTARGA_SUPPORTED
+
+jpegtran_SOURCES = jpegtran.c rdswitch.c cdjpeg.c transupp.c
+
+jpegtran_LDADD = libjpeg.la
+# rdjpgcom and wrjpgcom are separate tools; each is built from its own source file.
+rdjpgcom_SOURCES = rdjpgcom.c
+
+rdjpgcom_LDADD = libjpeg.la
+
+wrjpgcom_SOURCES = wrjpgcom.c
+
+wrjpgcom_LDADD = libjpeg.la
+# The recipe below runs jpegut as well as the codec tools, so all four must be built first.
+test: jpegut cjpeg djpeg jpegtran
+ $(RM) testout*
+ $(top_srcdir)/jpegut
+ $(top_srcdir)/djpeg -dct int -ppm -outfile testout.ppm $(top_srcdir)/testorig.jpg
+ $(top_srcdir)/djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(top_srcdir)/testorig.jpg
+ $(top_srcdir)/cjpeg -dct int -outfile testout.jpg $(top_srcdir)/testimg.ppm
+ $(top_srcdir)/djpeg -dct int -ppm -outfile testoutp.ppm $(top_srcdir)/testprog.jpg
+ $(top_srcdir)/cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(top_srcdir)/testimg.ppm
+ $(top_srcdir)/jpegtran -outfile testoutt.jpg $(top_srcdir)/testprog.jpg
+ cmp $(top_srcdir)/testimg.ppm testout.ppm
+ cmp $(top_srcdir)/testimg.bmp testout.bmp
+ cmp $(top_srcdir)/testimg.jpg testout.jpg
+ cmp $(top_srcdir)/testimg.ppm testoutp.ppm
+ cmp $(top_srcdir)/testimgp.jpg testoutp.jpg
+ cmp $(top_srcdir)/testorig.jpg testoutt.jpg
diff --git a/acinclude.m4 b/acinclude.m4
new file mode 100644
index 0000000..e8d7d08
--- /dev/null
+++ b/acinclude.m4
@@ -0,0 +1,121 @@
+# AC_PROG_NASM
+# --------------------------
+# Locate NASM (nasm or nasmw), set NAFLAGS for the host object format, and verify that its output assembles and links.
+AC_DEFUN([AC_PROG_NASM],[
+
+AC_CHECK_PROGS(NASM, [nasm nasmw])
+test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
+
+AC_MSG_CHECKING([for object file format of host system])
+case "$host_os" in
+ cygwin* | mingw* | pw32* | interix*)
+ objfmt='Win32-COFF'
+ ;;
+ msdosdjgpp* | go32*)
+ objfmt='COFF'
+ ;;
+ os2-emx*) # not tested
+ objfmt='MSOMF' # obj
+ ;;
+ linux*coff* | linux*oldld*)
+ objfmt='COFF' # ???
+ ;;
+ linux*aout*)
+ objfmt='a.out'
+ ;;
+ linux*)
+ case "$host_cpu" in
+ x86_64)
+ objfmt='ELF64'
+ ;;
+ *)
+ objfmt='ELF'
+ ;;
+ esac
+ ;;
+ freebsd* | netbsd* | openbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ objfmt='BSD-a.out'
+ else
+ objfmt='ELF'
+ fi
+ ;;
+ solaris* | sunos* | sysv* | sco*)
+ case "$host_cpu" in
+ x86_64)
+ objfmt='ELF64'
+ ;;
+ *)
+ objfmt='ELF'
+ ;;
+ esac
+ ;;
+ darwin* | rhapsody* | nextstep* | openstep* | macos*)
+ case "$host_cpu" in
+ x86_64)
+ objfmt='Mach-O64'
+ ;;
+ *)
+ objfmt='Mach-O'
+ ;;
+ esac
+ ;;
+ *)
+ objfmt='ELF ?'
+ ;;
+esac
+
+AC_MSG_RESULT([$objfmt])
+if test "$objfmt" = 'ELF ?'; then
+ objfmt='ELF'
+ AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
+fi
+
+AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
+case "$objfmt" in
+ MSOMF) NAFLAGS='-fobj -DOBJ32';;
+ Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+ COFF) NAFLAGS='-fcoff -DCOFF';;
+ a.out) NAFLAGS='-faout -DAOUT';;
+ BSD-a.out) NAFLAGS='-faoutb -DAOUT';;
+ ELF) NAFLAGS='-felf -DELF';;
+ ELF64) NAFLAGS='-felf64 -DELF -D__x86_64__';;
+ RDF) NAFLAGS='-frdf -DRDF';;
+ Mach-O) NAFLAGS='-fmacho -DMACHO';;
+ Mach-O64) NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';;
+esac
+AC_MSG_RESULT([$NAFLAGS])
+AC_SUBST([NAFLAGS])
+
+AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+ section .text
+ global _main,main
+_main:
+main: xor eax,eax
+ ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+ AC_MSG_RESULT(yes)
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.asm >&AC_FD_CC
+ rm -rf conftest*
+ AC_MSG_RESULT(no)
+ AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
+fi
+
+AC_MSG_CHECKING([whether the linker accepts assembler output])
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
+if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ AC_MSG_RESULT(yes)
+else
+ rm -rf conftest*
+ AC_MSG_RESULT(no)
+ AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
+fi
+
+])
diff --git a/ansi2knr.1 b/ansi2knr.1
deleted file mode 100644
index f9ee5a6..0000000
--- a/ansi2knr.1
+++ /dev/null
@@ -1,36 +0,0 @@
-.TH ANSI2KNR 1 "19 Jan 1996"
-.SH NAME
-ansi2knr \- convert ANSI C to Kernighan & Ritchie C
-.SH SYNOPSIS
-.I ansi2knr
-[--varargs] input_file [output_file]
-.SH DESCRIPTION
-If no output_file is supplied, output goes to stdout.
-.br
-There are no error messages.
-.sp
-.I ansi2knr
-recognizes function definitions by seeing a non-keyword identifier at the left
-margin, followed by a left parenthesis, with a right parenthesis as the last
-character on the line, and with a left brace as the first token on the
-following line (ignoring possible intervening comments). It will recognize a
-multi-line header provided that no intervening line ends with a left or right
-brace or a semicolon. These algorithms ignore whitespace and comments, except
-that the function name must be the first thing on the line.
-.sp
-The following constructs will confuse it:
-.br
- - Any other construct that starts at the left margin and follows the
-above syntax (such as a macro or function call).
-.br
- - Some macros that tinker with the syntax of the function header.
-.sp
-The --varargs switch is obsolete, and is recognized only for
-backwards compatibility. The present version of
-.I ansi2knr
-will always attempt to convert a ... argument to va_alist and va_dcl.
-.SH AUTHOR
-L. Peter Deutsch <ghost@aladdin.com> wrote the original ansi2knr and
-continues to maintain the current version; most of the code in the current
-version is his work. ansi2knr also includes contributions by Francois
-Pinard <pinard@iro.umontreal.ca> and Jim Avera <jima@netcom.com>.
diff --git a/ansi2knr.c b/ansi2knr.c
deleted file mode 100644
index 4e05fc2..0000000
--- a/ansi2knr.c
+++ /dev/null
@@ -1,693 +0,0 @@
-/* ansi2knr.c */
-/* Convert ANSI C function definitions to K&R ("traditional C") syntax */
-
-/*
-ansi2knr is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY. No author or distributor accepts responsibility to anyone for the
-consequences of using it or for whether it serves any particular purpose or
-works at all, unless he says so in writing. Refer to the GNU General Public
-License (the "GPL") for full details.
-
-Everyone is granted permission to copy, modify and redistribute ansi2knr,
-but only under the conditions described in the GPL. A copy of this license
-is supposed to have been given to you along with ansi2knr so you can know
-your rights and responsibilities. It should be in a file named COPYLEFT.
-[In the IJG distribution, the GPL appears below, not in a separate file.]
-Among other things, the copyright notice and this notice must be preserved
-on all copies.
-
-We explicitly state here what we believe is already implied by the GPL: if
-the ansi2knr program is distributed as a separate set of sources and a
-separate executable file which are aggregated on a storage medium together
-with another program, this in itself does not bring the other program under
-the GPL, nor does the mere fact that such a program or the procedures for
-constructing it invoke the ansi2knr executable bring any other part of the
-program under the GPL.
-*/
-
-/*
----------- Here is the GNU GPL file COPYLEFT, referred to above ----------
------ These terms do NOT apply to the JPEG software itself; see README ------
-
- GHOSTSCRIPT GENERAL PUBLIC LICENSE
- (Clarified 11 Feb 1988)
-
- Copyright (C) 1988 Richard M. Stallman
- Everyone is permitted to copy and distribute verbatim copies of this
- license, but changing it is not allowed. You can also use this wording
- to make the terms for other programs.
-
- The license agreements of most software companies keep you at the
-mercy of those companies. By contrast, our general public license is
-intended to give everyone the right to share Ghostscript. To make sure
-that you get the rights we want you to have, we need to make
-restrictions that forbid anyone to deny you these rights or to ask you
-to surrender the rights. Hence this license agreement.
-
- Specifically, we want to make sure that you have the right to give
-away copies of Ghostscript, that you receive source code or else can get
-it if you want it, that you can change Ghostscript or use pieces of it
-in new free programs, and that you know you can do these things.
-
- To make sure that everyone has such rights, we have to forbid you to
-deprive anyone else of these rights. For example, if you distribute
-copies of Ghostscript, you must give the recipients all the rights that
-you have. You must make sure that they, too, receive or can get the
-source code. And you must tell them their rights.
-
- Also, for our own protection, we must make certain that everyone finds
-out that there is no warranty for Ghostscript. If Ghostscript is
-modified by someone else and passed on, we want its recipients to know
-that what they have is not what we distributed, so that any problems
-introduced by others will not reflect on our reputation.
-
- Therefore we (Richard M. Stallman and the Free Software Foundation,
-Inc.) make the following terms which say what you must do to be allowed
-to distribute or change Ghostscript.
-
-
- COPYING POLICIES
-
- 1. You may copy and distribute verbatim copies of Ghostscript source
-code as you receive it, in any medium, provided that you conspicuously
-and appropriately publish on each copy a valid copyright and license
-notice "Copyright (C) 1989 Aladdin Enterprises. All rights reserved.
-Distributed by Free Software Foundation, Inc." (or with whatever year is
-appropriate); keep intact the notices on all files that refer to this
-License Agreement and to the absence of any warranty; and give any other
-recipients of the Ghostscript program a copy of this License Agreement
-along with the program. You may charge a distribution fee for the
-physical act of transferring a copy.
-
- 2. You may modify your copy or copies of Ghostscript or any portion of
-it, and copy and distribute such modifications under the terms of
-Paragraph 1 above, provided that you also do the following:
-
- a) cause the modified files to carry prominent notices stating
- that you changed the files and the date of any change; and
-
- b) cause the whole of any work that you distribute or publish,
- that in whole or in part contains or is a derivative of Ghostscript
- or any part thereof, to be licensed at no charge to all third
- parties on terms identical to those contained in this License
- Agreement (except that you may choose to grant more extensive
- warranty protection to some or all third parties, at your option).
-
- c) You may charge a distribution fee for the physical act of
- transferring a copy, and you may at your option offer warranty
- protection in exchange for a fee.
-
-Mere aggregation of another unrelated program with this program (or its
-derivative) on a volume of a storage or distribution medium does not bring
-the other program under the scope of these terms.
-
- 3. You may copy and distribute Ghostscript (or a portion or derivative
-of it, under Paragraph 2) in object code or executable form under the
-terms of Paragraphs 1 and 2 above provided that you also do one of the
-following:
-
- a) accompany it with the complete corresponding machine-readable
- source code, which must be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- b) accompany it with a written offer, valid for at least three
- years, to give any third party free (except for a nominal
- shipping charge) a complete machine-readable copy of the
- corresponding source code, to be distributed under the terms of
- Paragraphs 1 and 2 above; or,
-
- c) accompany it with the information you received as to where the
- corresponding source code may be obtained. (This alternative is
- allowed only for noncommercial distribution and only if you
- received the program in object code or executable form alone.)
-
-For an executable file, complete source code means all the source code for
-all modules it contains; but, as a special exception, it need not include
-source code for modules which are standard libraries that accompany the
-operating system on which the executable file runs.
-
- 4. You may not copy, sublicense, distribute or transfer Ghostscript
-except as expressly provided under this License Agreement. Any attempt
-otherwise to copy, sublicense, distribute or transfer Ghostscript is
-void and your rights to use the program under this License agreement
-shall be automatically terminated. However, parties who have received
-computer software programs from you with this License Agreement will not
-have their licenses terminated so long as such parties remain in full
-compliance.
-
- 5. If you wish to incorporate parts of Ghostscript into other free
-programs whose distribution conditions are different, write to the Free
-Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not
-yet worked out a simple rule that can be stated here, but we will often
-permit this. We will be guided by the two goals of preserving the free
-status of all derivatives of our free software and of promoting the
-sharing and reuse of software.
-
-Your comments and suggestions about our licensing policies and our
-software are welcome! Please contact the Free Software Foundation,
-Inc., 675 Mass Ave, Cambridge, MA 02139, or call (617) 876-3296.
-
- NO WARRANTY
-
- BECAUSE GHOSTSCRIPT IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
-NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
-WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC, RICHARD
-M. STALLMAN, ALADDIN ENTERPRISES, L. PETER DEUTSCH, AND/OR OTHER PARTIES
-PROVIDE GHOSTSCRIPT "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
-EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
-ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF GHOSTSCRIPT IS WITH
-YOU. SHOULD GHOSTSCRIPT PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
-NECESSARY SERVICING, REPAIR OR CORRECTION.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
-STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., L. PETER DEUTSCH, ALADDIN
-ENTERPRISES, AND/OR ANY OTHER PARTY WHO MAY MODIFY AND REDISTRIBUTE
-GHOSTSCRIPT AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING
-ANY LOST PROFITS, LOST MONIES, OR OTHER SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
-(INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
-INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR A FAILURE OF THE
-PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) GHOSTSCRIPT, EVEN IF YOU
-HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES, OR FOR ANY CLAIM
-BY ANY OTHER PARTY.
-
--------------------- End of file COPYLEFT ------------------------------
-*/
-
-/*
- * Usage:
- ansi2knr input_file [output_file]
- * If no output_file is supplied, output goes to stdout.
- * There are no error messages.
- *
- * ansi2knr recognizes function definitions by seeing a non-keyword
- * identifier at the left margin, followed by a left parenthesis,
- * with a right parenthesis as the last character on the line,
- * and with a left brace as the first token on the following line
- * (ignoring possible intervening comments).
- * It will recognize a multi-line header provided that no intervening
- * line ends with a left or right brace or a semicolon.
- * These algorithms ignore whitespace and comments, except that
- * the function name must be the first thing on the line.
- * The following constructs will confuse it:
- * - Any other construct that starts at the left margin and
- * follows the above syntax (such as a macro or function call).
- * - Some macros that tinker with the syntax of the function header.
- */
-
-/*
- * The original and principal author of ansi2knr is L. Peter Deutsch
- * <ghost@aladdin.com>. Other authors are noted in the change history
- * that follows (in reverse chronological order):
- lpd 96-01-21 added code to cope with not HAVE_CONFIG_H and with
- compilers that don't understand void, as suggested by
- Tom Lane
- lpd 96-01-15 changed to require that the first non-comment token
- on the line following a function header be a left brace,
- to reduce sensitivity to macros, as suggested by Tom Lane
- <tgl@sss.pgh.pa.us>
- lpd 95-06-22 removed #ifndefs whose sole purpose was to define
- undefined preprocessor symbols as 0; changed all #ifdefs
- for configuration symbols to #ifs
- lpd 95-04-05 changed copyright notice to make it clear that
- including ansi2knr in a program does not bring the entire
- program under the GPL
- lpd 94-12-18 added conditionals for systems where ctype macros
- don't handle 8-bit characters properly, suggested by
- Francois Pinard <pinard@iro.umontreal.ca>;
- removed --varargs switch (this is now the default)
- lpd 94-10-10 removed CONFIG_BROKETS conditional
- lpd 94-07-16 added some conditionals to help GNU `configure',
- suggested by Francois Pinard <pinard@iro.umontreal.ca>;
- properly erase prototype args in function parameters,
- contributed by Jim Avera <jima@netcom.com>;
- correct error in writeblanks (it shouldn't erase EOLs)
- lpd 89-xx-xx original version
- */
-
-/* Most of the conditionals here are to make ansi2knr work with */
-/* or without the GNU configure machinery. */
-
-#if HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#include <stdio.h>
-#include <ctype.h>
-
-#if HAVE_CONFIG_H
-
-/*
- For properly autoconfiguring ansi2knr, use AC_CONFIG_HEADER(config.h).
- This will define HAVE_CONFIG_H and so, activate the following lines.
- */
-
-# if STDC_HEADERS || HAVE_STRING_H
-# include <string.h>
-# else
-# include <strings.h>
-# endif
-
-#else /* not HAVE_CONFIG_H */
-
-/* Otherwise do it the hard way */
-
-# ifdef BSD
-# include <strings.h>
-# else
-# ifdef VMS
- extern int strlen(), strncmp();
-# else
-# include <string.h>
-# endif
-# endif
-
-#endif /* not HAVE_CONFIG_H */
-
-#if STDC_HEADERS
-# include <stdlib.h>
-#else
-/*
- malloc and free should be declared in stdlib.h,
- but if you've got a K&R compiler, they probably aren't.
- */
-# ifdef MSDOS
-# include <malloc.h>
-# else
-# ifdef VMS
- extern char *malloc();
- extern void free();
-# else
- extern char *malloc();
- extern int free();
-# endif
-# endif
-
-#endif
-
-/*
- * The ctype macros don't always handle 8-bit characters correctly.
- * Compensate for this here.
- */
-#ifdef isascii
-# undef HAVE_ISASCII /* just in case */
-# define HAVE_ISASCII 1
-#else
-#endif
-#if STDC_HEADERS || !HAVE_ISASCII
-# define is_ascii(c) 1
-#else
-# define is_ascii(c) isascii(c)
-#endif
-
-#define is_space(c) (is_ascii(c) && isspace(c))
-#define is_alpha(c) (is_ascii(c) && isalpha(c))
-#define is_alnum(c) (is_ascii(c) && isalnum(c))
-
-/* Scanning macros */
-#define isidchar(ch) (is_alnum(ch) || (ch) == '_')
-#define isidfirstchar(ch) (is_alpha(ch) || (ch) == '_')
-
-/* Forward references */
-char *skipspace();
-int writeblanks();
-int test1();
-int convert1();
-
-/* The main program */
-int
-main(argc, argv)
- int argc;
- char *argv[];
-{ FILE *in, *out;
-#define bufsize 5000 /* arbitrary size */
- char *buf;
- char *line;
- char *more;
- /*
- * In previous versions, ansi2knr recognized a --varargs switch.
- * If this switch was supplied, ansi2knr would attempt to convert
- * a ... argument to va_alist and va_dcl; if this switch was not
- * supplied, ansi2knr would simply drop any such arguments.
- * Now, ansi2knr always does this conversion, and we only
- * check for this switch for backward compatibility.
- */
- int convert_varargs = 1;
-
- if ( argc > 1 && argv[1][0] == '-' )
- { if ( !strcmp(argv[1], "--varargs") )
- { convert_varargs = 1;
- argc--;
- argv++;
- }
- else
- { fprintf(stderr, "Unrecognized switch: %s\n", argv[1]);
- exit(1);
- }
- }
- switch ( argc )
- {
- default:
- printf("Usage: ansi2knr input_file [output_file]\n");
- exit(0);
- case 2:
- out = stdout;
- break;
- case 3:
- out = fopen(argv[2], "w");
- if ( out == NULL )
- { fprintf(stderr, "Cannot open output file %s\n", argv[2]);
- exit(1);
- }
- }
- in = fopen(argv[1], "r");
- if ( in == NULL )
- { fprintf(stderr, "Cannot open input file %s\n", argv[1]);
- exit(1);
- }
- fprintf(out, "#line 1 \"%s\"\n", argv[1]);
- buf = malloc(bufsize);
- line = buf;
- while ( fgets(line, (unsigned)(buf + bufsize - line), in) != NULL )
- {
-test: line += strlen(line);
- switch ( test1(buf) )
- {
- case 2: /* a function header */
- convert1(buf, out, 1, convert_varargs);
- break;
- case 1: /* a function */
- /* Check for a { at the start of the next line. */
- more = ++line;
-f: if ( line >= buf + (bufsize - 1) ) /* overflow check */
- goto wl;
- if ( fgets(line, (unsigned)(buf + bufsize - line), in) == NULL )
- goto wl;
- switch ( *skipspace(more, 1) )
- {
- case '{':
- /* Definitely a function header. */
- convert1(buf, out, 0, convert_varargs);
- fputs(more, out);
- break;
- case 0:
- /* The next line was blank or a comment: */
- /* keep scanning for a non-comment. */
- line += strlen(line);
- goto f;
- default:
- /* buf isn't a function header, but */
- /* more might be. */
- fputs(buf, out);
- strcpy(buf, more);
- line = buf;
- goto test;
- }
- break;
- case -1: /* maybe the start of a function */
- if ( line != buf + (bufsize - 1) ) /* overflow check */
- continue;
- /* falls through */
- default: /* not a function */
-wl: fputs(buf, out);
- break;
- }
- line = buf;
- }
- if ( line != buf )
- fputs(buf, out);
- free(buf);
- fclose(out);
- fclose(in);
- return 0;
-}
-
-/* Skip over space and comments, in either direction. */
-char *
-skipspace(p, dir)
- register char *p;
- register int dir; /* 1 for forward, -1 for backward */
-{ for ( ; ; )
- { while ( is_space(*p) )
- p += dir;
- if ( !(*p == '/' && p[dir] == '*') )
- break;
- p += dir; p += dir;
- while ( !(*p == '*' && p[dir] == '/') )
- { if ( *p == 0 )
- return p; /* multi-line comment?? */
- p += dir;
- }
- p += dir; p += dir;
- }
- return p;
-}
-
-/*
- * Write blanks over part of a string.
- * Don't overwrite end-of-line characters.
- */
-int
-writeblanks(start, end)
- char *start;
- char *end;
-{ char *p;
- for ( p = start; p < end; p++ )
- if ( *p != '\r' && *p != '\n' )
- *p = ' ';
- return 0;
-}
-
-/*
- * Test whether the string in buf is a function definition.
- * The string may contain and/or end with a newline.
- * Return as follows:
- * 0 - definitely not a function definition;
- * 1 - definitely a function definition;
- * 2 - definitely a function prototype (NOT USED);
- * -1 - may be the beginning of a function definition,
- * append another line and look again.
- * The reason we don't attempt to convert function prototypes is that
- * Ghostscript's declaration-generating macros look too much like
- * prototypes, and confuse the algorithms.
- */
-int
-test1(buf)
- char *buf;
-{ register char *p = buf;
- char *bend;
- char *endfn;
- int contin;
-
- if ( !isidfirstchar(*p) )
- return 0; /* no name at left margin */
- bend = skipspace(buf + strlen(buf) - 1, -1);
- switch ( *bend )
- {
- case ';': contin = 0 /*2*/; break;
- case ')': contin = 1; break;
- case '{': return 0; /* not a function */
- case '}': return 0; /* not a function */
- default: contin = -1;
- }
- while ( isidchar(*p) )
- p++;
- endfn = p;
- p = skipspace(p, 1);
- if ( *p++ != '(' )
- return 0; /* not a function */
- p = skipspace(p, 1);
- if ( *p == ')' )
- return 0; /* no parameters */
- /* Check that the apparent function name isn't a keyword. */
- /* We only need to check for keywords that could be followed */
- /* by a left parenthesis (which, unfortunately, is most of them). */
- { static char *words[] =
- { "asm", "auto", "case", "char", "const", "double",
- "extern", "float", "for", "if", "int", "long",
- "register", "return", "short", "signed", "sizeof",
- "static", "switch", "typedef", "unsigned",
- "void", "volatile", "while", 0
- };
- char **key = words;
- char *kp;
- int len = endfn - buf;
-
- while ( (kp = *key) != 0 )
- { if ( strlen(kp) == len && !strncmp(kp, buf, len) )
- return 0; /* name is a keyword */
- key++;
- }
- }
- return contin;
-}
-
-/* Convert a recognized function definition or header to K&R syntax. */
-int
-convert1(buf, out, header, convert_varargs)
- char *buf;
- FILE *out;
- int header; /* Boolean */
- int convert_varargs; /* Boolean */
-{ char *endfn;
- register char *p;
- char **breaks;
- unsigned num_breaks = 2; /* for testing */
- char **btop;
- char **bp;
- char **ap;
- char *vararg = 0;
-
- /* Pre-ANSI implementations don't agree on whether strchr */
- /* is called strchr or index, so we open-code it here. */
- for ( endfn = buf; *(endfn++) != '('; )
- ;
-top: p = endfn;
- breaks = (char **)malloc(sizeof(char *) * num_breaks * 2);
- if ( breaks == 0 )
- { /* Couldn't allocate break table, give up */
- fprintf(stderr, "Unable to allocate break table!\n");
- fputs(buf, out);
- return -1;
- }
- btop = breaks + num_breaks * 2 - 2;
- bp = breaks;
- /* Parse the argument list */
- do
- { int level = 0;
- char *lp = NULL;
- char *rp;
- char *end = NULL;
-
- if ( bp >= btop )
- { /* Filled up break table. */
- /* Allocate a bigger one and start over. */
- free((char *)breaks);
- num_breaks <<= 1;
- goto top;
- }
- *bp++ = p;
- /* Find the end of the argument */
- for ( ; end == NULL; p++ )
- { switch(*p)
- {
- case ',':
- if ( !level ) end = p;
- break;
- case '(':
- if ( !level ) lp = p;
- level++;
- break;
- case ')':
- if ( --level < 0 ) end = p;
- else rp = p;
- break;
- case '/':
- p = skipspace(p, 1) - 1;
- break;
- default:
- ;
- }
- }
- /* Erase any embedded prototype parameters. */
- if ( lp )
- writeblanks(lp + 1, rp);
- p--; /* back up over terminator */
- /* Find the name being declared. */
- /* This is complicated because of procedure and */
- /* array modifiers. */
- for ( ; ; )
- { p = skipspace(p - 1, -1);
- switch ( *p )
- {
- case ']': /* skip array dimension(s) */
- case ')': /* skip procedure args OR name */
- { int level = 1;
- while ( level )
- switch ( *--p )
- {
- case ']': case ')': level++; break;
- case '[': case '(': level--; break;
- case '/': p = skipspace(p, -1) + 1; break;
- default: ;
- }
- }
- if ( *p == '(' && *skipspace(p + 1, 1) == '*' )
- { /* We found the name being declared */
- while ( !isidfirstchar(*p) )
- p = skipspace(p, 1) + 1;
- goto found;
- }
- break;
- default:
- goto found;
- }
- }
-found: if ( *p == '.' && p[-1] == '.' && p[-2] == '.' )
- { if ( convert_varargs )
- { *bp++ = "va_alist";
- vararg = p-2;
- }
- else
- { p++;
- if ( bp == breaks + 1 ) /* sole argument */
- writeblanks(breaks[0], p);
- else
- writeblanks(bp[-1] - 1, p);
- bp--;
- }
- }
- else
- { while ( isidchar(*p) ) p--;
- *bp++ = p+1;
- }
- p = end;
- }
- while ( *p++ == ',' );
- *bp = p;
- /* Make a special check for 'void' arglist */
- if ( bp == breaks+2 )
- { p = skipspace(breaks[0], 1);
- if ( !strncmp(p, "void", 4) )
- { p = skipspace(p+4, 1);
- if ( p == breaks[2] - 1 )
- { bp = breaks; /* yup, pretend arglist is empty */
- writeblanks(breaks[0], p + 1);
- }
- }
- }
- /* Put out the function name and left parenthesis. */
- p = buf;
- while ( p != endfn ) putc(*p, out), p++;
- /* Put out the declaration. */
- if ( header )
- { fputs(");", out);
- for ( p = breaks[0]; *p; p++ )
- if ( *p == '\r' || *p == '\n' )
- putc(*p, out);
- }
- else
- { for ( ap = breaks+1; ap < bp; ap += 2 )
- { p = *ap;
- while ( isidchar(*p) )
- putc(*p, out), p++;
- if ( ap < bp - 1 )
- fputs(", ", out);
- }
- fputs(") ", out);
- /* Put out the argument declarations */
- for ( ap = breaks+2; ap <= bp; ap += 2 )
- (*ap)[-1] = ';';
- if ( vararg != 0 )
- { *vararg = 0;
- fputs(breaks[0], out); /* any prior args */
- fputs("va_dcl", out); /* the final arg */
- fputs(bp[0], out);
- }
- else
- fputs(breaks[0], out);
- }
- free((char *)breaks);
- return 0;
-}
diff --git a/bmp.c b/bmp.c
new file mode 100644
index 0000000..3ccc877
--- /dev/null
+++ b/bmp.c
@@ -0,0 +1,370 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+*/
+
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifdef _WIN32
+ #include <io.h>
+#else
+ #include <unistd.h>
+#endif
+#include "./rrutil.h"
+#include "./bmp.h"
+
+#ifndef BI_BITFIELDS
+#define BI_BITFIELDS 3L
+#endif
+#ifndef BI_RGB
+#define BI_RGB 0L
+#endif
+
+#define BMPHDRSIZE 54 /* on-disk size of the fields below, assuming 2-byte short and 4-byte int (14 + 40) */
+typedef struct _bmphdr /* loadbmp()/savebmp() transfer these fields one at a time, in this order */
+{
+ unsigned short bfType; /* file magic; loadbmp() requires 0x4d42 for a BMP */
+ unsigned int bfSize; /* total file size: pixel data offset + pixel data bytes */
+ unsigned short bfReserved1, bfReserved2; /* written as 0 by savebmp() */
+ unsigned int bfOffBits; /* file offset at which the pixel data starts */
+
+ unsigned int biSize; /* savebmp() writes 40 here */
+ int biWidth, biHeight; /* loadbmp() treats a negative height as top-down row order */
+ unsigned short biPlanes, biBitCount; /* loadbmp() accepts only 24 or 32 bits per pixel */
+ unsigned int biCompression, biSizeImage; /* loadbmp() accepts only BI_RGB compression */
+ int biXPelsPerMeter, biYPelsPerMeter; /* written as 0 by savebmp() */
+ unsigned int biClrUsed, biClrImportant; /* written as 0 by savebmp() */
+} bmphdr;
+
+static const char *__bmperr="No error"; /* message stored by _throw(); returned by bmpgeterr() */
+
+static const int ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4}; /* bytes per pixel, indexed by enum BMPPIXELFORMAT */
+static const int roffset[BMPPIXELFORMATS]={0, 0, 2, 2, 3, 1}; /* byte offset of the red component within a pixel */
+static const int goffset[BMPPIXELFORMATS]={1, 1, 1, 1, 2, 2}; /* byte offset of the green component within a pixel */
+static const int boffset[BMPPIXELFORMATS]={2, 2, 0, 0, 1, 3}; /* byte offset of the blue component within a pixel */
+
+#define _throw(m) {__bmperr=m; retcode=-1; goto finally;} /* record message m, set retcode to -1, jump to the caller's finally: label */
+#define _unix(f) {if((f)==-1) _throw(strerror(errno));} /* fail with the errno text when f evaluates to -1 */
+#define _catch(f) {if((f)==-1) {retcode=-1; goto finally;}} /* propagate a -1 result without replacing the stored message */
+
+#define readme(fd, addr, size) /* read exactly size bytes or fail; expands to two statements and needs a local bytesread */ \
+ if((bytesread=read(fd, addr, (size)))==-1) _throw(strerror(errno)); \
+ if(bytesread!=(size)) _throw("Read error");
+/* Copy the R, G and B bytes of a w x h image from srcbuf to dstbuf, converting the pixel format and optionally flipping rows. */
+void pixelconvert(unsigned char *srcbuf, enum BMPPIXELFORMAT srcformat,
+ int srcpitch, unsigned char *dstbuf, enum BMPPIXELFORMAT dstformat, int dstpitch,
+ int w, int h, int flip)
+{
+ unsigned char *srcptr, *srcptr0, *dstptr, *dstptr0;
+ int i, j;
+
+ srcptr=flip? &srcbuf[srcpitch*(h-1)]:srcbuf; /* when flipping, start at the last source row */
+ for(j=0, dstptr=dstbuf; j<h; j++,
+ srcptr+=flip? -srcpitch:srcpitch, dstptr+=dstpitch) /* source walks backward when flipping; destination always walks forward */
+ {
+ for(i=0, srcptr0=srcptr, dstptr0=dstptr; i<w; i++,
+ srcptr0+=ps[srcformat], dstptr0+=ps[dstformat]) /* each side advances by its own bytes-per-pixel */
+ {
+ dstptr0[roffset[dstformat]]=srcptr0[roffset[srcformat]];
+ dstptr0[goffset[dstformat]]=srcptr0[goffset[srcformat]];
+ dstptr0[boffset[dstformat]]=srcptr0[boffset[srcformat]]; /* only three bytes are written; a 4th destination byte is left untouched */
+ }
+ }
+}
+/* Parse the rest of a PPM stream (loadbmp() has already read the 2-byte magic from *fd) into a malloc'ed *buf. */
+int loadppm(int *fd, unsigned char **buf, int *w, int *h,
+	enum BMPPIXELFORMAT f, int align, int dstbottomup, int ascii)
+{
+	FILE *fs=NULL;  int retcode=0, scalefactor, dstpitch;
+	unsigned char *tempbuf=NULL;  char temps[255], temps2[255];
+	int numread=0, totalread=0, pixel[3], i, j;
+
+	if((fs=fdopen(*fd, "rb"))==NULL) _throw(strerror(errno));  /* "rb": pixel bytes must not pass through text-mode translation */
+
+	do  /* gather width, height and maxval, which may be spread over several lines */
+	{
+		if(!fgets(temps, 255, fs)) _throw("Read error");
+		if(strlen(temps)==0 || temps[0]=='\n') continue;
+		if(sscanf(temps, "%s", temps2)==1 && temps2[0]=='#') continue;  /* comment line: first token starts with '#' */
+		switch(totalread)  /* totalread = number of header values obtained so far */
+		{
+			case 0:
+				if((numread=sscanf(temps, "%d %d %d", w, h, &scalefactor))==EOF)
+					_throw("Read error");
+				break;
+			case 1:
+				if((numread=sscanf(temps, "%d %d", h, &scalefactor))==EOF)
+					_throw("Read error");
+				break;
+			case 2:
+				if((numread=sscanf(temps, "%d", &scalefactor))==EOF)
+					_throw("Read error");
+				break;
+		}
+		totalread+=numread;
+	} while(totalread<3);
+	if((*w)<1 || (*h)<1 || scalefactor<1) _throw("Corrupt PPM header");
+
+	dstpitch=(((*w)*ps[f])+(align-1))&(~(align-1));  /* row size rounded up to a multiple of align (a power of 2) */
+	if((*buf=(unsigned char *)malloc(dstpitch*(*h)))==NULL)
+		_throw("Memory allocation error");
+	if(ascii)
+	{
+		for(j=0; j<*h; j++)
+			for(i=0; i<*w; i++)
+			{
+				/* File rows are top-down; mirror the row index when a bottom-up buffer was requested. */
+				unsigned char *px=&(*buf)[(dstbottomup? *h-1-j:j)*dstpitch+i*ps[f]];
+				if(fscanf(fs, "%d%d%d", &pixel[0], &pixel[1], &pixel[2])!=3)
+					_throw("Read error");
+				px[roffset[f]]=(unsigned char)(pixel[0]*255/scalefactor);  /* rescale from 0..maxval to 0..255 */
+				px[goffset[f]]=(unsigned char)(pixel[1]*255/scalefactor);
+				px[boffset[f]]=(unsigned char)(pixel[2]*255/scalefactor);
+			}
+	}
+	else
+	{
+		if(scalefactor!=255)
+			_throw("Binary PPMs must have 8-bit components");
+		if((tempbuf=(unsigned char *)malloc((*w)*(*h)*3))==NULL)
+			_throw("Memory allocation error");
+		if(fread(tempbuf, (*w)*(*h)*3, 1, fs)!=1) _throw("Read error");
+		pixelconvert(tempbuf, BMP_RGB, (*w)*3, *buf, f, dstpitch, *w, *h, dstbottomup);  /* flip when the caller wants bottom-up rows */
+	}
+
+	finally:
+	if(fs) {fclose(fs);  *fd=-1;}  /* fclose() also closes the descriptor; -1 tells the caller not to close it again */
+	if(tempbuf) free(tempbuf);
+	return retcode;
+}
+
+/* Load a 24- or 32-bit uncompressed BMP (or a P6/P3 PPM, detected by its 2-byte magic) into a malloc'ed *buf; returns 0 or -1. */
+int loadbmp(char *filename, unsigned char **buf, int *w, int *h,
+	enum BMPPIXELFORMAT f, int align, int dstbottomup)
+{
+	int fd=-1, bytesread, srcpitch, srcbottomup=1, srcps, dstpitch,
+		retcode=0;
+	unsigned char *tempbuf=NULL;
+	bmphdr bh;  int flags=O_RDONLY;
+
+	dstbottomup=dstbottomup? 1:0;  /* normalize so it can be compared with srcbottomup below */
+	#ifdef _WIN32
+	flags|=O_BINARY;
+	#endif
+	if(!filename || !buf || !w || !h || f<0 || f>BMPPIXELFORMATS-1 || align<1)
+		_throw("invalid argument to loadbmp()");
+	if((align&(align-1))!=0)
+		_throw("Alignment must be a power of 2");
+	_unix(fd=open(filename, flags));
+
+	readme(fd, &bh.bfType, sizeof(unsigned short));
+	if(!littleendian()) bh.bfType=byteswap16(bh.bfType);
+
+	if(bh.bfType==0x3650)  /* bytes 'P','6': binary PPM */
+	{
+		_catch(loadppm(&fd, buf, w, h, f, align, dstbottomup, 0));
+		goto finally;  /* loadppm() sets fd to -1 once it has closed the stream */
+	}
+	if(bh.bfType==0x3350)  /* bytes 'P','3': ASCII PPM */
+	{
+		_catch(loadppm(&fd, buf, w, h, f, align, dstbottomup, 1));
+		goto finally;
+	}
+
+	readme(fd, &bh.bfSize, sizeof(unsigned int));
+	readme(fd, &bh.bfReserved1, sizeof(unsigned short));
+	readme(fd, &bh.bfReserved2, sizeof(unsigned short));
+	readme(fd, &bh.bfOffBits, sizeof(unsigned int));
+	readme(fd, &bh.biSize, sizeof(unsigned int));
+	readme(fd, &bh.biWidth, sizeof(int));
+	readme(fd, &bh.biHeight, sizeof(int));
+	readme(fd, &bh.biPlanes, sizeof(unsigned short));
+	readme(fd, &bh.biBitCount, sizeof(unsigned short));
+	readme(fd, &bh.biCompression, sizeof(unsigned int));
+	readme(fd, &bh.biSizeImage, sizeof(unsigned int));
+	readme(fd, &bh.biXPelsPerMeter, sizeof(int));
+	readme(fd, &bh.biYPelsPerMeter, sizeof(int));
+	readme(fd, &bh.biClrUsed, sizeof(unsigned int));
+	readme(fd, &bh.biClrImportant, sizeof(unsigned int));
+
+	if(!littleendian())  /* header fields are stored little-endian in the file */
+	{
+		bh.bfSize=byteswap(bh.bfSize);
+		bh.bfOffBits=byteswap(bh.bfOffBits);
+		bh.biSize=byteswap(bh.biSize);
+		bh.biWidth=byteswap(bh.biWidth);
+		bh.biHeight=byteswap(bh.biHeight);
+		bh.biPlanes=byteswap16(bh.biPlanes);
+		bh.biBitCount=byteswap16(bh.biBitCount);
+		bh.biCompression=byteswap(bh.biCompression);
+		bh.biSizeImage=byteswap(bh.biSizeImage);
+		bh.biXPelsPerMeter=byteswap(bh.biXPelsPerMeter);
+		bh.biYPelsPerMeter=byteswap(bh.biYPelsPerMeter);
+		bh.biClrUsed=byteswap(bh.biClrUsed);
+		bh.biClrImportant=byteswap(bh.biClrImportant);
+	}
+
+	if(bh.bfType!=0x4d42 || bh.bfOffBits<BMPHDRSIZE
+		|| bh.biWidth<1 || bh.biHeight==0)
+		_throw("Corrupt bitmap header");
+	if((bh.biBitCount!=24 && bh.biBitCount!=32) || bh.biCompression!=BI_RGB)
+		_throw("Only uncompessed RGB bitmaps are supported");
+
+	*w=bh.biWidth;  *h=bh.biHeight;  srcps=bh.biBitCount/8;  /* srcps = source bytes per pixel (3 or 4) */
+	if(*h<0) {*h=-(*h);  srcbottomup=0;}  /* negative height marks a top-down bitmap */
+	srcpitch=(((*w)*srcps)+3)&(~3);  /* file rows are padded to a multiple of 4 bytes */
+	dstpitch=(((*w)*ps[f])+(align-1))&(~(align-1));
+
+	if(srcpitch*(*h)+bh.bfOffBits!=bh.bfSize) _throw("Corrupt bitmap header");
+	if((tempbuf=(unsigned char *)malloc(srcpitch*(*h)))==NULL
+		|| (*buf=(unsigned char *)malloc(dstpitch*(*h)))==NULL)
+		_throw("Memory allocation error");
+	if(lseek(fd, (long)bh.bfOffBits, SEEK_SET)!=(long)bh.bfOffBits)
+		_throw(strerror(errno));
+	_unix(bytesread=read(fd, tempbuf, srcpitch*(*h)));
+	if(bytesread!=srcpitch*(*h)) _throw("Read error");
+	/* 32-bit pixels occupy 4 bytes (B,G,R,unused), so they must be stepped as BGRA rather than 3-byte BGR. */
+	pixelconvert(tempbuf, srcps==4? BMP_BGRA:BMP_BGR, srcpitch, *buf, f, dstpitch,
+		*w, *h, srcbottomup!=dstbottomup);  /* flip only when source and destination row orders differ */
+
+	finally:
+	if(tempbuf) free(tempbuf);
+	if(fd!=-1) close(fd);
+	return retcode;
+}
+
+#define writeme(fd, addr, size) /* write exactly size bytes or fail; expands to two statements and needs a local byteswritten */ \
+ if((byteswritten=write(fd, addr, (size)))==-1) _throw(strerror(errno)); \
+ if(byteswritten!=(size)) _throw("Write error");
+/* Write buf as a binary (P6) PPM with 8-bit components; savebmp() calls this for ".ppm" file names.  Returns 0 or -1. */
+int saveppm(char *filename, unsigned char *buf, int w, int h,
+	enum BMPPIXELFORMAT f, int srcpitch, int srcbottomup)
+{
+	FILE *fs=NULL;  int retcode=0;
+	unsigned char *tempbuf=NULL;
+
+	if((fs=fopen(filename, "wb"))==NULL) _throw(strerror(errno));
+	if(fprintf(fs, "P6\n")<1) _throw("Write error");
+	if(fprintf(fs, "%d %d\n", w, h)<1) _throw("Write error");
+	if(fprintf(fs, "255\n")<1) _throw("Write error");
+
+	if((tempbuf=(unsigned char *)malloc(w*h*3))==NULL)
+		_throw("Memory allocation error");
+	/* PPM rows are written top-down, so the image is flipped only when the source is bottom-up. */
+	pixelconvert(buf, f, srcpitch, tempbuf, BMP_RGB, w*3, w, h,
+		srcbottomup);
+
+	if((fwrite(tempbuf, w*h*3, 1, fs))!=1) _throw("Write error");
+
+	finally:
+	if(tempbuf) free(tempbuf);
+	if(fs && fclose(fs)!=0 && retcode==0) {__bmperr=strerror(errno);  retcode=-1;}  /* fclose() flushes; a failure means the file is incomplete */
+	return retcode;
+}
+/* Save buf as a 24-bit uncompressed BMP, or hand off to saveppm() when filename has a ".ppm" extension.  Returns 0 or -1. */
+int savebmp(char *filename, unsigned char *buf, int w, int h,
+	enum BMPPIXELFORMAT f, int srcpitch, int srcbottomup)
+{
+	int fd=-1, byteswritten, dstpitch, retcode=0;
+	int flags=O_RDWR|O_CREAT|O_TRUNC;
+	unsigned char *tempbuf=NULL;  char *temp;
+	bmphdr bh;  int mode;
+
+	#ifdef _WIN32
+	flags|=O_BINARY;  mode=_S_IREAD|_S_IWRITE;
+	#else
+	mode=S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH;
+	#endif
+	if(!filename || !buf || w<1 || h<1 || f<0 || f>BMPPIXELFORMATS-1 || srcpitch<0)
+		_throw("bad argument to savebmp()");
+
+	if(srcpitch==0) srcpitch=w*ps[f];  /* 0 selects tightly packed source rows */
+
+	if((temp=strrchr(filename, '.'))!=NULL)
+	{
+		if(!stricmp(temp, ".ppm"))
+			return saveppm(filename, buf, w, h, f, srcpitch, srcbottomup);  /* nothing has been opened or allocated yet */
+	}
+
+	_unix(fd=open(filename, flags, mode));
+	dstpitch=((w*3)+3)&(~3);  /* 3 bytes per pixel, rows padded to a multiple of 4 bytes */
+
+	bh.bfType=0x4d42;
+	bh.bfSize=BMPHDRSIZE+dstpitch*h;
+	bh.bfReserved1=0;  bh.bfReserved2=0;
+	bh.bfOffBits=BMPHDRSIZE;
+	bh.biSize=40;
+	bh.biWidth=w;  bh.biHeight=h;
+	bh.biPlanes=1;  bh.biBitCount=24;  /* the BMP format requires biPlanes to be 1 */
+	bh.biCompression=BI_RGB;  bh.biSizeImage=0;
+	bh.biXPelsPerMeter=0;  bh.biYPelsPerMeter=0;
+	bh.biClrUsed=0;  bh.biClrImportant=0;
+
+	if(!littleendian())  /* header fields are stored little-endian in the file */
+	{
+		bh.bfType=byteswap16(bh.bfType);
+		bh.bfSize=byteswap(bh.bfSize);
+		bh.bfOffBits=byteswap(bh.bfOffBits);
+		bh.biSize=byteswap(bh.biSize);
+		bh.biWidth=byteswap(bh.biWidth);
+		bh.biHeight=byteswap(bh.biHeight);
+		bh.biPlanes=byteswap16(bh.biPlanes);
+		bh.biBitCount=byteswap16(bh.biBitCount);
+		bh.biCompression=byteswap(bh.biCompression);
+		bh.biSizeImage=byteswap(bh.biSizeImage);
+		bh.biXPelsPerMeter=byteswap(bh.biXPelsPerMeter);
+		bh.biYPelsPerMeter=byteswap(bh.biYPelsPerMeter);
+		bh.biClrUsed=byteswap(bh.biClrUsed);
+		bh.biClrImportant=byteswap(bh.biClrImportant);
+	}
+
+	writeme(fd, &bh.bfType, sizeof(unsigned short));
+	writeme(fd, &bh.bfSize, sizeof(unsigned int));
+	writeme(fd, &bh.bfReserved1, sizeof(unsigned short));
+	writeme(fd, &bh.bfReserved2, sizeof(unsigned short));
+	writeme(fd, &bh.bfOffBits, sizeof(unsigned int));
+	writeme(fd, &bh.biSize, sizeof(unsigned int));
+	writeme(fd, &bh.biWidth, sizeof(int));
+	writeme(fd, &bh.biHeight, sizeof(int));
+	writeme(fd, &bh.biPlanes, sizeof(unsigned short));
+	writeme(fd, &bh.biBitCount, sizeof(unsigned short));
+	writeme(fd, &bh.biCompression, sizeof(unsigned int));
+	writeme(fd, &bh.biSizeImage, sizeof(unsigned int));
+	writeme(fd, &bh.biXPelsPerMeter, sizeof(int));
+	writeme(fd, &bh.biYPelsPerMeter, sizeof(int));
+	writeme(fd, &bh.biClrUsed, sizeof(unsigned int));
+	writeme(fd, &bh.biClrImportant, sizeof(unsigned int));
+
+	if((tempbuf=(unsigned char *)malloc(dstpitch*h))==NULL)
+		_throw("Memory allocation error");
+
+	pixelconvert(buf, f, srcpitch, tempbuf, BMP_BGR, dstpitch, w, h,
+		!srcbottomup);  /* a positive biHeight means bottom-up rows, so a top-down source is flipped */
+
+	/* writeme() separates a failed write (errno text) from a short write ("Write error"), where errno would be stale. */
+	writeme(fd, tempbuf, dstpitch*h);
+
+	finally:
+	if(tempbuf) free(tempbuf);
+	if(fd!=-1) close(fd);
+	return retcode;
+}
+/* Return the most recently recorded error message for this file's functions ("No error" if none was recorded). */
+const char *bmpgeterr(void)
+{
+ return __bmperr;
+}
diff --git a/bmp.h b/bmp.h
new file mode 100644
index 0000000..437d327
--- /dev/null
+++ b/bmp.h
@@ -0,0 +1,48 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+*/
+
+// This provides rudimentary facilities for loading and saving true color
+// BMP and PPM files
+
+#ifndef __BMP_H__
+#define __BMP_H__
+
+#define BMPPIXELFORMATS 6 // number of enumerators in BMPPIXELFORMAT
+enum BMPPIXELFORMAT {BMP_RGB=0, BMP_RGBA, BMP_BGR, BMP_BGRA, BMP_ABGR, BMP_ARGB}; // each name spells the byte order of the components within a pixel
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This will load a Windows bitmap from a file and return a buffer with the
+// specified pixel format, scanline alignment, and orientation. The width and
+// height are returned in w and h.
+
+int loadbmp(char *filename, unsigned char **buf, int *w, int *h,
+ enum BMPPIXELFORMAT f, int align, int dstbottomup);
+
+// This will save a buffer with the specified pixel format, pitch, orientation,
+// width, and height as a 24-bit Windows bitmap or PPM (the filename determines
+// which format to use)
+
+int savebmp(char *filename, unsigned char *buf, int w, int h,
+ enum BMPPIXELFORMAT f, int srcpitch, int srcbottomup);
+
+const char *bmpgeterr(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/cjpeg.1 b/cjpeg.1
deleted file mode 100644
index d175a96..0000000
--- a/cjpeg.1
+++ /dev/null
@@ -1,292 +0,0 @@
-.TH CJPEG 1 "20 March 1998"
-.SH NAME
-cjpeg \- compress an image file to a JPEG file
-.SH SYNOPSIS
-.B cjpeg
-[
-.I options
-]
-[
-.I filename
-]
-.LP
-.SH DESCRIPTION
-.LP
-.B cjpeg
-compresses the named image file, or the standard input if no file is
-named, and produces a JPEG/JFIF file on the standard output.
-The currently supported input file formats are: PPM (PBMPLUS color
-format), PGM (PBMPLUS gray-scale format), BMP, Targa, and RLE (Utah Raster
-Toolkit format). (RLE is supported only if the URT library is available.)
-.SH OPTIONS
-All switch names may be abbreviated; for example,
-.B \-grayscale
-may be written
-.B \-gray
-or
-.BR \-gr .
-Most of the "basic" switches can be abbreviated to as little as one letter.
-Upper and lower case are equivalent (thus
-.B \-BMP
-is the same as
-.BR \-bmp ).
-British spellings are also accepted (e.g.,
-.BR \-greyscale ),
-though for brevity these are not mentioned below.
-.PP
-The basic switches are:
-.TP
-.BI \-quality " N"
-Scale quantization tables to adjust image quality. Quality is 0 (worst) to
-100 (best); default is 75. (See below for more info.)
-.TP
-.B \-grayscale
-Create monochrome JPEG file from color input. Be sure to use this switch when
-compressing a grayscale BMP file, because
-.B cjpeg
-isn't bright enough to notice whether a BMP file uses only shades of gray.
-By saying
-.BR \-grayscale ,
-you'll get a smaller JPEG file that takes less time to process.
-.TP
-.B \-optimize
-Perform optimization of entropy encoding parameters. Without this, default
-encoding parameters are used.
-.B \-optimize
-usually makes the JPEG file a little smaller, but
-.B cjpeg
-runs somewhat slower and needs much more memory. Image quality and speed of
-decompression are unaffected by
-.BR \-optimize .
-.TP
-.B \-progressive
-Create progressive JPEG file (see below).
-.TP
-.B \-targa
-Input file is Targa format. Targa files that contain an "identification"
-field will not be automatically recognized by
-.BR cjpeg ;
-for such files you must specify
-.B \-targa
-to make
-.B cjpeg
-treat the input as Targa format.
-For most Targa files, you won't need this switch.
-.PP
-The
-.B \-quality
-switch lets you trade off compressed file size against quality of the
-reconstructed image: the higher the quality setting, the larger the JPEG file,
-and the closer the output image will be to the original input. Normally you
-want to use the lowest quality setting (smallest file) that decompresses into
-something visually indistinguishable from the original image. For this
-purpose the quality setting should be between 50 and 95; the default of 75 is
-often about right. If you see defects at
-.B \-quality
-75, then go up 5 or 10 counts at a time until you are happy with the output
-image. (The optimal setting will vary from one image to another.)
-.PP
-.B \-quality
-100 will generate a quantization table of all 1's, minimizing loss in the
-quantization step (but there is still information loss in subsampling, as well
-as roundoff error). This setting is mainly of interest for experimental
-purposes. Quality values above about 95 are
-.B not
-recommended for normal use; the compressed file size goes up dramatically for
-hardly any gain in output image quality.
-.PP
-In the other direction, quality values below 50 will produce very small files
-of low image quality. Settings around 5 to 10 might be useful in preparing an
-index of a large image library, for example. Try
-.B \-quality
-2 (or so) for some amusing Cubist effects. (Note: quality
-values below about 25 generate 2-byte quantization tables, which are
-considered optional in the JPEG standard.
-.B cjpeg
-emits a warning message when you give such a quality value, because some
-other JPEG programs may be unable to decode the resulting file. Use
-.B \-baseline
-if you need to ensure compatibility at low quality values.)
-.PP
-The
-.B \-progressive
-switch creates a "progressive JPEG" file. In this type of JPEG file, the data
-is stored in multiple scans of increasing quality. If the file is being
-transmitted over a slow communications link, the decoder can use the first
-scan to display a low-quality image very quickly, and can then improve the
-display with each subsequent scan. The final image is exactly equivalent to a
-standard JPEG file of the same quality setting, and the total file size is
-about the same --- often a little smaller.
-.B Caution:
-progressive JPEG is not yet widely implemented, so many decoders will be
-unable to view a progressive JPEG file at all.
-.PP
-Switches for advanced users:
-.TP
-.B \-dct int
-Use integer DCT method (default).
-.TP
-.B \-dct fast
-Use fast integer DCT (less accurate).
-.TP
-.B \-dct float
-Use floating-point DCT method.
-The float method is very slightly more accurate than the int method, but is
-much slower unless your machine has very fast floating-point hardware. Also
-note that results of the floating-point method may vary slightly across
-machines, while the integer methods should give the same results everywhere.
-The fast integer method is much less accurate than the other two.
-.TP
-.BI \-restart " N"
-Emit a JPEG restart marker every N MCU rows, or every N MCU blocks if "B" is
-attached to the number.
-.B \-restart 0
-(the default) means no restart markers.
-.TP
-.BI \-smooth " N"
-Smooth the input image to eliminate dithering noise. N, ranging from 1 to
-100, indicates the strength of smoothing. 0 (the default) means no smoothing.
-.TP
-.BI \-maxmemory " N"
-Set limit for amount of memory to use in processing large images. Value is
-in thousands of bytes, or millions of bytes if "M" is attached to the
-number. For example,
-.B \-max 4m
-selects 4000000 bytes. If more space is needed, temporary files will be used.
-.TP
-.BI \-outfile " name"
-Send output image to the named file, not to standard output.
-.TP
-.B \-verbose
-Enable debug printout. More
-.BR \-v 's
-give more output. Also, version information is printed at startup.
-.TP
-.B \-debug
-Same as
-.BR \-verbose .
-.PP
-The
-.B \-restart
-option inserts extra markers that allow a JPEG decoder to resynchronize after
-a transmission error. Without restart markers, any damage to a compressed
-file will usually ruin the image from the point of the error to the end of the
-image; with restart markers, the damage is usually confined to the portion of
-the image up to the next restart marker. Of course, the restart markers
-occupy extra space. We recommend
-.B \-restart 1
-for images that will be transmitted across unreliable networks such as Usenet.
-.PP
-The
-.B \-smooth
-option filters the input to eliminate fine-scale noise. This is often useful
-when converting dithered images to JPEG: a moderate smoothing factor of 10 to
-50 gets rid of dithering patterns in the input file, resulting in a smaller
-JPEG file and a better-looking image. Too large a smoothing factor will
-visibly blur the image, however.
-.PP
-Switches for wizards:
-.TP
-.B \-baseline
-Force baseline-compatible quantization tables to be generated. This clamps
-quantization values to 8 bits even at low quality settings. (This switch is
-poorly named, since it does not ensure that the output is actually baseline
-JPEG. For example, you can use
-.B \-baseline
-and
-.B \-progressive
-together.)
-.TP
-.BI \-qtables " file"
-Use the quantization tables given in the specified text file.
-.TP
-.BI \-qslots " N[,...]"
-Select which quantization table to use for each color component.
-.TP
-.BI \-sample " HxV[,...]"
-Set JPEG sampling factors for each color component.
-.TP
-.BI \-scans " file"
-Use the scan script given in the specified text file.
-.PP
-The "wizard" switches are intended for experimentation with JPEG. If you
-don't know what you are doing, \fBdon't use them\fR. These switches are
-documented further in the file wizard.doc.
-.SH EXAMPLES
-.LP
-This example compresses the PPM file foo.ppm with a quality factor of
-60 and saves the output as foo.jpg:
-.IP
-.B cjpeg \-quality
-.I 60 foo.ppm
-.B >
-.I foo.jpg
-.SH HINTS
-Color GIF files are not the ideal input for JPEG; JPEG is really intended for
-compressing full-color (24-bit) images. In particular, don't try to convert
-cartoons, line drawings, and other images that have only a few distinct
-colors. GIF works great on these, JPEG does not. If you want to convert a
-GIF to JPEG, you should experiment with
-.BR cjpeg 's
-.B \-quality
-and
-.B \-smooth
-options to get a satisfactory conversion.
-.B \-smooth 10
-or so is often helpful.
-.PP
-Avoid running an image through a series of JPEG compression/decompression
-cycles. Image quality loss will accumulate; after ten or so cycles the image
-may be noticeably worse than it was after one cycle. It's best to use a
-lossless format while manipulating an image, then convert to JPEG format when
-you are ready to file the image away.
-.PP
-The
-.B \-optimize
-option to
-.B cjpeg
-is worth using when you are making a "final" version for posting or archiving.
-It's also a win when you are using low quality settings to make very small
-JPEG files; the percentage improvement is often a lot more than it is on
-larger files. (At present,
-.B \-optimize
-mode is always selected when generating progressive JPEG files.)
-.SH ENVIRONMENT
-.TP
-.B JPEGMEM
-If this environment variable is set, its value is the default memory limit.
-The value is specified as described for the
-.B \-maxmemory
-switch.
-.B JPEGMEM
-overrides the default value specified when the program was compiled, and
-itself is overridden by an explicit
-.BR \-maxmemory .
-.SH SEE ALSO
-.BR djpeg (1),
-.BR jpegtran (1),
-.BR rdjpgcom (1),
-.BR wrjpgcom (1)
-.br
-.BR ppm (5),
-.BR pgm (5)
-.br
-Wallace, Gregory K. "The JPEG Still Picture Compression Standard",
-Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
-.SH AUTHOR
-Independent JPEG Group
-.SH BUGS
-Arithmetic coding is not supported for legal reasons.
-.PP
-GIF input files are no longer supported, to avoid the Unisys LZW patent.
-Use a Unisys-licensed program if you need to read a GIF file. (Conversion
-of GIF files to JPEG is usually a bad idea anyway.)
-.PP
-Not all variants of BMP and Targa file formats are supported.
-.PP
-The
-.B \-targa
-switch is not a bug, it's a feature. (It would be a bug if the Targa format
-designers had not been clueless.)
-.PP
-Still not as fast as we'd like.
diff --git a/ckconfig.c b/ckconfig.c
deleted file mode 100644
index 34baf79..0000000
--- a/ckconfig.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * ckconfig.c
- *
- * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- */
-
-/*
- * This program is intended to help you determine how to configure the JPEG
- * software for installation on a particular system. The idea is to try to
- * compile and execute this program. If your compiler fails to compile the
- * program, make changes as indicated in the comments below. Once you can
- * compile the program, run it, and it will produce a "jconfig.h" file for
- * your system.
- *
- * As a general rule, each time you try to compile this program,
- * pay attention only to the *first* error message you get from the compiler.
- * Many C compilers will issue lots of spurious error messages once they
- * have gotten confused. Go to the line indicated in the first error message,
- * and read the comments preceding that line to see what to change.
- *
- * Almost all of the edits you may need to make to this program consist of
- * changing a line that reads "#define SOME_SYMBOL" to "#undef SOME_SYMBOL",
- * or vice versa. This is called defining or undefining that symbol.
- */
-
-
-/* First we must see if your system has the include files we need.
- * We start out with the assumption that your system has all the ANSI-standard
- * include files. If you get any error trying to include one of these files,
- * undefine the corresponding HAVE_xxx symbol.
- */
-
-#define HAVE_STDDEF_H /* replace 'define' by 'undef' if error here */
-#ifdef HAVE_STDDEF_H /* next line will be skipped if you undef... */
-#include <stddef.h>
-#endif
-
-#define HAVE_STDLIB_H /* same thing for stdlib.h */
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-
-#include <stdio.h> /* If you ain't got this, you ain't got C. */
-
-/* We have to see if your string functions are defined by
- * strings.h (old BSD convention) or string.h (everybody else).
- * We try the non-BSD convention first; define NEED_BSD_STRINGS
- * if the compiler says it can't find string.h.
- */
-
-#undef NEED_BSD_STRINGS
-
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#else
-#include <string.h>
-#endif
-
-/* On some systems (especially older Unix machines), type size_t is
- * defined only in the include file <sys/types.h>. If you get a failure
- * on the size_t test below, try defining NEED_SYS_TYPES_H.
- */
-
-#undef NEED_SYS_TYPES_H /* start by assuming we don't need it */
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
-
-/* Usually type size_t is defined in one of the include files we've included
- * above. If not, you'll get an error on the "typedef size_t my_size_t;" line.
- * In that case, first try defining NEED_SYS_TYPES_H just above.
- * If that doesn't work, you'll have to search through your system library
- * to figure out which include file defines "size_t". Look for a line that
- * says "typedef something-or-other size_t;". Then, change the line below
- * that says "#include <someincludefile.h>" to instead include the file
- * you found size_t in, and define NEED_SPECIAL_INCLUDE. If you can't find
- * type size_t anywhere, try replacing "#include <someincludefile.h>" with
- * "typedef unsigned int size_t;".
- */
-
-#undef NEED_SPECIAL_INCLUDE /* assume we DON'T need it, for starters */
-
-#ifdef NEED_SPECIAL_INCLUDE
-#include <someincludefile.h>
-#endif
-
-typedef size_t my_size_t; /* The payoff: do we have size_t now? */
-
-
-/* The next question is whether your compiler supports ANSI-style function
- * prototypes. You need to know this in order to choose between using
- * makefile.ansi and using makefile.unix.
- * The #define line below is set to assume you have ANSI function prototypes.
- * If you get an error in this group of lines, undefine HAVE_PROTOTYPES.
- */
-
-#define HAVE_PROTOTYPES
-
-#ifdef HAVE_PROTOTYPES
-int testfunction (int arg1, int * arg2); /* check prototypes */
-
-struct methods_struct { /* check method-pointer declarations */
- int (*error_exit) (char *msgtext);
- int (*trace_message) (char *msgtext);
- int (*another_method) (void);
-};
-
-int testfunction (int arg1, int * arg2) /* check definitions */
-{
- return arg2[arg1];
-}
-
-int test2function (void) /* check void arg list */
-{
- return 0;
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned char" means.
- * If you get an error on the "unsigned char un_char;" line,
- * then undefine HAVE_UNSIGNED_CHAR.
- */
-
-#define HAVE_UNSIGNED_CHAR
-
-#ifdef HAVE_UNSIGNED_CHAR
-unsigned char un_char;
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned short" means.
- * If you get an error on the "unsigned short un_short;" line,
- * then undefine HAVE_UNSIGNED_SHORT.
- */
-
-#define HAVE_UNSIGNED_SHORT
-
-#ifdef HAVE_UNSIGNED_SHORT
-unsigned short un_short;
-#endif
-
-
-/* Now we want to find out if your compiler understands type "void".
- * If you get an error anywhere in here, undefine HAVE_VOID.
- */
-
-#define HAVE_VOID
-
-#ifdef HAVE_VOID
-/* Caution: a C++ compiler will insist on complete prototypes */
-typedef void * void_ptr; /* check void * */
-#ifdef HAVE_PROTOTYPES /* check ptr to function returning void */
-typedef void (*void_func) (int a, int b);
-#else
-typedef void (*void_func) ();
-#endif
-
-#ifdef HAVE_PROTOTYPES /* check void function result */
-void test3function (void_ptr arg1, void_func arg2)
-#else
-void test3function (arg1, arg2)
- void_ptr arg1;
- void_func arg2;
-#endif
-{
- char * locptr = (char *) arg1; /* check casting to and from void * */
- arg1 = (void *) locptr;
- (*arg2) (1, 2); /* check call of fcn returning void */
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "const" means.
- * If you get an error here, undefine HAVE_CONST.
- */
-
-#define HAVE_CONST
-
-#ifdef HAVE_CONST
-static const int carray[3] = {1, 2, 3};
-
-#ifdef HAVE_PROTOTYPES
-int test4function (const int arg1)
-#else
-int test4function (arg1)
- const int arg1;
-#endif
-{
- return carray[arg1];
-}
-#endif
-
-
-/* If you get an error or warning about this structure definition,
- * define INCOMPLETE_TYPES_BROKEN.
- */
-
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifndef INCOMPLETE_TYPES_BROKEN
-typedef struct undefined_structure * undef_struct_ptr;
-#endif
-
-
-/* If you get an error about duplicate names,
- * define NEED_SHORT_EXTERNAL_NAMES.
- */
-
-#undef NEED_SHORT_EXTERNAL_NAMES
-
-#ifndef NEED_SHORT_EXTERNAL_NAMES
-
-int possibly_duplicate_function ()
-{
- return 0;
-}
-
-int possibly_dupli_function ()
-{
- return 1;
-}
-
-#endif
-
-
-
-/************************************************************************
- * OK, that's it. You should not have to change anything beyond this
- * point in order to compile and execute this program. (You might get
- * some warnings, but you can ignore them.)
- * When you run the program, it will make a couple more tests that it
- * can do automatically, and then it will create jconfig.h and print out
- * any additional suggestions it has.
- ************************************************************************
- */
-
-
-#ifdef HAVE_PROTOTYPES
-int is_char_signed (int arg)
-#else
-int is_char_signed (arg)
- int arg;
-#endif
-{
- if (arg == 189) { /* expected result for unsigned char */
- return 0; /* type char is unsigned */
- }
- else if (arg != -67) { /* expected result for signed char */
- printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
- printf("I fear the JPEG software will not work at all.\n\n");
- }
- return 1; /* assume char is signed otherwise */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int is_shifting_signed (long arg)
-#else
-int is_shifting_signed (arg)
- long arg;
-#endif
-/* See whether right-shift on a long is signed or not. */
-{
- long res = arg >> 4;
-
- if (res == -0x7F7E80CL) { /* expected result for signed shift */
- return 1; /* right shift is signed */
- }
- /* see if unsigned-shift hack will fix it. */
- /* we can't just test exact value since it depends on width of long... */
- res |= (~0L) << (32-4);
- if (res == -0x7F7E80CL) { /* expected result now? */
- return 0; /* right shift is unsigned */
- }
- printf("Right shift isn't acting as I expect it to.\n");
- printf("I fear the JPEG software will not work at all.\n\n");
- return 0; /* try it with unsigned anyway */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int main (int argc, char ** argv)
-#else
-int main (argc, argv)
- int argc;
- char ** argv;
-#endif
-{
- char signed_char_check = (char) (-67);
- FILE *outfile;
-
- /* Attempt to write jconfig.h */
- if ((outfile = fopen("jconfig.h", "w")) == NULL) {
- printf("Failed to write jconfig.h\n");
- return 1;
- }
-
- /* Write out all the info */
- fprintf(outfile, "/* jconfig.h --- generated by ckconfig.c */\n");
- fprintf(outfile, "/* see jconfig.doc for explanations */\n\n");
-#ifdef HAVE_PROTOTYPES
- fprintf(outfile, "#define HAVE_PROTOTYPES\n");
-#else
- fprintf(outfile, "#undef HAVE_PROTOTYPES\n");
-#endif
-#ifdef HAVE_UNSIGNED_CHAR
- fprintf(outfile, "#define HAVE_UNSIGNED_CHAR\n");
-#else
- fprintf(outfile, "#undef HAVE_UNSIGNED_CHAR\n");
-#endif
-#ifdef HAVE_UNSIGNED_SHORT
- fprintf(outfile, "#define HAVE_UNSIGNED_SHORT\n");
-#else
- fprintf(outfile, "#undef HAVE_UNSIGNED_SHORT\n");
-#endif
-#ifdef HAVE_VOID
- fprintf(outfile, "/* #define void char */\n");
-#else
- fprintf(outfile, "#define void char\n");
-#endif
-#ifdef HAVE_CONST
- fprintf(outfile, "/* #define const */\n");
-#else
- fprintf(outfile, "#define const\n");
-#endif
- if (is_char_signed((int) signed_char_check))
- fprintf(outfile, "#undef CHAR_IS_UNSIGNED\n");
- else
- fprintf(outfile, "#define CHAR_IS_UNSIGNED\n");
-#ifdef HAVE_STDDEF_H
- fprintf(outfile, "#define HAVE_STDDEF_H\n");
-#else
- fprintf(outfile, "#undef HAVE_STDDEF_H\n");
-#endif
-#ifdef HAVE_STDLIB_H
- fprintf(outfile, "#define HAVE_STDLIB_H\n");
-#else
- fprintf(outfile, "#undef HAVE_STDLIB_H\n");
-#endif
-#ifdef NEED_BSD_STRINGS
- fprintf(outfile, "#define NEED_BSD_STRINGS\n");
-#else
- fprintf(outfile, "#undef NEED_BSD_STRINGS\n");
-#endif
-#ifdef NEED_SYS_TYPES_H
- fprintf(outfile, "#define NEED_SYS_TYPES_H\n");
-#else
- fprintf(outfile, "#undef NEED_SYS_TYPES_H\n");
-#endif
- fprintf(outfile, "#undef NEED_FAR_POINTERS\n");
-#ifdef NEED_SHORT_EXTERNAL_NAMES
- fprintf(outfile, "#define NEED_SHORT_EXTERNAL_NAMES\n");
-#else
- fprintf(outfile, "#undef NEED_SHORT_EXTERNAL_NAMES\n");
-#endif
-#ifdef INCOMPLETE_TYPES_BROKEN
- fprintf(outfile, "#define INCOMPLETE_TYPES_BROKEN\n");
-#else
- fprintf(outfile, "#undef INCOMPLETE_TYPES_BROKEN\n");
-#endif
- fprintf(outfile, "\n#ifdef JPEG_INTERNALS\n\n");
- if (is_shifting_signed(-0x7F7E80B1L))
- fprintf(outfile, "#undef RIGHT_SHIFT_IS_UNSIGNED\n");
- else
- fprintf(outfile, "#define RIGHT_SHIFT_IS_UNSIGNED\n");
- fprintf(outfile, "\n#endif /* JPEG_INTERNALS */\n");
- fprintf(outfile, "\n#ifdef JPEG_CJPEG_DJPEG\n\n");
- fprintf(outfile, "#define BMP_SUPPORTED /* BMP image file format */\n");
- fprintf(outfile, "#define GIF_SUPPORTED /* GIF image file format */\n");
- fprintf(outfile, "#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */\n");
- fprintf(outfile, "#undef RLE_SUPPORTED /* Utah RLE image file format */\n");
- fprintf(outfile, "#define TARGA_SUPPORTED /* Targa image file format */\n\n");
- fprintf(outfile, "#undef TWO_FILE_COMMANDLINE /* You may need this on non-Unix systems */\n");
- fprintf(outfile, "#undef NEED_SIGNAL_CATCHER /* Define this if you use jmemname.c */\n");
- fprintf(outfile, "#undef DONT_USE_B_MODE\n");
- fprintf(outfile, "/* #define PROGRESS_REPORT */ /* optional */\n");
- fprintf(outfile, "\n#endif /* JPEG_CJPEG_DJPEG */\n");
-
- /* Close the jconfig.h file */
- fclose(outfile);
-
- /* User report */
- printf("Configuration check for Independent JPEG Group's software done.\n");
- printf("\nI have written the jconfig.h file for you.\n\n");
-#ifdef HAVE_PROTOTYPES
- printf("You should use makefile.ansi as the starting point for your Makefile.\n");
-#else
- printf("You should use makefile.unix as the starting point for your Makefile.\n");
-#endif
-
-#ifdef NEED_SPECIAL_INCLUDE
- printf("\nYou'll need to change jconfig.h to include the system include file\n");
- printf("that you found type size_t in, or add a direct definition of type\n");
- printf("size_t if that's what you used. Just add it to the end.\n");
-#endif
-
- return 0;
-}
diff --git a/config.guess b/config.guess
deleted file mode 100755
index 413ed41..0000000
--- a/config.guess
+++ /dev/null
@@ -1,883 +0,0 @@
-#! /bin/sh
-# Attempt to guess a canonical system name.
-# Copyright (C) 1992, 93, 94, 95, 96, 1997 Free Software Foundation, Inc.
-#
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Written by Per Bothner <bothner@cygnus.com>.
-# The master version of this file is at the FSF in /home/gd/gnu/lib.
-#
-# This script attempts to guess a canonical system name similar to
-# config.sub. If it succeeds, it prints the system name on stdout, and
-# exits with 0. Otherwise, it exits with 1.
-#
-# The plan is that this can be called by configure scripts if you
-# don't specify an explicit system type (host/target name).
-#
-# Only a few systems have been added to this list; please add others
-# (but try to keep the structure clean).
-#
-
-# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
-# (ghazi@noc.rutgers.edu 8/24/94.)
-if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
- PATH=$PATH:/.attbin ; export PATH
-fi
-
-UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
-UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
-UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
-UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
-
-trap 'rm -f dummy.c dummy.o dummy; exit 1' 1 2 15
-
-# Note: order is significant - the case branches are not exclusive.
-
-case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
- alpha:OSF1:*:*)
- if test $UNAME_RELEASE = "V4.0"; then
- UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
- fi
- # A Vn.n version is a released version.
- # A Tn.n version is a released field test version.
- # A Xn.n version is an unreleased experimental baselevel.
- # 1.2 uses "1.2" for uname -r.
- cat <<EOF >dummy.s
- .globl main
- .ent main
-main:
- .frame \$30,0,\$26,0
- .prologue 0
- .long 0x47e03d80 # implver $0
- lda \$2,259
- .long 0x47e20c21 # amask $2,$1
- srl \$1,8,\$2
- sll \$2,2,\$2
- sll \$0,3,\$0
- addl \$1,\$0,\$0
- addl \$2,\$0,\$0
- ret \$31,(\$26),1
- .end main
-EOF
- ${CC-cc} dummy.s -o dummy 2>/dev/null
- if test "$?" = 0 ; then
- ./dummy
- case "$?" in
- 7)
- UNAME_MACHINE="alpha"
- ;;
- 15)
- UNAME_MACHINE="alphaev5"
- ;;
- 14)
- UNAME_MACHINE="alphaev56"
- ;;
- 10)
- UNAME_MACHINE="alphapca56"
- ;;
- 16)
- UNAME_MACHINE="alphaev6"
- ;;
- esac
- fi
- rm -f dummy.s dummy
- echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr [[A-Z]] [[a-z]]`
- exit 0 ;;
- 21064:Windows_NT:50:3)
- echo alpha-dec-winnt3.5
- exit 0 ;;
- Amiga*:UNIX_System_V:4.0:*)
- echo m68k-cbm-sysv4
- exit 0;;
- amiga:NetBSD:*:*)
- echo m68k-cbm-netbsd${UNAME_RELEASE}
- exit 0 ;;
- amiga:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- arc64:OpenBSD:*:*)
- echo mips64el-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- arc:OpenBSD:*:*)
- echo mipsel-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- hkmips:OpenBSD:*:*)
- echo mips-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- pmax:OpenBSD:*:*)
- echo mipsel-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- sgi:OpenBSD:*:*)
- echo mips-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- wgrisc:OpenBSD:*:*)
- echo mipsel-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
- echo arm-acorn-riscix${UNAME_RELEASE}
- exit 0;;
- arm32:NetBSD:*:*)
- echo arm-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
- exit 0 ;;
- SR2?01:HI-UX/MPP:*:*)
- echo hppa1.1-hitachi-hiuxmpp
- exit 0;;
- Pyramid*:OSx*:*:*|MIS*:OSx*:*:*)
- # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
- if test "`(/bin/universe) 2>/dev/null`" = att ; then
- echo pyramid-pyramid-sysv3
- else
- echo pyramid-pyramid-bsd
- fi
- exit 0 ;;
- NILE:*:*:dcosx)
- echo pyramid-pyramid-svr4
- exit 0 ;;
- sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
- echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
- i86pc:SunOS:5.*:*)
- echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
- sun4*:SunOS:6*:*)
- # According to config.sub, this is the proper way to canonicalize
- # SunOS6. Hard to guess exactly what SunOS6 will be like, but
- # it's likely to be more like Solaris than SunOS4.
- echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
- sun4*:SunOS:*:*)
- case "`/usr/bin/arch -k`" in
- Series*|S4*)
- UNAME_RELEASE=`uname -v`
- ;;
- esac
- # Japanese Language versions have a version number like `4.1.3-JL'.
- echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
- exit 0 ;;
- sun3*:SunOS:*:*)
- echo m68k-sun-sunos${UNAME_RELEASE}
- exit 0 ;;
- sun*:*:4.2BSD:*)
- UNAME_RELEASE=`(head -1 /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
- test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
- case "`/bin/arch`" in
- sun3)
- echo m68k-sun-sunos${UNAME_RELEASE}
- ;;
- sun4)
- echo sparc-sun-sunos${UNAME_RELEASE}
- ;;
- esac
- exit 0 ;;
- aushp:SunOS:*:*)
- echo sparc-auspex-sunos${UNAME_RELEASE}
- exit 0 ;;
- atari*:NetBSD:*:*)
- echo m68k-atari-netbsd${UNAME_RELEASE}
- exit 0 ;;
- atari*:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- sun3*:NetBSD:*:*)
- echo m68k-sun-netbsd${UNAME_RELEASE}
- exit 0 ;;
- sun3*:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- mac68k:NetBSD:*:*)
- echo m68k-apple-netbsd${UNAME_RELEASE}
- exit 0 ;;
- mac68k:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- mvme68k:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- mvme88k:OpenBSD:*:*)
- echo m88k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- powerpc:machten:*:*)
- echo powerpc-apple-machten${UNAME_RELEASE}
- exit 0 ;;
- RISC*:Mach:*:*)
- echo mips-dec-mach_bsd4.3
- exit 0 ;;
- RISC*:ULTRIX:*:*)
- echo mips-dec-ultrix${UNAME_RELEASE}
- exit 0 ;;
- VAX*:ULTRIX*:*:*)
- echo vax-dec-ultrix${UNAME_RELEASE}
- exit 0 ;;
- 2020:CLIX:*:*)
- echo clipper-intergraph-clix${UNAME_RELEASE}
- exit 0 ;;
- mips:*:*:UMIPS | mips:*:*:RISCos)
- sed 's/^ //' << EOF >dummy.c
- int main (argc, argv) int argc; char **argv; {
- #if defined (host_mips) && defined (MIPSEB)
- #if defined (SYSTYPE_SYSV)
- printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
- #endif
- #if defined (SYSTYPE_SVR4)
- printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
- #endif
- #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
- printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
- #endif
- #endif
- exit (-1);
- }
-EOF
- ${CC-cc} dummy.c -o dummy \
- && ./dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \
- && rm dummy.c dummy && exit 0
- rm -f dummy.c dummy
- echo mips-mips-riscos${UNAME_RELEASE}
- exit 0 ;;
- Night_Hawk:Power_UNIX:*:*)
- echo powerpc-harris-powerunix
- exit 0 ;;
- m88k:CX/UX:7*:*)
- echo m88k-harris-cxux7
- exit 0 ;;
- m88k:*:4*:R4*)
- echo m88k-motorola-sysv4
- exit 0 ;;
- m88k:*:3*:R3*)
- echo m88k-motorola-sysv3
- exit 0 ;;
- AViiON:dgux:*:*)
- # DG/UX returns AViiON for all architectures
- UNAME_PROCESSOR=`/usr/bin/uname -p`
- if [ $UNAME_PROCESSOR = mc88100 -o $UNAME_PROCESSOR = mc88110 ] ; then
- if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx \
- -o ${TARGET_BINARY_INTERFACE}x = x ] ; then
- echo m88k-dg-dgux${UNAME_RELEASE}
- else
- echo m88k-dg-dguxbcs${UNAME_RELEASE}
- fi
- else echo i586-dg-dgux${UNAME_RELEASE}
- fi
- exit 0 ;;
- M88*:DolphinOS:*:*) # DolphinOS (SVR3)
- echo m88k-dolphin-sysv3
- exit 0 ;;
- M88*:*:R3*:*)
- # Delta 88k system running SVR3
- echo m88k-motorola-sysv3
- exit 0 ;;
- XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
- echo m88k-tektronix-sysv3
- exit 0 ;;
- Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
- echo m68k-tektronix-bsd
- exit 0 ;;
- *:IRIX*:*:*)
- echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
- exit 0 ;;
- ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
- echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id
- exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX '
- i?86:AIX:*:*)
- echo i386-ibm-aix
- exit 0 ;;
- *:AIX:2:3)
- if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
- sed 's/^ //' << EOF >dummy.c
- #include <sys/systemcfg.h>
-
- main()
- {
- if (!__power_pc())
- exit(1);
- puts("powerpc-ibm-aix3.2.5");
- exit(0);
- }
-EOF
- ${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0
- rm -f dummy.c dummy
- echo rs6000-ibm-aix3.2.5
- elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
- echo rs6000-ibm-aix3.2.4
- else
- echo rs6000-ibm-aix3.2
- fi
- exit 0 ;;
- *:AIX:*:4)
- if /usr/sbin/lsattr -EHl proc0 | grep POWER >/dev/null 2>&1; then
- IBM_ARCH=rs6000
- else
- IBM_ARCH=powerpc
- fi
- if [ -x /usr/bin/oslevel ] ; then
- IBM_REV=`/usr/bin/oslevel`
- else
- IBM_REV=4.${UNAME_RELEASE}
- fi
- echo ${IBM_ARCH}-ibm-aix${IBM_REV}
- exit 0 ;;
- *:AIX:*:*)
- echo rs6000-ibm-aix
- exit 0 ;;
- ibmrt:4.4BSD:*|romp-ibm:BSD:*)
- echo romp-ibm-bsd4.4
- exit 0 ;;
- ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC NetBSD and
- echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to
- exit 0 ;; # report: romp-ibm BSD 4.3
- *:BOSX:*:*)
- echo rs6000-bull-bosx
- exit 0 ;;
- DPX/2?00:B.O.S.:*:*)
- echo m68k-bull-sysv3
- exit 0 ;;
- 9000/[34]??:4.3bsd:1.*:*)
- echo m68k-hp-bsd
- exit 0 ;;
- hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
- echo m68k-hp-bsd4.4
- exit 0 ;;
- 9000/[3478]??:HP-UX:*:*)
- case "${UNAME_MACHINE}" in
- 9000/31? ) HP_ARCH=m68000 ;;
- 9000/[34]?? ) HP_ARCH=m68k ;;
- 9000/7?? | 9000/8?[1679] ) HP_ARCH=hppa1.1 ;;
- 9000/8?? ) HP_ARCH=hppa1.0 ;;
- esac
- HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
- echo ${HP_ARCH}-hp-hpux${HPUX_REV}
- exit 0 ;;
- 3050*:HI-UX:*:*)
- sed 's/^ //' << EOF >dummy.c
- #include <unistd.h>
- int
- main ()
- {
- long cpu = sysconf (_SC_CPU_VERSION);
- /* The order matters, because CPU_IS_HP_MC68K erroneously returns
- true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct
- results, however. */
- if (CPU_IS_PA_RISC (cpu))
- {
- switch (cpu)
- {
- case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
- case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
- case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
- default: puts ("hppa-hitachi-hiuxwe2"); break;
- }
- }
- else if (CPU_IS_HP_MC68K (cpu))
- puts ("m68k-hitachi-hiuxwe2");
- else puts ("unknown-hitachi-hiuxwe2");
- exit (0);
- }
-EOF
- ${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0
- rm -f dummy.c dummy
- echo unknown-hitachi-hiuxwe2
- exit 0 ;;
- 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
- echo hppa1.1-hp-bsd
- exit 0 ;;
- 9000/8??:4.3bsd:*:*)
- echo hppa1.0-hp-bsd
- exit 0 ;;
- hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
- echo hppa1.1-hp-osf
- exit 0 ;;
- hp8??:OSF1:*:*)
- echo hppa1.0-hp-osf
- exit 0 ;;
- i?86:OSF1:*:*)
- if [ -x /usr/sbin/sysversion ] ; then
- echo ${UNAME_MACHINE}-unknown-osf1mk
- else
- echo ${UNAME_MACHINE}-unknown-osf1
- fi
- exit 0 ;;
- parisc*:Lites*:*:*)
- echo hppa1.1-hp-lites
- exit 0 ;;
- C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
- echo c1-convex-bsd
- exit 0 ;;
- C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
- if getsysinfo -f scalar_acc
- then echo c32-convex-bsd
- else echo c2-convex-bsd
- fi
- exit 0 ;;
- C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
- echo c34-convex-bsd
- exit 0 ;;
- C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
- echo c38-convex-bsd
- exit 0 ;;
- C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
- echo c4-convex-bsd
- exit 0 ;;
- CRAY*X-MP:*:*:*)
- echo xmp-cray-unicos
- exit 0 ;;
- CRAY*Y-MP:*:*:*)
- echo ymp-cray-unicos${UNAME_RELEASE}
- exit 0 ;;
- CRAY*[A-Z]90:*:*:*)
- echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
- | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
- -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/
- exit 0 ;;
- CRAY*TS:*:*:*)
- echo t90-cray-unicos${UNAME_RELEASE}
- exit 0 ;;
- CRAY-2:*:*:*)
- echo cray2-cray-unicos
- exit 0 ;;
- F300:UNIX_System_V:*:*)
- FUJITSU_SYS=`uname -p | tr [A-Z] [a-z] | sed -e 's/\///'`
- FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
- echo "f300-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
- exit 0 ;;
- F301:UNIX_System_V:*:*)
- echo f301-fujitsu-uxpv`echo $UNAME_RELEASE | sed 's/ .*//'`
- exit 0 ;;
- hp3[0-9][05]:NetBSD:*:*)
- echo m68k-hp-netbsd${UNAME_RELEASE}
- exit 0 ;;
- hp300:OpenBSD:*:*)
- echo m68k-unknown-openbsd${UNAME_RELEASE}
- exit 0 ;;
- i?86:BSD/386:*:* | *:BSD/OS:*:*)
- echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
- exit 0 ;;
- *:FreeBSD:*:*)
- echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
- exit 0 ;;
- *:NetBSD:*:*)
- echo ${UNAME_MACHINE}-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
- exit 0 ;;
- *:OpenBSD:*:*)
- echo ${UNAME_MACHINE}-unknown-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
- exit 0 ;;
- i*:CYGWIN*:*)
- echo i386-pc-cygwin32
- exit 0 ;;
- i*:MINGW*:*)
- echo i386-pc-mingw32
- exit 0 ;;
- p*:CYGWIN*:*)
- echo powerpcle-unknown-cygwin32
- exit 0 ;;
- prep*:SunOS:5.*:*)
- echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
- exit 0 ;;
- *:GNU:*:*)
- echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
- exit 0 ;;
- *:Linux:*:*)
- # The BFD linker knows what the default object file format is, so
- # first see if it will tell us.
- ld_help_string=`ld --help 2>&1`
- ld_supported_emulations=`echo $ld_help_string \
- | sed -ne '/supported emulations:/!d
- s/[ ][ ]*/ /g
- s/.*supported emulations: *//
- s/ .*//
- p'`
- case "$ld_supported_emulations" in
- i?86linux) echo "${UNAME_MACHINE}-pc-linux-gnuaout" ; exit 0 ;;
- i?86coff) echo "${UNAME_MACHINE}-pc-linux-gnucoff" ; exit 0 ;;
- sparclinux) echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;;
- m68klinux) echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;;
- elf32ppc) echo "powerpc-unknown-linux-gnu" ; exit 0 ;;
- esac
-
- if test "${UNAME_MACHINE}" = "alpha" ; then
- sed 's/^ //' <<EOF >dummy.s
- .globl main
- .ent main
- main:
- .frame \$30,0,\$26,0
- .prologue 0
- .long 0x47e03d80 # implver $0
- lda \$2,259
- .long 0x47e20c21 # amask $2,$1
- srl \$1,8,\$2
- sll \$2,2,\$2
- sll \$0,3,\$0
- addl \$1,\$0,\$0
- addl \$2,\$0,\$0
- ret \$31,(\$26),1
- .end main
-EOF
- LIBC=""
- ${CC-cc} dummy.s -o dummy 2>/dev/null
- if test "$?" = 0 ; then
- ./dummy
- case "$?" in
- 7)
- UNAME_MACHINE="alpha"
- ;;
- 15)
- UNAME_MACHINE="alphaev5"
- ;;
- 14)
- UNAME_MACHINE="alphaev56"
- ;;
- 10)
- UNAME_MACHINE="alphapca56"
- ;;
- 16)
- UNAME_MACHINE="alphaev6"
- ;;
- esac
-
- objdump --private-headers dummy | \
- grep ld.so.1 > /dev/null
- if test "$?" = 0 ; then
- LIBC="libc1"
- fi
- fi
- rm -f dummy.s dummy
- echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} ; exit 0
- elif test "${UNAME_MACHINE}" = "mips" ; then
- cat >dummy.c <<EOF
-main(argc, argv)
- int argc;
- char *argv[];
-{
-#ifdef __MIPSEB__
- printf ("%s-unknown-linux-gnu\n", argv[1]);
-#endif
-#ifdef __MIPSEL__
- printf ("%sel-unknown-linux-gnu\n", argv[1]);
-#endif
- return 0;
-}
-EOF
- ${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0
- rm -f dummy.c dummy
- else
- # Either a pre-BFD a.out linker (linux-gnuoldld)
- # or one that does not give us useful --help.
- # GCC wants to distinguish between linux-gnuoldld and linux-gnuaout.
- # If ld does not provide *any* "supported emulations:"
- # that means it is gnuoldld.
- echo "$ld_help_string" | grep >/dev/null 2>&1 "supported emulations:"
- test $? != 0 && echo "${UNAME_MACHINE}-pc-linux-gnuoldld" && exit 0
-
- case "${UNAME_MACHINE}" in
- i?86)
- VENDOR=pc;
- ;;
- *)
- VENDOR=unknown;
- ;;
- esac
- # Determine whether the default compiler is a.out or elf
- cat >dummy.c <<EOF
-#include <features.h>
-main(argc, argv)
- int argc;
- char *argv[];
-{
-#ifdef __ELF__
-# ifdef __GLIBC__
-# if __GLIBC__ >= 2
- printf ("%s-${VENDOR}-linux-gnu\n", argv[1]);
-# else
- printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
-# endif
-# else
- printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
-# endif
-#else
- printf ("%s-${VENDOR}-linux-gnuaout\n", argv[1]);
-#endif
- return 0;
-}
-EOF
- ${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0
- rm -f dummy.c dummy
- fi ;;
-# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. earlier versions
-# are messed up and put the nodename in both sysname and nodename.
- i?86:DYNIX/ptx:4*:*)
- echo i386-sequent-sysv4
- exit 0 ;;
- i?86:UNIX_SV:4.2MP:2.*)
- # Unixware is an offshoot of SVR4, but it has its own version
- # number series starting with 2...
- # I am not positive that other SVR4 systems won't match this,
- # I just have to hope. -- rms.
- # Use sysv4.2uw... so that sysv4* matches it.
- echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
- exit 0 ;;
- i?86:*:4.*:* | i?86:SYSTEM_V:4.*:*)
- if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
- echo ${UNAME_MACHINE}-univel-sysv${UNAME_RELEASE}
- else
- echo ${UNAME_MACHINE}-pc-sysv${UNAME_RELEASE}
- fi
- exit 0 ;;
- i?86:*:3.2:*)
- if test -f /usr/options/cb.name; then
- UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
- echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
- elif /bin/uname -X 2>/dev/null >/dev/null ; then
- UNAME_REL=`(/bin/uname -X|egrep Release|sed -e 's/.*= //')`
- (/bin/uname -X|egrep i80486 >/dev/null) && UNAME_MACHINE=i486
- (/bin/uname -X|egrep '^Machine.*Pentium' >/dev/null) \
- && UNAME_MACHINE=i586
- echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
- else
- echo ${UNAME_MACHINE}-pc-sysv32
- fi
- exit 0 ;;
- pc:*:*:*)
- # uname -m prints for DJGPP always 'pc', but it prints nothing about
- # the processor, so we play safe by assuming i386.
- echo i386-pc-msdosdjgpp
- exit 0 ;;
- Intel:Mach:3*:*)
- echo i386-pc-mach3
- exit 0 ;;
- paragon:*:*:*)
- echo i860-intel-osf1
- exit 0 ;;
- i860:*:4.*:*) # i860-SVR4
- if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
- echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
- else # Add other i860-SVR4 vendors below as they are discovered.
- echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4
- fi
- exit 0 ;;
- mini*:CTIX:SYS*5:*)
- # "miniframe"
- echo m68010-convergent-sysv
- exit 0 ;;
- M68*:*:R3V[567]*:*)
- test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;;
- 3[34]??:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 4850:*:4.0:3.0)
- OS_REL=''
- test -r /etc/.relid \
- && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
- /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
- && echo i486-ncr-sysv4.3${OS_REL} && exit 0
- /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
- && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;;
- 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
- /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
- && echo i486-ncr-sysv4 && exit 0 ;;
- m68*:LynxOS:2.*:*)
- echo m68k-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
- mc68030:UNIX_System_V:4.*:*)
- echo m68k-atari-sysv4
- exit 0 ;;
- i?86:LynxOS:2.*:*)
- echo i386-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
- TSUNAMI:LynxOS:2.*:*)
- echo sparc-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
- rs6000:LynxOS:2.*:* | PowerPC:LynxOS:2.*:*)
- echo rs6000-unknown-lynxos${UNAME_RELEASE}
- exit 0 ;;
- SM[BE]S:UNIX_SV:*:*)
- echo mips-dde-sysv${UNAME_RELEASE}
- exit 0 ;;
- RM*:SINIX-*:*:*)
- echo mips-sni-sysv4
- exit 0 ;;
- *:SINIX-*:*:*)
- if uname -p 2>/dev/null >/dev/null ; then
- UNAME_MACHINE=`(uname -p) 2>/dev/null`
- echo ${UNAME_MACHINE}-sni-sysv4
- else
- echo ns32k-sni-sysv
- fi
- exit 0 ;;
- PENTIUM:CPunix:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
- # says <Richard.M.Bartel@ccMail.Census.GOV>
- echo i586-unisys-sysv4
- exit 0 ;;
- *:UNIX_System_V:4*:FTX*)
- # From Gerald Hewes <hewes@openmarket.com>.
- # How about differentiating between stratus architectures? -djm
- echo hppa1.1-stratus-sysv4
- exit 0 ;;
- *:*:*:FTX*)
- # From seanf@swdc.stratus.com.
- echo i860-stratus-sysv4
- exit 0 ;;
- mc68*:A/UX:*:*)
- echo m68k-apple-aux${UNAME_RELEASE}
- exit 0 ;;
- news*:NEWS-OS:*:6*)
- echo mips-sony-newsos6
- exit 0 ;;
- R3000:*System_V*:*:* | R4000:UNIX_SYSV:*:*)
- if [ -d /usr/nec ]; then
- echo mips-nec-sysv${UNAME_RELEASE}
- else
- echo mips-unknown-sysv${UNAME_RELEASE}
- fi
- exit 0 ;;
-esac
-
-#echo '(No uname command or uname output not recognized.)' 1>&2
-#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
-
-cat >dummy.c <<EOF
-#ifdef _SEQUENT_
-# include <sys/types.h>
-# include <sys/utsname.h>
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
- /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
- I don't know.... */
- printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
- printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
- "4"
-#else
- ""
-#endif
- ); exit (0);
-#endif
-#endif
-
-#if defined (__arm) && defined (__acorn) && defined (__unix)
- printf ("arm-acorn-riscix"); exit (0);
-#endif
-
-#if defined (hp300) && !defined (hpux)
- printf ("m68k-hp-bsd\n"); exit (0);
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
- int version;
- version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
- printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
- exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
- printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
- printf ("ns32k-encore-mach\n"); exit (0);
-#else
- printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
- printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
- printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
- printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
- struct utsname un;
-
- uname(&un);
-
- if (strncmp(un.version, "V2", 2) == 0) {
- printf ("i386-sequent-ptx2\n"); exit (0);
- }
- if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
- printf ("i386-sequent-ptx1\n"); exit (0);
- }
- printf ("i386-sequent-ptx\n"); exit (0);
-
-#endif
-
-#if defined (vax)
-#if !defined (ultrix)
- printf ("vax-dec-bsd\n"); exit (0);
-#else
- printf ("vax-dec-ultrix\n"); exit (0);
-#endif
-#endif
-
-#if defined (alliant) && defined (i860)
- printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
- exit (1);
-}
-EOF
-
-${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy && rm dummy.c dummy && exit 0
-rm -f dummy.c dummy
-
-# Apollos put the system type in the environment.
-
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; }
-
-# Convex versions that predate uname can use getsysinfo(1)
-
-if [ -x /usr/convex/getsysinfo ]
-then
- case `getsysinfo -f cpu_type` in
- c1*)
- echo c1-convex-bsd
- exit 0 ;;
- c2*)
- if getsysinfo -f scalar_acc
- then echo c32-convex-bsd
- else echo c2-convex-bsd
- fi
- exit 0 ;;
- c34*)
- echo c34-convex-bsd
- exit 0 ;;
- c38*)
- echo c38-convex-bsd
- exit 0 ;;
- c4*)
- echo c4-convex-bsd
- exit 0 ;;
- esac
-fi
-
-#echo '(Unable to guess system type)' 1>&2
-
-exit 1
diff --git a/config.sub b/config.sub
deleted file mode 100755
index 213a6d4..0000000
--- a/config.sub
+++ /dev/null
@@ -1,954 +0,0 @@
-#! /bin/sh
-# Configuration validation subroutine script, version 1.1.
-# Copyright (C) 1991, 92, 93, 94, 95, 96, 1997 Free Software Foundation, Inc.
-# This file is (in principle) common to ALL GNU software.
-# The presence of a machine in this file suggests that SOME GNU software
-# can handle that machine. It does not imply ALL GNU software can.
-#
-# This file is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330,
-# Boston, MA 02111-1307, USA.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Configuration subroutine to validate and canonicalize a configuration type.
-# Supply the specified configuration type as an argument.
-# If it is invalid, we print an error message on stderr and exit with code 1.
-# Otherwise, we print the canonical config type on stdout and succeed.
-
-# This file is supposed to be the same for all GNU packages
-# and recognize all the CPU types, system types and aliases
-# that are meaningful with *any* GNU software.
-# Each package is responsible for reporting which valid configurations
-# it does not support. The user should be able to distinguish
-# a failure to support a valid configuration from a meaningless
-# configuration.
-
-# The goal of this file is to map all the various variations of a given
-# machine specification into a single specification in the form:
-# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
-# or in some cases, the newer four-part form:
-# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
-# It is wrong to echo any other type of specification.
-
-if [ x$1 = x ]
-then
- echo Configuration name missing. 1>&2
- echo "Usage: $0 CPU-MFR-OPSYS" 1>&2
- echo "or $0 ALIAS" 1>&2
- echo where ALIAS is a recognized configuration type. 1>&2
- exit 1
-fi
-
-# First pass through any local machine types.
-case $1 in
- *local*)
- echo $1
- exit 0
- ;;
- *)
- ;;
-esac
-
-# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
-# Here we must recognize all the valid KERNEL-OS combinations.
-maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
-case $maybe_os in
- linux-gnu*)
- os=-$maybe_os
- basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
- ;;
- *)
- basic_machine=`echo $1 | sed 's/-[^-]*$//'`
- if [ $basic_machine != $1 ]
- then os=`echo $1 | sed 's/.*-/-/'`
- else os=; fi
- ;;
-esac
-
-### Let's recognize common machines as not being operating systems so
-### that things like config.sub decstation-3100 work. We also
-### recognize some manufacturers as not being operating systems, so we
-### can provide default operating systems below.
-case $os in
- -sun*os*)
- # Prevent following clause from handling this invalid input.
- ;;
- -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
- -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
- -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
- -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
- -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
- -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
- -apple)
- os=
- basic_machine=$1
- ;;
- -hiux*)
- os=-hiuxwe2
- ;;
- -sco5)
- os=sco3.2v5
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
- ;;
- -sco4)
- os=-sco3.2v4
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
- ;;
- -sco3.2.[4-9]*)
- os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
- ;;
- -sco3.2v[4-9]*)
- # Don't forget version if it is 3.2v4 or newer.
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
- ;;
- -sco*)
- os=-sco3.2v2
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
- ;;
- -isc)
- os=-isc2.2
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
- ;;
- -clix*)
- basic_machine=clipper-intergraph
- ;;
- -isc*)
- basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
- ;;
- -lynx*)
- os=-lynxos
- ;;
- -ptx*)
- basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
- ;;
- -windowsnt*)
- os=`echo $os | sed -e 's/windowsnt/winnt/'`
- ;;
- -psos*)
- os=-psos
- ;;
-esac
-
-# Decode aliases for certain CPU-COMPANY combinations.
-case $basic_machine in
- # Recognize the basic CPU types without company name.
- # Some are omitted here because they have special meanings below.
- tahoe | i860 | m32r | m68k | m68000 | m88k | ns32k | arc | arm \
- | arme[lb] | pyramid | mn10200 | mn10300 \
- | tron | a29k | 580 | i960 | h8300 | hppa | hppa1.0 | hppa1.1 \
- | alpha | alphaev5 | alphaev56 | we32k | ns16k | clipper \
- | i370 | sh | powerpc | powerpcle | 1750a | dsp16xx | pdp11 \
- | mips64 | mipsel | mips64el | mips64orion | mips64orionel \
- | mipstx39 | mipstx39el \
- | sparc | sparclet | sparclite | sparc64 | v850)
- basic_machine=$basic_machine-unknown
- ;;
- # We use `pc' rather than `unknown'
- # because (1) that's what they normally are, and
- # (2) the word "unknown" tends to confuse beginning users.
- i[3456]86)
- basic_machine=$basic_machine-pc
- ;;
- # Object if more than one company name word.
- *-*-*)
- echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
- exit 1
- ;;
- # Recognize the basic CPU types with company name.
- vax-* | tahoe-* | i[3456]86-* | i860-* | m32r-* | m68k-* | m68000-* \
- | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | arm-* | c[123]* \
- | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \
- | power-* | none-* | 580-* | cray2-* | h8300-* | i960-* \
- | xmp-* | ymp-* | hppa-* | hppa1.0-* | hppa1.1-* \
- | alpha-* | alphaev5-* | alphaev56-* | we32k-* | cydra-* \
- | ns16k-* | pn-* | np1-* | xps100-* | clipper-* | orion-* \
- | sparclite-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \
- | sparc64-* | mips64-* | mipsel-* \
- | mips64el-* | mips64orion-* | mips64orionel-* \
- | mipstx39-* | mipstx39el-* \
- | f301-*)
- ;;
- # Recognize the various machine names and aliases which stand
- # for a CPU type and a company and sometimes even an OS.
- 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
- basic_machine=m68000-att
- ;;
- 3b*)
- basic_machine=we32k-att
- ;;
- alliant | fx80)
- basic_machine=fx80-alliant
- ;;
- altos | altos3068)
- basic_machine=m68k-altos
- ;;
- am29k)
- basic_machine=a29k-none
- os=-bsd
- ;;
- amdahl)
- basic_machine=580-amdahl
- os=-sysv
- ;;
- amiga | amiga-*)
- basic_machine=m68k-cbm
- ;;
- amigaos | amigados)
- basic_machine=m68k-cbm
- os=-amigaos
- ;;
- amigaunix | amix)
- basic_machine=m68k-cbm
- os=-sysv4
- ;;
- apollo68)
- basic_machine=m68k-apollo
- os=-sysv
- ;;
- aux)
- basic_machine=m68k-apple
- os=-aux
- ;;
- balance)
- basic_machine=ns32k-sequent
- os=-dynix
- ;;
- convex-c1)
- basic_machine=c1-convex
- os=-bsd
- ;;
- convex-c2)
- basic_machine=c2-convex
- os=-bsd
- ;;
- convex-c32)
- basic_machine=c32-convex
- os=-bsd
- ;;
- convex-c34)
- basic_machine=c34-convex
- os=-bsd
- ;;
- convex-c38)
- basic_machine=c38-convex
- os=-bsd
- ;;
- cray | ymp)
- basic_machine=ymp-cray
- os=-unicos
- ;;
- cray2)
- basic_machine=cray2-cray
- os=-unicos
- ;;
- [ctj]90-cray)
- basic_machine=c90-cray
- os=-unicos
- ;;
- crds | unos)
- basic_machine=m68k-crds
- ;;
- da30 | da30-*)
- basic_machine=m68k-da30
- ;;
- decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
- basic_machine=mips-dec
- ;;
- delta | 3300 | motorola-3300 | motorola-delta \
- | 3300-motorola | delta-motorola)
- basic_machine=m68k-motorola
- ;;
- delta88)
- basic_machine=m88k-motorola
- os=-sysv3
- ;;
- dpx20 | dpx20-*)
- basic_machine=rs6000-bull
- os=-bosx
- ;;
- dpx2* | dpx2*-bull)
- basic_machine=m68k-bull
- os=-sysv3
- ;;
- ebmon29k)
- basic_machine=a29k-amd
- os=-ebmon
- ;;
- elxsi)
- basic_machine=elxsi-elxsi
- os=-bsd
- ;;
- encore | umax | mmax)
- basic_machine=ns32k-encore
- ;;
- fx2800)
- basic_machine=i860-alliant
- ;;
- genix)
- basic_machine=ns32k-ns
- ;;
- gmicro)
- basic_machine=tron-gmicro
- os=-sysv
- ;;
- h3050r* | hiux*)
- basic_machine=hppa1.1-hitachi
- os=-hiuxwe2
- ;;
- h8300hms)
- basic_machine=h8300-hitachi
- os=-hms
- ;;
- harris)
- basic_machine=m88k-harris
- os=-sysv3
- ;;
- hp300-*)
- basic_machine=m68k-hp
- ;;
- hp300bsd)
- basic_machine=m68k-hp
- os=-bsd
- ;;
- hp300hpux)
- basic_machine=m68k-hp
- os=-hpux
- ;;
- hp9k2[0-9][0-9] | hp9k31[0-9])
- basic_machine=m68000-hp
- ;;
- hp9k3[2-9][0-9])
- basic_machine=m68k-hp
- ;;
- hp9k7[0-9][0-9] | hp7[0-9][0-9] | hp9k8[0-9]7 | hp8[0-9]7)
- basic_machine=hppa1.1-hp
- ;;
- hp9k8[0-9][0-9] | hp8[0-9][0-9])
- basic_machine=hppa1.0-hp
- ;;
- hppa-next)
- os=-nextstep3
- ;;
- i370-ibm* | ibm*)
- basic_machine=i370-ibm
- os=-mvs
- ;;
-# I'm not sure what "Sysv32" means. Should this be sysv3.2?
- i[3456]86v32)
- basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
- os=-sysv32
- ;;
- i[3456]86v4*)
- basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
- os=-sysv4
- ;;
- i[3456]86v)
- basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
- os=-sysv
- ;;
- i[3456]86sol2)
- basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
- os=-solaris2
- ;;
- iris | iris4d)
- basic_machine=mips-sgi
- case $os in
- -irix*)
- ;;
- *)
- os=-irix4
- ;;
- esac
- ;;
- isi68 | isi)
- basic_machine=m68k-isi
- os=-sysv
- ;;
- m88k-omron*)
- basic_machine=m88k-omron
- ;;
- magnum | m3230)
- basic_machine=mips-mips
- os=-sysv
- ;;
- merlin)
- basic_machine=ns32k-utek
- os=-sysv
- ;;
- miniframe)
- basic_machine=m68000-convergent
- ;;
- mipsel*-linux*)
- basic_machine=mipsel-unknown
- os=-linux-gnu
- ;;
- mips*-linux*)
- basic_machine=mips-unknown
- os=-linux-gnu
- ;;
- mips3*-*)
- basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
- ;;
- mips3*)
- basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
- ;;
- ncr3000)
- basic_machine=i486-ncr
- os=-sysv4
- ;;
- news | news700 | news800 | news900)
- basic_machine=m68k-sony
- os=-newsos
- ;;
- news1000)
- basic_machine=m68030-sony
- os=-newsos
- ;;
- news-3600 | risc-news)
- basic_machine=mips-sony
- os=-newsos
- ;;
- next | m*-next )
- basic_machine=m68k-next
- case $os in
- -nextstep* )
- ;;
- -ns2*)
- os=-nextstep2
- ;;
- *)
- os=-nextstep3
- ;;
- esac
- ;;
- nh3000)
- basic_machine=m68k-harris
- os=-cxux
- ;;
- nh[45]000)
- basic_machine=m88k-harris
- os=-cxux
- ;;
- nindy960)
- basic_machine=i960-intel
- os=-nindy
- ;;
- np1)
- basic_machine=np1-gould
- ;;
- pa-hitachi)
- basic_machine=hppa1.1-hitachi
- os=-hiuxwe2
- ;;
- paragon)
- basic_machine=i860-intel
- os=-osf
- ;;
- pbd)
- basic_machine=sparc-tti
- ;;
- pbb)
- basic_machine=m68k-tti
- ;;
- pc532 | pc532-*)
- basic_machine=ns32k-pc532
- ;;
- pentium | p5)
- basic_machine=i586-intel
- ;;
- pentiumpro | p6)
- basic_machine=i686-intel
- ;;
- pentium-* | p5-*)
- basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
- ;;
- pentiumpro-* | p6-*)
- basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
- ;;
- k5)
- # We don't have specific support for AMD's K5 yet, so just call it a Pentium
- basic_machine=i586-amd
- ;;
- nexen)
- # We don't have specific support for Nexgen yet, so just call it a Pentium
- basic_machine=i586-nexgen
- ;;
- pn)
- basic_machine=pn-gould
- ;;
- power) basic_machine=rs6000-ibm
- ;;
- ppc) basic_machine=powerpc-unknown
- ;;
- ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
- ;;
- ppcle | powerpclittle | ppc-le | powerpc-little)
- basic_machine=powerpcle-unknown
- ;;
- ppcle-* | powerpclittle-*)
- basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
- ;;
- ps2)
- basic_machine=i386-ibm
- ;;
- rm[46]00)
- basic_machine=mips-siemens
- ;;
- rtpc | rtpc-*)
- basic_machine=romp-ibm
- ;;
- sequent)
- basic_machine=i386-sequent
- ;;
- sh)
- basic_machine=sh-hitachi
- os=-hms
- ;;
- sps7)
- basic_machine=m68k-bull
- os=-sysv2
- ;;
- spur)
- basic_machine=spur-unknown
- ;;
- sun2)
- basic_machine=m68000-sun
- ;;
- sun2os3)
- basic_machine=m68000-sun
- os=-sunos3
- ;;
- sun2os4)
- basic_machine=m68000-sun
- os=-sunos4
- ;;
- sun3os3)
- basic_machine=m68k-sun
- os=-sunos3
- ;;
- sun3os4)
- basic_machine=m68k-sun
- os=-sunos4
- ;;
- sun4os3)
- basic_machine=sparc-sun
- os=-sunos3
- ;;
- sun4os4)
- basic_machine=sparc-sun
- os=-sunos4
- ;;
- sun4sol2)
- basic_machine=sparc-sun
- os=-solaris2
- ;;
- sun3 | sun3-*)
- basic_machine=m68k-sun
- ;;
- sun4)
- basic_machine=sparc-sun
- ;;
- sun386 | sun386i | roadrunner)
- basic_machine=i386-sun
- ;;
- symmetry)
- basic_machine=i386-sequent
- os=-dynix
- ;;
- tx39)
- basic_machine=mipstx39-unknown
- ;;
- tx39el)
- basic_machine=mipstx39el-unknown
- ;;
- tower | tower-32)
- basic_machine=m68k-ncr
- ;;
- udi29k)
- basic_machine=a29k-amd
- os=-udi
- ;;
- ultra3)
- basic_machine=a29k-nyu
- os=-sym1
- ;;
- vaxv)
- basic_machine=vax-dec
- os=-sysv
- ;;
- vms)
- basic_machine=vax-dec
- os=-vms
- ;;
- vpp*|vx|vx-*)
- basic_machine=f301-fujitsu
- ;;
- vxworks960)
- basic_machine=i960-wrs
- os=-vxworks
- ;;
- vxworks68)
- basic_machine=m68k-wrs
- os=-vxworks
- ;;
- vxworks29k)
- basic_machine=a29k-wrs
- os=-vxworks
- ;;
- xmp)
- basic_machine=xmp-cray
- os=-unicos
- ;;
- xps | xps100)
- basic_machine=xps100-honeywell
- ;;
- none)
- basic_machine=none-none
- os=-none
- ;;
-
-# Here we handle the default manufacturer of certain CPU types. It is in
-# some cases the only manufacturer, in others, it is the most popular.
- mips)
- if [ x$os = x-linux-gnu ]; then
- basic_machine=mips-unknown
- else
- basic_machine=mips-mips
- fi
- ;;
- romp)
- basic_machine=romp-ibm
- ;;
- rs6000)
- basic_machine=rs6000-ibm
- ;;
- vax)
- basic_machine=vax-dec
- ;;
- pdp11)
- basic_machine=pdp11-dec
- ;;
- we32k)
- basic_machine=we32k-att
- ;;
- sparc)
- basic_machine=sparc-sun
- ;;
- cydra)
- basic_machine=cydra-cydrome
- ;;
- orion)
- basic_machine=orion-highlevel
- ;;
- orion105)
- basic_machine=clipper-highlevel
- ;;
- *)
- echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
- exit 1
- ;;
-esac
-
-# Here we canonicalize certain aliases for manufacturers.
-case $basic_machine in
- *-digital*)
- basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
- ;;
- *-commodore*)
- basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
- ;;
- *)
- ;;
-esac
-
-# Decode manufacturer-specific aliases for certain operating systems.
-
-if [ x"$os" != x"" ]
-then
-case $os in
- # First match some system type aliases
- # that might get confused with valid system types.
- # -solaris* is a basic system type, with this one exception.
- -solaris1 | -solaris1.*)
- os=`echo $os | sed -e 's|solaris1|sunos4|'`
- ;;
- -solaris)
- os=-solaris2
- ;;
- -svr4*)
- os=-sysv4
- ;;
- -unixware*)
- os=-sysv4.2uw
- ;;
- -gnu/linux*)
- os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
- ;;
- # First accept the basic system types.
- # The portable systems comes first.
- # Each alternative MUST END IN A *, to match a version number.
- # -sysv* is not here because it comes later, after sysvr4.
- -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
- | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
- | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
- | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
- | -aos* \
- | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
- | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
- | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
- | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* \
- | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
- | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
- | -cygwin32* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
- | -mingw32* | -linux-gnu* | -uxpv*)
- # Remember, each alternative MUST END IN *, to match a version number.
- ;;
- -linux*)
- os=`echo $os | sed -e 's|linux|linux-gnu|'`
- ;;
- -sunos5*)
- os=`echo $os | sed -e 's|sunos5|solaris2|'`
- ;;
- -sunos6*)
- os=`echo $os | sed -e 's|sunos6|solaris3|'`
- ;;
- -osfrose*)
- os=-osfrose
- ;;
- -osf*)
- os=-osf
- ;;
- -utek*)
- os=-bsd
- ;;
- -dynix*)
- os=-bsd
- ;;
- -acis*)
- os=-aos
- ;;
- -ctix* | -uts*)
- os=-sysv
- ;;
- -ns2 )
- os=-nextstep2
- ;;
- # Preserve the version number of sinix5.
- -sinix5.*)
- os=`echo $os | sed -e 's|sinix|sysv|'`
- ;;
- -sinix*)
- os=-sysv4
- ;;
- -triton*)
- os=-sysv3
- ;;
- -oss*)
- os=-sysv3
- ;;
- -svr4)
- os=-sysv4
- ;;
- -svr3)
- os=-sysv3
- ;;
- -sysvr4)
- os=-sysv4
- ;;
- # This must come after -sysvr4.
- -sysv*)
- ;;
- -xenix)
- os=-xenix
- ;;
- -none)
- ;;
- *)
- # Get rid of the `-' at the beginning of $os.
- os=`echo $os | sed 's/[^-]*-//'`
- echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
- exit 1
- ;;
-esac
-else
-
-# Here we handle the default operating systems that come with various machines.
-# The value should be what the vendor currently ships out the door with their
-# machine or put another way, the most popular os provided with the machine.
-
-# Note that if you're going to try to match "-MANUFACTURER" here (say,
-# "-sun"), then you have to tell the case statement up towards the top
-# that MANUFACTURER isn't an operating system. Otherwise, code above
-# will signal an error saying that MANUFACTURER isn't an operating
-# system, and we'll never get to this point.
-
-case $basic_machine in
- *-acorn)
- os=-riscix1.2
- ;;
- arm*-semi)
- os=-aout
- ;;
- pdp11-*)
- os=-none
- ;;
- *-dec | vax-*)
- os=-ultrix4.2
- ;;
- m68*-apollo)
- os=-domain
- ;;
- i386-sun)
- os=-sunos4.0.2
- ;;
- m68000-sun)
- os=-sunos3
- # This also exists in the configure program, but was not the
- # default.
- # os=-sunos4
- ;;
- *-tti) # must be before sparc entry or we get the wrong os.
- os=-sysv3
- ;;
- sparc-* | *-sun)
- os=-sunos4.1.1
- ;;
- *-ibm)
- os=-aix
- ;;
- *-hp)
- os=-hpux
- ;;
- *-hitachi)
- os=-hiux
- ;;
- i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
- os=-sysv
- ;;
- *-cbm)
- os=-amigaos
- ;;
- *-dg)
- os=-dgux
- ;;
- *-dolphin)
- os=-sysv3
- ;;
- m68k-ccur)
- os=-rtu
- ;;
- m88k-omron*)
- os=-luna
- ;;
- *-next )
- os=-nextstep
- ;;
- *-sequent)
- os=-ptx
- ;;
- *-crds)
- os=-unos
- ;;
- *-ns)
- os=-genix
- ;;
- i370-*)
- os=-mvs
- ;;
- *-next)
- os=-nextstep3
- ;;
- *-gould)
- os=-sysv
- ;;
- *-highlevel)
- os=-bsd
- ;;
- *-encore)
- os=-bsd
- ;;
- *-sgi)
- os=-irix
- ;;
- *-siemens)
- os=-sysv4
- ;;
- *-masscomp)
- os=-rtu
- ;;
- f301-fujitsu)
- os=-uxpv
- ;;
- *)
- os=-none
- ;;
-esac
-fi
-
-# Here we handle the case where we know the os, and the CPU type, but not the
-# manufacturer. We pick the logical manufacturer.
-vendor=unknown
-case $basic_machine in
- *-unknown)
- case $os in
- -riscix*)
- vendor=acorn
- ;;
- -sunos*)
- vendor=sun
- ;;
- -aix*)
- vendor=ibm
- ;;
- -hpux*)
- vendor=hp
- ;;
- -hiux*)
- vendor=hitachi
- ;;
- -unos*)
- vendor=crds
- ;;
- -dgux*)
- vendor=dg
- ;;
- -luna*)
- vendor=omron
- ;;
- -genix*)
- vendor=ns
- ;;
- -mvs*)
- vendor=ibm
- ;;
- -ptx*)
- vendor=sequent
- ;;
- -vxsim* | -vxworks*)
- vendor=wrs
- ;;
- -aux*)
- vendor=apple
- ;;
- esac
- basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
- ;;
-esac
-
-echo $basic_machine$os
diff --git a/configure b/configure
deleted file mode 100755
index 35c9db5..0000000
--- a/configure
+++ /dev/null
@@ -1,2011 +0,0 @@
-#! /bin/sh
-
-# Guess values for system-dependent variables and create Makefiles.
-# Generated automatically using autoconf version 2.12
-# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc.
-#
-# This configure script is free software; the Free Software Foundation
-# gives unlimited permission to copy, distribute and modify it.
-
-# Defaults:
-ac_help=
-ac_default_prefix=/usr/local
-# Any additions from configure.in:
-ac_help="$ac_help
- --enable-shared build shared library using GNU libtool"
-ac_help="$ac_help
- --enable-static build static library using GNU libtool"
-ac_help="$ac_help
- --enable-maxmem[=N] enable use of temp files, set max mem usage to N MB"
-ac_help="$ac_help
-"
-
-# Initialize some variables set by options.
-# The variables have the same names as the options, with
-# dashes changed to underlines.
-build=NONE
-cache_file=./config.cache
-exec_prefix=NONE
-host=NONE
-no_create=
-nonopt=NONE
-no_recursion=
-prefix=NONE
-program_prefix=NONE
-program_suffix=NONE
-program_transform_name=s,x,x,
-silent=
-site=
-srcdir=
-target=NONE
-verbose=
-x_includes=NONE
-x_libraries=NONE
-bindir='${exec_prefix}/bin'
-sbindir='${exec_prefix}/sbin'
-libexecdir='${exec_prefix}/libexec'
-datadir='${prefix}/share'
-sysconfdir='${prefix}/etc'
-sharedstatedir='${prefix}/com'
-localstatedir='${prefix}/var'
-libdir='${exec_prefix}/lib'
-includedir='${prefix}/include'
-oldincludedir='/usr/include'
-infodir='${prefix}/info'
-mandir='${prefix}/man'
-
-# Initialize some other variables.
-subdirs=
-MFLAGS= MAKEFLAGS=
-# Maximum number of lines to put in a shell here document.
-ac_max_here_lines=12
-
-ac_prev=
-for ac_option
-do
-
- # If the previous option needs an argument, assign it.
- if test -n "$ac_prev"; then
- eval "$ac_prev=\$ac_option"
- ac_prev=
- continue
- fi
-
- case "$ac_option" in
- -*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
- *) ac_optarg= ;;
- esac
-
- # Accept the important Cygnus configure options, so we can diagnose typos.
-
- case "$ac_option" in
-
- -bindir | --bindir | --bindi | --bind | --bin | --bi)
- ac_prev=bindir ;;
- -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
- bindir="$ac_optarg" ;;
-
- -build | --build | --buil | --bui | --bu)
- ac_prev=build ;;
- -build=* | --build=* | --buil=* | --bui=* | --bu=*)
- build="$ac_optarg" ;;
-
- -cache-file | --cache-file | --cache-fil | --cache-fi \
- | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
- ac_prev=cache_file ;;
- -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
- | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
- cache_file="$ac_optarg" ;;
-
- -datadir | --datadir | --datadi | --datad | --data | --dat | --da)
- ac_prev=datadir ;;
- -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \
- | --da=*)
- datadir="$ac_optarg" ;;
-
- -disable-* | --disable-*)
- ac_feature=`echo $ac_option|sed -e 's/-*disable-//'`
- # Reject names that are not valid shell variable names.
- if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then
- { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; }
- fi
- ac_feature=`echo $ac_feature| sed 's/-/_/g'`
- eval "enable_${ac_feature}=no" ;;
-
- -enable-* | --enable-*)
- ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'`
- # Reject names that are not valid shell variable names.
- if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then
- { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; }
- fi
- ac_feature=`echo $ac_feature| sed 's/-/_/g'`
- case "$ac_option" in
- *=*) ;;
- *) ac_optarg=yes ;;
- esac
- eval "enable_${ac_feature}='$ac_optarg'" ;;
-
- -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
- | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
- | --exec | --exe | --ex)
- ac_prev=exec_prefix ;;
- -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
- | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
- | --exec=* | --exe=* | --ex=*)
- exec_prefix="$ac_optarg" ;;
-
- -gas | --gas | --ga | --g)
- # Obsolete; use --with-gas.
- with_gas=yes ;;
-
- -help | --help | --hel | --he)
- # Omit some internal or obsolete options to make the list less imposing.
- # This message is too long to be a string in the A/UX 3.1 sh.
- cat << EOF
-Usage: configure [options] [host]
-Options: [defaults in brackets after descriptions]
-Configuration:
- --cache-file=FILE cache test results in FILE
- --help print this message
- --no-create do not create output files
- --quiet, --silent do not print \`checking...' messages
- --version print the version of autoconf that created configure
-Directory and file names:
- --prefix=PREFIX install architecture-independent files in PREFIX
- [$ac_default_prefix]
- --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
- [same as prefix]
- --bindir=DIR user executables in DIR [EPREFIX/bin]
- --sbindir=DIR system admin executables in DIR [EPREFIX/sbin]
- --libexecdir=DIR program executables in DIR [EPREFIX/libexec]
- --datadir=DIR read-only architecture-independent data in DIR
- [PREFIX/share]
- --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc]
- --sharedstatedir=DIR modifiable architecture-independent data in DIR
- [PREFIX/com]
- --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var]
- --libdir=DIR object code libraries in DIR [EPREFIX/lib]
- --includedir=DIR C header files in DIR [PREFIX/include]
- --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include]
- --infodir=DIR info documentation in DIR [PREFIX/info]
- --mandir=DIR man documentation in DIR [PREFIX/man]
- --srcdir=DIR find the sources in DIR [configure dir or ..]
- --program-prefix=PREFIX prepend PREFIX to installed program names
- --program-suffix=SUFFIX append SUFFIX to installed program names
- --program-transform-name=PROGRAM
- run sed PROGRAM on installed program names
-EOF
- cat << EOF
-Host type:
- --build=BUILD configure for building on BUILD [BUILD=HOST]
- --host=HOST configure for HOST [guessed]
- --target=TARGET configure for TARGET [TARGET=HOST]
-Features and packages:
- --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no)
- --enable-FEATURE[=ARG] include FEATURE [ARG=yes]
- --with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
- --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
- --x-includes=DIR X include files are in DIR
- --x-libraries=DIR X library files are in DIR
-EOF
- if test -n "$ac_help"; then
- echo "--enable and --with options recognized:$ac_help"
- fi
- exit 0 ;;
-
- -host | --host | --hos | --ho)
- ac_prev=host ;;
- -host=* | --host=* | --hos=* | --ho=*)
- host="$ac_optarg" ;;
-
- -includedir | --includedir | --includedi | --included | --include \
- | --includ | --inclu | --incl | --inc)
- ac_prev=includedir ;;
- -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
- | --includ=* | --inclu=* | --incl=* | --inc=*)
- includedir="$ac_optarg" ;;
-
- -infodir | --infodir | --infodi | --infod | --info | --inf)
- ac_prev=infodir ;;
- -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
- infodir="$ac_optarg" ;;
-
- -libdir | --libdir | --libdi | --libd)
- ac_prev=libdir ;;
- -libdir=* | --libdir=* | --libdi=* | --libd=*)
- libdir="$ac_optarg" ;;
-
- -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
- | --libexe | --libex | --libe)
- ac_prev=libexecdir ;;
- -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
- | --libexe=* | --libex=* | --libe=*)
- libexecdir="$ac_optarg" ;;
-
- -localstatedir | --localstatedir | --localstatedi | --localstated \
- | --localstate | --localstat | --localsta | --localst \
- | --locals | --local | --loca | --loc | --lo)
- ac_prev=localstatedir ;;
- -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
- | --localstate=* | --localstat=* | --localsta=* | --localst=* \
- | --locals=* | --local=* | --loca=* | --loc=* | --lo=*)
- localstatedir="$ac_optarg" ;;
-
- -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
- ac_prev=mandir ;;
- -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
- mandir="$ac_optarg" ;;
-
- -nfp | --nfp | --nf)
- # Obsolete; use --without-fp.
- with_fp=no ;;
-
- -no-create | --no-create | --no-creat | --no-crea | --no-cre \
- | --no-cr | --no-c)
- no_create=yes ;;
-
- -no-recursion | --no-recursion | --no-recursio | --no-recursi \
- | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
- no_recursion=yes ;;
-
- -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
- | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
- | --oldin | --oldi | --old | --ol | --o)
- ac_prev=oldincludedir ;;
- -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
- | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
- | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
- oldincludedir="$ac_optarg" ;;
-
- -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
- ac_prev=prefix ;;
- -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
- prefix="$ac_optarg" ;;
-
- -program-prefix | --program-prefix | --program-prefi | --program-pref \
- | --program-pre | --program-pr | --program-p)
- ac_prev=program_prefix ;;
- -program-prefix=* | --program-prefix=* | --program-prefi=* \
- | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
- program_prefix="$ac_optarg" ;;
-
- -program-suffix | --program-suffix | --program-suffi | --program-suff \
- | --program-suf | --program-su | --program-s)
- ac_prev=program_suffix ;;
- -program-suffix=* | --program-suffix=* | --program-suffi=* \
- | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
- program_suffix="$ac_optarg" ;;
-
- -program-transform-name | --program-transform-name \
- | --program-transform-nam | --program-transform-na \
- | --program-transform-n | --program-transform- \
- | --program-transform | --program-transfor \
- | --program-transfo | --program-transf \
- | --program-trans | --program-tran \
- | --progr-tra | --program-tr | --program-t)
- ac_prev=program_transform_name ;;
- -program-transform-name=* | --program-transform-name=* \
- | --program-transform-nam=* | --program-transform-na=* \
- | --program-transform-n=* | --program-transform-=* \
- | --program-transform=* | --program-transfor=* \
- | --program-transfo=* | --program-transf=* \
- | --program-trans=* | --program-tran=* \
- | --progr-tra=* | --program-tr=* | --program-t=*)
- program_transform_name="$ac_optarg" ;;
-
- -q | -quiet | --quiet | --quie | --qui | --qu | --q \
- | -silent | --silent | --silen | --sile | --sil)
- silent=yes ;;
-
- -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
- ac_prev=sbindir ;;
- -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
- | --sbi=* | --sb=*)
- sbindir="$ac_optarg" ;;
-
- -sharedstatedir | --sharedstatedir | --sharedstatedi \
- | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
- | --sharedst | --shareds | --shared | --share | --shar \
- | --sha | --sh)
- ac_prev=sharedstatedir ;;
- -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
- | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
- | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
- | --sha=* | --sh=*)
- sharedstatedir="$ac_optarg" ;;
-
- -site | --site | --sit)
- ac_prev=site ;;
- -site=* | --site=* | --sit=*)
- site="$ac_optarg" ;;
-
- -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
- ac_prev=srcdir ;;
- -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
- srcdir="$ac_optarg" ;;
-
- -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
- | --syscon | --sysco | --sysc | --sys | --sy)
- ac_prev=sysconfdir ;;
- -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
- | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
- sysconfdir="$ac_optarg" ;;
-
- -target | --target | --targe | --targ | --tar | --ta | --t)
- ac_prev=target ;;
- -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
- target="$ac_optarg" ;;
-
- -v | -verbose | --verbose | --verbos | --verbo | --verb)
- verbose=yes ;;
-
- -version | --version | --versio | --versi | --vers)
- echo "configure generated by autoconf version 2.12"
- exit 0 ;;
-
- -with-* | --with-*)
- ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'`
- # Reject names that are not valid shell variable names.
- if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then
- { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; }
- fi
- ac_package=`echo $ac_package| sed 's/-/_/g'`
- case "$ac_option" in
- *=*) ;;
- *) ac_optarg=yes ;;
- esac
- eval "with_${ac_package}='$ac_optarg'" ;;
-
- -without-* | --without-*)
- ac_package=`echo $ac_option|sed -e 's/-*without-//'`
- # Reject names that are not valid shell variable names.
- if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then
- { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; }
- fi
- ac_package=`echo $ac_package| sed 's/-/_/g'`
- eval "with_${ac_package}=no" ;;
-
- --x)
- # Obsolete; use --with-x.
- with_x=yes ;;
-
- -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
- | --x-incl | --x-inc | --x-in | --x-i)
- ac_prev=x_includes ;;
- -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
- | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
- x_includes="$ac_optarg" ;;
-
- -x-libraries | --x-libraries | --x-librarie | --x-librari \
- | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
- ac_prev=x_libraries ;;
- -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
- | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
- x_libraries="$ac_optarg" ;;
-
- -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; }
- ;;
-
- *=*)
- varname=`echo "$ac_option"|sed -e 's/=.*//'`
- # Reject names that aren't valid shell variable names.
- if test -n "`echo $varname| sed 's/[a-zA-Z0-9_]//g'`"; then
- { echo "configure: error: $varname: invalid shell variable name" 1>&2; exit 1; }
- fi
- val="`echo "$ac_option"|sed 's/[^=]*=//'`"
- test -n "$verbose" && echo " setting shell variable $varname to $val"
- eval "$varname='$val'"
- eval "export $varname" ;;
-
- *)
- if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then
- echo "configure: warning: $ac_option: invalid host type" 1>&2
- fi
- if test "x$nonopt" != xNONE; then
- { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; }
- fi
- nonopt="$ac_option"
- ;;
-
- esac
-done
-
-if test -n "$ac_prev"; then
- { echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; }
-fi
-
-trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
-
-# File descriptor usage:
-# 0 standard input
-# 1 file creation
-# 2 errors and warnings
-# 3 some systems may open it to /dev/tty
-# 4 used on the Kubota Titan
-# 6 checking for... messages and results
-# 5 compiler messages saved in config.log
-if test "$silent" = yes; then
- exec 6>/dev/null
-else
- exec 6>&1
-fi
-exec 5>./config.log
-
-echo "\
-This file contains any messages produced by compilers while
-running configure, to aid debugging if configure makes a mistake.
-" 1>&5
-
-# Strip out --no-create and --no-recursion so they do not pile up.
-# Also quote any args containing shell metacharacters.
-ac_configure_args=
-for ac_arg
-do
- case "$ac_arg" in
- -no-create | --no-create | --no-creat | --no-crea | --no-cre \
- | --no-cr | --no-c) ;;
- -no-recursion | --no-recursion | --no-recursio | --no-recursi \
- | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;;
- *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
- ac_configure_args="$ac_configure_args '$ac_arg'" ;;
- *) ac_configure_args="$ac_configure_args $ac_arg" ;;
- esac
-done
-
-# NLS nuisances.
-# Only set these to C if already set. These must not be set unconditionally
-# because not all systems understand e.g. LANG=C (notably SCO).
-# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'!
-# Non-C LC_CTYPE values break the ctype check.
-if test "${LANG+set}" = set; then LANG=C; export LANG; fi
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi
-if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi
-
-# confdefs.h avoids OS command line length limits that DEFS can exceed.
-rm -rf conftest* confdefs.h
-# AIX cpp loses on an empty file, so make sure it contains at least a newline.
-echo > confdefs.h
-
-# A filename unique to this package, relative to the directory that
-# configure is in, which we can look for to find out if srcdir is correct.
-ac_unique_file=jcmaster.c
-
-# Find the source files, if location was not specified.
-if test -z "$srcdir"; then
- ac_srcdir_defaulted=yes
- # Try the directory containing this script, then its parent.
- ac_prog=$0
- ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'`
- test "x$ac_confdir" = "x$ac_prog" && ac_confdir=.
- srcdir=$ac_confdir
- if test ! -r $srcdir/$ac_unique_file; then
- srcdir=..
- fi
-else
- ac_srcdir_defaulted=no
-fi
-if test ! -r $srcdir/$ac_unique_file; then
- if test "$ac_srcdir_defaulted" = yes; then
- { echo "configure: error: can not find sources in $ac_confdir or .." 1>&2; exit 1; }
- else
- { echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; }
- fi
-fi
-srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'`
-
-# Prefer explicitly selected file to automatically selected ones.
-if test -z "$CONFIG_SITE"; then
- if test "x$prefix" != xNONE; then
- CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site"
- else
- CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
- fi
-fi
-for ac_site_file in $CONFIG_SITE; do
- if test -r "$ac_site_file"; then
- echo "loading site script $ac_site_file"
- . "$ac_site_file"
- fi
-done
-
-
-ac_ext=c
-# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
-cross_compiling=$ac_cv_prog_cc_cross
-
-if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
- # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
- if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
- ac_n= ac_c='
-' ac_t=' '
- else
- ac_n=-n ac_c= ac_t=
- fi
-else
- ac_n= ac_c='\c' ac_t=
-fi
-
-
-
-# Extract the first word of "gcc", so it can be a program name with args.
-set dummy gcc; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:538: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- if test -n "$CC"; then
- ac_cv_prog_CC="$CC" # Let the user override the test.
-else
- IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:"
- for ac_dir in $PATH; do
- test -z "$ac_dir" && ac_dir=.
- if test -f $ac_dir/$ac_word; then
- ac_cv_prog_CC="gcc"
- break
- fi
- done
- IFS="$ac_save_ifs"
-fi
-fi
-CC="$ac_cv_prog_CC"
-if test -n "$CC"; then
- echo "$ac_t""$CC" 1>&6
-else
- echo "$ac_t""no" 1>&6
-fi
-
-if test -z "$CC"; then
- # Extract the first word of "cc", so it can be a program name with args.
-set dummy cc; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:567: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- if test -n "$CC"; then
- ac_cv_prog_CC="$CC" # Let the user override the test.
-else
- IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:"
- ac_prog_rejected=no
- for ac_dir in $PATH; do
- test -z "$ac_dir" && ac_dir=.
- if test -f $ac_dir/$ac_word; then
- if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then
- ac_prog_rejected=yes
- continue
- fi
- ac_cv_prog_CC="cc"
- break
- fi
- done
- IFS="$ac_save_ifs"
-if test $ac_prog_rejected = yes; then
- # We found a bogon in the path, so make sure we never use it.
- set dummy $ac_cv_prog_CC
- shift
- if test $# -gt 0; then
- # We chose a different compiler from the bogus one.
- # However, it has the same basename, so the bogon will be chosen
- # first if we set CC to just the basename; use the full file name.
- shift
- set dummy "$ac_dir/$ac_word" "$@"
- shift
- ac_cv_prog_CC="$@"
- fi
-fi
-fi
-fi
-CC="$ac_cv_prog_CC"
-if test -n "$CC"; then
- echo "$ac_t""$CC" 1>&6
-else
- echo "$ac_t""no" 1>&6
-fi
-
- test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; }
-fi
-
-echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:615: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
-
-ac_ext=c
-# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
-cross_compiling=$ac_cv_prog_cc_cross
-
-cat > conftest.$ac_ext <<EOF
-#line 625 "configure"
-#include "confdefs.h"
-main(){return(0);}
-EOF
-if { (eval echo configure:629: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
- ac_cv_prog_cc_works=yes
- # If we can't run a trivial program, we are probably using a cross compiler.
- if (./conftest; exit) 2>/dev/null; then
- ac_cv_prog_cc_cross=no
- else
- ac_cv_prog_cc_cross=yes
- fi
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- ac_cv_prog_cc_works=no
-fi
-rm -fr conftest*
-
-echo "$ac_t""$ac_cv_prog_cc_works" 1>&6
-if test $ac_cv_prog_cc_works = no; then
- { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; }
-fi
-echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:649: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
-echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6
-cross_compiling=$ac_cv_prog_cc_cross
-
-echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6
-echo "configure:654: checking whether we are using GNU C" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- cat > conftest.c <<EOF
-#ifdef __GNUC__
- yes;
-#endif
-EOF
-if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:663: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
- ac_cv_prog_gcc=yes
-else
- ac_cv_prog_gcc=no
-fi
-fi
-
-echo "$ac_t""$ac_cv_prog_gcc" 1>&6
-
-if test $ac_cv_prog_gcc = yes; then
- GCC=yes
- test "${CFLAGS+set}" = set || CFLAGS="-O2"
-else
- GCC=
- test "${CFLAGS+set}" = set || CFLAGS="-O"
-fi
-
-echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:681: checking how to run the C preprocessor" >&5
-# On Suns, sometimes $CPP names a directory.
-if test -n "$CPP" && test -d "$CPP"; then
- CPP=
-fi
-if test -z "$CPP"; then
-if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- # This must be in double quotes, not single quotes, because CPP may get
- # substituted into the Makefile and "${CC-cc}" will confuse make.
- CPP="${CC-cc} -E"
- # On the NeXT, cc -E runs the code through the compiler's parser,
- # not just through cpp.
- cat > conftest.$ac_ext <<EOF
-#line 696 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:702: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
- :
-else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- CPP="${CC-cc} -E -traditional-cpp"
- cat > conftest.$ac_ext <<EOF
-#line 713 "configure"
-#include "confdefs.h"
-#include <assert.h>
-Syntax Error
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:719: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
- :
-else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- CPP=/lib/cpp
-fi
-rm -f conftest*
-fi
-rm -f conftest*
- ac_cv_prog_CPP="$CPP"
-fi
- CPP="$ac_cv_prog_CPP"
-else
- ac_cv_prog_CPP="$CPP"
-fi
-echo "$ac_t""$CPP" 1>&6
-
-echo $ac_n "checking for function prototypes""... $ac_c" 1>&6
-echo "configure:742: checking for function prototypes" >&5
-if eval "test \"`echo '$''{'ijg_cv_have_prototypes'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 747 "configure"
-#include "confdefs.h"
-
-int testfunction (int arg1, int * arg2); /* check prototypes */
-struct methods_struct { /* check method-pointer declarations */
- int (*error_exit) (char *msgtext);
- int (*trace_message) (char *msgtext);
- int (*another_method) (void);
-};
-int testfunction (int arg1, int * arg2) /* check definitions */
-{ return arg2[arg1]; }
-int test2function (void) /* check void arg list */
-{ return 0; }
-
-int main() {
-
-; return 0; }
-EOF
-if { (eval echo configure:765: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- ijg_cv_have_prototypes=yes
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- ijg_cv_have_prototypes=no
-fi
-rm -f conftest*
-fi
-
-echo "$ac_t""$ijg_cv_have_prototypes" 1>&6
-if test $ijg_cv_have_prototypes = yes; then
- cat >> confdefs.h <<\EOF
-#define HAVE_PROTOTYPES
-EOF
-
-else
- echo Your compiler does not seem to know about function prototypes.
- echo Perhaps it needs a special switch to enable ANSI C mode.
- echo If so, we recommend running configure like this:
- echo " ./configure CC='cc -switch'"
- echo where -switch is the proper switch.
-fi
-ac_safe=`echo "stddef.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for stddef.h""... $ac_c" 1>&6
-echo "configure:792: checking for stddef.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 797 "configure"
-#include "confdefs.h"
-#include <stddef.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:802: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=yes"
-else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
- echo "$ac_t""yes" 1>&6
- cat >> confdefs.h <<\EOF
-#define HAVE_STDDEF_H
-EOF
-
-else
- echo "$ac_t""no" 1>&6
-fi
-
-ac_safe=`echo "stdlib.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for stdlib.h""... $ac_c" 1>&6
-echo "configure:828: checking for stdlib.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 833 "configure"
-#include "confdefs.h"
-#include <stdlib.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:838: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=yes"
-else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
- echo "$ac_t""yes" 1>&6
- cat >> confdefs.h <<\EOF
-#define HAVE_STDLIB_H
-EOF
-
-else
- echo "$ac_t""no" 1>&6
-fi
-
-ac_safe=`echo "string.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for string.h""... $ac_c" 1>&6
-echo "configure:864: checking for string.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 869 "configure"
-#include "confdefs.h"
-#include <string.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:874: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=yes"
-else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
- echo "$ac_t""yes" 1>&6
- :
-else
- echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define NEED_BSD_STRINGS
-EOF
-
-fi
-
-echo $ac_n "checking for size_t""... $ac_c" 1>&6
-echo "configure:900: checking for size_t" >&5
-cat > conftest.$ac_ext <<EOF
-#line 902 "configure"
-#include "confdefs.h"
-
-#ifdef HAVE_STDDEF_H
-#include <stddef.h>
-#endif
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-#include <stdio.h>
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#else
-#include <string.h>
-#endif
-typedef size_t my_size_t;
-
-int main() {
- my_size_t foovar;
-; return 0; }
-EOF
-if { (eval echo configure:923: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- ijg_size_t_ok=yes
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- ijg_size_t_ok="not ANSI, perhaps it is in sys/types.h"
-fi
-rm -f conftest*
-echo "$ac_t""$ijg_size_t_ok" 1>&6
-if test "$ijg_size_t_ok" != yes; then
-ac_safe=`echo "sys/types.h" | sed 'y%./+-%__p_%'`
-echo $ac_n "checking for sys/types.h""... $ac_c" 1>&6
-echo "configure:937: checking for sys/types.h" >&5
-if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 942 "configure"
-#include "confdefs.h"
-#include <sys/types.h>
-EOF
-ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:947: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
-if test -z "$ac_err"; then
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=yes"
-else
- echo "$ac_err" >&5
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- eval "ac_cv_header_$ac_safe=no"
-fi
-rm -f conftest*
-fi
-if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
- echo "$ac_t""yes" 1>&6
- cat >> confdefs.h <<\EOF
-#define NEED_SYS_TYPES_H
-EOF
-
-cat > conftest.$ac_ext <<EOF
-#line 968 "configure"
-#include "confdefs.h"
-#include <sys/types.h>
-EOF
-if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
- egrep "size_t" >/dev/null 2>&1; then
- rm -rf conftest*
- ijg_size_t_ok="size_t is in sys/types.h"
-else
- rm -rf conftest*
- ijg_size_t_ok=no
-fi
-rm -f conftest*
-
-else
- echo "$ac_t""no" 1>&6
-ijg_size_t_ok=no
-fi
-
-echo "$ac_t""$ijg_size_t_ok" 1>&6
-if test "$ijg_size_t_ok" = no; then
- echo Type size_t is not defined in any of the usual places.
- echo Try putting '"typedef unsigned int size_t;"' in jconfig.h.
-fi
-fi
-echo $ac_n "checking for type unsigned char""... $ac_c" 1>&6
-echo "configure:994: checking for type unsigned char" >&5
-cat > conftest.$ac_ext <<EOF
-#line 996 "configure"
-#include "confdefs.h"
-
-int main() {
- unsigned char un_char;
-; return 0; }
-EOF
-if { (eval echo configure:1003: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- echo "$ac_t""yes" 1>&6
-cat >> confdefs.h <<\EOF
-#define HAVE_UNSIGNED_CHAR
-EOF
-
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- echo "$ac_t""no" 1>&6
-fi
-rm -f conftest*
-echo $ac_n "checking for type unsigned short""... $ac_c" 1>&6
-echo "configure:1018: checking for type unsigned short" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1020 "configure"
-#include "confdefs.h"
-
-int main() {
- unsigned short un_short;
-; return 0; }
-EOF
-if { (eval echo configure:1027: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- echo "$ac_t""yes" 1>&6
-cat >> confdefs.h <<\EOF
-#define HAVE_UNSIGNED_SHORT
-EOF
-
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- echo "$ac_t""no" 1>&6
-fi
-rm -f conftest*
-echo $ac_n "checking for type void""... $ac_c" 1>&6
-echo "configure:1042: checking for type void" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1044 "configure"
-#include "confdefs.h"
-
-/* Caution: a C++ compiler will insist on valid prototypes */
-typedef void * void_ptr; /* check void * */
-#ifdef HAVE_PROTOTYPES /* check ptr to function returning void */
-typedef void (*void_func) (int a, int b);
-#else
-typedef void (*void_func) ();
-#endif
-
-#ifdef HAVE_PROTOTYPES /* check void function result */
-void test3function (void_ptr arg1, void_func arg2)
-#else
-void test3function (arg1, arg2)
- void_ptr arg1;
- void_func arg2;
-#endif
-{
- char * locptr = (char *) arg1; /* check casting to and from void * */
- arg1 = (void *) locptr;
- (*arg2) (1, 2); /* check call of fcn returning void */
-}
-
-int main() {
-
-; return 0; }
-EOF
-if { (eval echo configure:1072: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- echo "$ac_t""yes" 1>&6
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define void char
-EOF
-
-fi
-rm -f conftest*
-
-echo $ac_n "checking for working const""... $ac_c" 1>&6
-echo "configure:1088: checking for working const" >&5
-if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 1093 "configure"
-#include "confdefs.h"
-
-int main() {
-
-/* Ultrix mips cc rejects this. */
-typedef int charset[2]; const charset x;
-/* SunOS 4.1.1 cc rejects this. */
-char const *const *ccp;
-char **p;
-/* NEC SVR4.0.2 mips cc rejects this. */
-struct point {int x, y;};
-static struct point const zero = {0,0};
-/* AIX XL C 1.02.0.0 rejects this.
- It does not let you subtract one const X* pointer from another in an arm
- of an if-expression whose if-part is not a constant expression */
-const char *g = "string";
-ccp = &g + (g ? g-g : 0);
-/* HPUX 7.0 cc rejects these. */
-++ccp;
-p = (char**) ccp;
-ccp = (char const *const *) p;
-{ /* SCO 3.2v4 cc rejects this. */
- char *t;
- char const *s = 0 ? (char *) 0 : (char const *) 0;
-
- *t++ = 0;
-}
-{ /* Someone thinks the Sun supposedly-ANSI compiler will reject this. */
- int x[] = {25, 17};
- const int *foo = &x[0];
- ++foo;
-}
-{ /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */
- typedef const int *iptr;
- iptr p = 0;
- ++p;
-}
-{ /* AIX XL C 1.02.0.0 rejects this saying
- "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */
- struct s { int j; const int *ap[3]; };
- struct s *b; b->j = 5;
-}
-{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */
- const int foo = 10;
-}
-
-; return 0; }
-EOF
-if { (eval echo configure:1142: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- ac_cv_c_const=yes
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- ac_cv_c_const=no
-fi
-rm -f conftest*
-fi
-
-echo "$ac_t""$ac_cv_c_const" 1>&6
-if test $ac_cv_c_const = no; then
- cat >> confdefs.h <<\EOF
-#define const
-EOF
-
-fi
-
-echo $ac_n "checking for inline""... $ac_c" 1>&6
-echo "configure:1163: checking for inline" >&5
-ijg_cv_inline=""
-cat > conftest.$ac_ext <<EOF
-#line 1166 "configure"
-#include "confdefs.h"
-
-int main() {
-} __inline__ int foo() { return 0; }
-int bar() { return foo();
-; return 0; }
-EOF
-if { (eval echo configure:1174: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- ijg_cv_inline="__inline__"
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- cat > conftest.$ac_ext <<EOF
-#line 1182 "configure"
-#include "confdefs.h"
-
-int main() {
-} __inline int foo() { return 0; }
-int bar() { return foo();
-; return 0; }
-EOF
-if { (eval echo configure:1190: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- ijg_cv_inline="__inline"
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- cat > conftest.$ac_ext <<EOF
-#line 1198 "configure"
-#include "confdefs.h"
-
-int main() {
-} inline int foo() { return 0; }
-int bar() { return foo();
-; return 0; }
-EOF
-if { (eval echo configure:1206: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- ijg_cv_inline="inline"
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
-fi
-rm -f conftest*
-fi
-rm -f conftest*
-fi
-rm -f conftest*
-echo "$ac_t""$ijg_cv_inline" 1>&6
-cat >> confdefs.h <<EOF
-#define INLINE $ijg_cv_inline
-EOF
-
-echo $ac_n "checking for broken incomplete types""... $ac_c" 1>&6
-echo "configure:1224: checking for broken incomplete types" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1226 "configure"
-#include "confdefs.h"
- typedef struct undefined_structure * undef_struct_ptr;
-int main() {
-
-; return 0; }
-EOF
-if { (eval echo configure:1233: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
- rm -rf conftest*
- echo "$ac_t""ok" 1>&6
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- echo "$ac_t""broken" 1>&6
-cat >> confdefs.h <<\EOF
-#define INCOMPLETE_TYPES_BROKEN
-EOF
-
-fi
-rm -f conftest*
-echo $ac_n "checking for short external names""... $ac_c" 1>&6
-echo "configure:1248: checking for short external names" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1250 "configure"
-#include "confdefs.h"
-
-int possibly_duplicate_function () { return 0; }
-int possibly_dupli_function () { return 1; }
-
-int main() {
-
-; return 0; }
-EOF
-if { (eval echo configure:1260: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
- rm -rf conftest*
- echo "$ac_t""ok" 1>&6
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- echo "$ac_t""short" 1>&6
-cat >> confdefs.h <<\EOF
-#define NEED_SHORT_EXTERNAL_NAMES
-EOF
-
-fi
-rm -f conftest*
-echo $ac_n "checking to see if char is signed""... $ac_c" 1>&6
-echo "configure:1275: checking to see if char is signed" >&5
-if test "$cross_compiling" = yes; then
- echo Assuming that char is signed on target machine.
-echo If it is unsigned, this will be a little bit inefficient.
-
-else
- cat > conftest.$ac_ext <<EOF
-#line 1282 "configure"
-#include "confdefs.h"
-
-#ifdef HAVE_PROTOTYPES
-int is_char_signed (int arg)
-#else
-int is_char_signed (arg)
- int arg;
-#endif
-{
- if (arg == 189) { /* expected result for unsigned char */
- return 0; /* type char is unsigned */
- }
- else if (arg != -67) { /* expected result for signed char */
- printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
- printf("I fear the JPEG software will not work at all.\n\n");
- }
- return 1; /* assume char is signed otherwise */
-}
-char signed_char_check = (char) (-67);
-main() {
- exit(is_char_signed((int) signed_char_check));
-}
-EOF
-if { (eval echo configure:1306: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
-then
- echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define CHAR_IS_UNSIGNED
-EOF
-
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -fr conftest*
- echo "$ac_t""yes" 1>&6
-fi
-rm -fr conftest*
-fi
-
-echo $ac_n "checking to see if right shift is signed""... $ac_c" 1>&6
-echo "configure:1323: checking to see if right shift is signed" >&5
-if test "$cross_compiling" = yes; then
- echo "$ac_t""Assuming that right shift is signed on target machine." 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 1328 "configure"
-#include "confdefs.h"
-
-#ifdef HAVE_PROTOTYPES
-int is_shifting_signed (long arg)
-#else
-int is_shifting_signed (arg)
- long arg;
-#endif
-/* See whether right-shift on a long is signed or not. */
-{
- long res = arg >> 4;
-
- if (res == -0x7F7E80CL) { /* expected result for signed shift */
- return 1; /* right shift is signed */
- }
- /* see if unsigned-shift hack will fix it. */
- /* we can't just test exact value since it depends on width of long... */
- res |= (~0L) << (32-4);
- if (res == -0x7F7E80CL) { /* expected result now? */
- return 0; /* right shift is unsigned */
- }
- printf("Right shift isn't acting as I expect it to.\n");
- printf("I fear the JPEG software will not work at all.\n\n");
- return 0; /* try it with unsigned anyway */
-}
-main() {
- exit(is_shifting_signed(-0x7F7E80B1L));
-}
-EOF
-if { (eval echo configure:1358: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
-then
- echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define RIGHT_SHIFT_IS_UNSIGNED
-EOF
-
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -fr conftest*
- echo "$ac_t""yes" 1>&6
-fi
-rm -fr conftest*
-fi
-
-echo $ac_n "checking to see if fopen accepts b spec""... $ac_c" 1>&6
-echo "configure:1375: checking to see if fopen accepts b spec" >&5
-if test "$cross_compiling" = yes; then
- echo "$ac_t""Assuming that it does." 1>&6
-else
- cat > conftest.$ac_ext <<EOF
-#line 1380 "configure"
-#include "confdefs.h"
-
-#include <stdio.h>
-main() {
- if (fopen("conftestdata", "wb") != NULL)
- exit(0);
- exit(1);
-}
-EOF
-if { (eval echo configure:1390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
-then
- echo "$ac_t""yes" 1>&6
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -fr conftest*
- echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define DONT_USE_B_MODE
-EOF
-
-fi
-rm -fr conftest*
-fi
-
-ac_aux_dir=
-for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
- if test -f $ac_dir/install-sh; then
- ac_aux_dir=$ac_dir
- ac_install_sh="$ac_aux_dir/install-sh -c"
- break
- elif test -f $ac_dir/install.sh; then
- ac_aux_dir=$ac_dir
- ac_install_sh="$ac_aux_dir/install.sh -c"
- break
- fi
-done
-if test -z "$ac_aux_dir"; then
- { echo "configure: error: can not find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; }
-fi
-ac_config_guess=$ac_aux_dir/config.guess
-ac_config_sub=$ac_aux_dir/config.sub
-ac_configure=$ac_aux_dir/configure # This should be Cygnus configure.
-
-# Find a good install program. We prefer a C program (faster),
-# so one script is as good as another. But avoid the broken or
-# incompatible versions:
-# SysV /etc/install, /usr/sbin/install
-# SunOS /usr/etc/install
-# IRIX /sbin/install
-# AIX /bin/install
-# AFS /usr/afsws/bin/install, which mishandles nonexistent args
-# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
-# ./install, which can be erroneously created by make from ./install.sh.
-echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6
-echo "configure:1436: checking for a BSD compatible install" >&5
-if test -z "$INSTALL"; then
-if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS="${IFS}:"
- for ac_dir in $PATH; do
- # Account for people who put trailing slashes in PATH elements.
- case "$ac_dir/" in
- /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
- *)
- # OSF1 and SCO ODT 3.0 have their own names for install.
- for ac_prog in ginstall installbsd scoinst install; do
- if test -f $ac_dir/$ac_prog; then
- if test $ac_prog = install &&
- grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
- # AIX install. It has an incompatible calling convention.
- # OSF/1 installbsd also uses dspmsg, but is usable.
- :
- else
- ac_cv_path_install="$ac_dir/$ac_prog -c"
- break 2
- fi
- fi
- done
- ;;
- esac
- done
- IFS="$ac_save_IFS"
-
-fi
- if test "${ac_cv_path_install+set}" = set; then
- INSTALL="$ac_cv_path_install"
- else
- # As a last resort, use the slow shell script. We don't cache a
- # path for INSTALL within a source directory, because that will
- # break other packages using the cache if that directory is
- # removed, or if the path is relative.
- INSTALL="$ac_install_sh"
- fi
-fi
-echo "$ac_t""$INSTALL" 1>&6
-
-# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
-# It thinks the first close brace ends the variable substitution.
-test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
-
-test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
-
-# Extract the first word of "ranlib", so it can be a program name with args.
-set dummy ranlib; ac_word=$2
-echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1488: checking for $ac_word" >&5
-if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
- echo $ac_n "(cached) $ac_c" 1>&6
-else
- if test -n "$RANLIB"; then
- ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
-else
- IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:"
- for ac_dir in $PATH; do
- test -z "$ac_dir" && ac_dir=.
- if test -f $ac_dir/$ac_word; then
- ac_cv_prog_RANLIB="ranlib"
- break
- fi
- done
- IFS="$ac_save_ifs"
- test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
-fi
-fi
-RANLIB="$ac_cv_prog_RANLIB"
-if test -n "$RANLIB"; then
- echo "$ac_t""$RANLIB" 1>&6
-else
- echo "$ac_t""no" 1>&6
-fi
-
-
-# Decide whether to use libtool,
-# and if so whether to build shared, static, or both flavors of library.
-LTSHARED="no"
-# Check whether --enable-shared or --disable-shared was given.
-if test "${enable_shared+set}" = set; then
- enableval="$enable_shared"
- LTSHARED="$enableval"
-fi
-
-LTSTATIC="no"
-# Check whether --enable-static or --disable-static was given.
-if test "${enable_static+set}" = set; then
- enableval="$enable_static"
- LTSTATIC="$enableval"
-fi
-
-if test "x$LTSHARED" != xno -o "x$LTSTATIC" != xno; then
- USELIBTOOL="yes"
- LIBTOOL="./libtool"
- O="lo"
- A="la"
- LN='$(LIBTOOL) --mode=link $(CC)'
- INSTALL_LIB='$(LIBTOOL) --mode=install ${INSTALL}'
- INSTALL_PROGRAM="\$(LIBTOOL) --mode=install $INSTALL_PROGRAM"
-else
- USELIBTOOL="no"
- LIBTOOL=""
- O="o"
- A="a"
- LN='$(CC)'
- INSTALL_LIB="$INSTALL_DATA"
-fi
-
-
-
-
-
-
-# Configure libtool if needed.
-if test $USELIBTOOL = yes; then
- disable_shared=
- disable_static=
- if test "x$LTSHARED" = xno; then
- disable_shared="--disable-shared"
- fi
- if test "x$LTSTATIC" = xno; then
- disable_static="--disable-static"
- fi
- $srcdir/ltconfig $disable_shared $disable_static $srcdir/ltmain.sh
-fi
-
-# Select memory manager depending on user input.
-# If no "-enable-maxmem", use jmemnobs
-MEMORYMGR='jmemnobs.$(O)'
-MAXMEM="no"
-# Check whether --enable-maxmem or --disable-maxmem was given.
-if test "${enable_maxmem+set}" = set; then
- enableval="$enable_maxmem"
- MAXMEM="$enableval"
-fi
-
-# support --with-maxmem for backwards compatibility with IJG V5.
-# Check whether --with-maxmem or --without-maxmem was given.
-if test "${with_maxmem+set}" = set; then
- withval="$with_maxmem"
- MAXMEM="$withval"
-fi
-
-if test "x$MAXMEM" = xyes; then
- MAXMEM=1
-fi
-if test "x$MAXMEM" != xno; then
- if test -n "`echo $MAXMEM | sed 's/[0-9]//g'`"; then
- { echo "configure: error: non-numeric argument to --enable-maxmem" 1>&2; exit 1; }
- fi
- DEFAULTMAXMEM=`expr $MAXMEM \* 1048576`
-cat >> confdefs.h <<EOF
-#define DEFAULT_MAX_MEM ${DEFAULTMAXMEM}
-EOF
-
-echo $ac_n "checking for 'tmpfile()'""... $ac_c" 1>&6
-echo "configure:1596: checking for 'tmpfile()'" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1598 "configure"
-#include "confdefs.h"
-#include <stdio.h>
-int main() {
- FILE * tfile = tmpfile();
-; return 0; }
-EOF
-if { (eval echo configure:1605: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
- rm -rf conftest*
- echo "$ac_t""yes" 1>&6
-MEMORYMGR='jmemansi.$(O)'
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- echo "$ac_t""no" 1>&6
-MEMORYMGR='jmemname.$(O)'
-cat >> confdefs.h <<\EOF
-#define NEED_SIGNAL_CATCHER
-EOF
-
-echo $ac_n "checking for 'mktemp()'""... $ac_c" 1>&6
-echo "configure:1620: checking for 'mktemp()'" >&5
-cat > conftest.$ac_ext <<EOF
-#line 1622 "configure"
-#include "confdefs.h"
-
-int main() {
- char fname[80]; mktemp(fname);
-; return 0; }
-EOF
-if { (eval echo configure:1629: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
- rm -rf conftest*
- echo "$ac_t""yes" 1>&6
-else
- echo "configure: failed program was:" >&5
- cat conftest.$ac_ext >&5
- rm -rf conftest*
- echo "$ac_t""no" 1>&6
-cat >> confdefs.h <<\EOF
-#define NO_MKTEMP
-EOF
-
-fi
-rm -f conftest*
-fi
-rm -f conftest*
-fi
-
-
-# Extract the library version ID from jpeglib.h.
-echo $ac_n "checking libjpeg version number""... $ac_c" 1>&6
-echo "configure:1650: checking libjpeg version number" >&5
-JPEG_LIB_VERSION=`sed -e '/^#define JPEG_LIB_VERSION/!d' -e 's/^[^0-9]*\([0-9][0-9]*\).*$/\1/' $srcdir/jpeglib.h`
-echo "$ac_t""$JPEG_LIB_VERSION" 1>&6
-
-
-# Prepare to massage makefile.cfg correctly.
-if test $ijg_cv_have_prototypes = yes; then
- A2K_DEPS=""
- COM_A2K="# "
-else
- A2K_DEPS="ansi2knr"
- COM_A2K=""
-fi
-
-
-# ansi2knr needs -DBSD if string.h is missing
-if test $ac_cv_header_string_h = no; then
- ANSI2KNRFLAGS="-DBSD"
-else
- ANSI2KNRFLAGS=""
-fi
-
-# Substitutions to enable or disable libtool-related stuff
-if test $USELIBTOOL = yes -a $ijg_cv_have_prototypes = yes; then
- COM_LT=""
-else
- COM_LT="# "
-fi
-
-if test "x$LTSHARED" != xno; then
- FORCE_INSTALL_LIB="install-lib"
-else
- FORCE_INSTALL_LIB=""
-fi
-
-# Set up -I directives
-if test "x$srcdir" = x.; then
- INCLUDEFLAGS='-I$(srcdir)'
-else
- INCLUDEFLAGS='-I. -I$(srcdir)'
-fi
-
-trap '' 1 2 15
-
-trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
-
-test "x$prefix" = xNONE && prefix=$ac_default_prefix
-# Let make expand exec_prefix.
-test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
-
-# Any assignment to VPATH causes Sun make to only execute
-# the first set of double-colon rules, so remove it if not needed.
-# If there is a colon in the path, we need to keep it.
-if test "x$srcdir" = x.; then
- ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d'
-fi
-
-trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15
-
-DEFS=-DHAVE_CONFIG_H
-
-# Without the "./", some shells look in PATH for config.status.
-: ${CONFIG_STATUS=./config.status}
-
-echo creating $CONFIG_STATUS
-rm -f $CONFIG_STATUS
-cat > $CONFIG_STATUS <<EOF
-#! /bin/sh
-# Generated automatically by configure.
-# Run this file to recreate the current configuration.
-# This directory was configured as follows,
-# on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
-#
-# $0 $ac_configure_args
-#
-# Compiler output produced by configure, useful for debugging
-# configure, is in ./config.log if it exists.
-
-ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]"
-for ac_option
-do
- case "\$ac_option" in
- -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
- echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion"
- exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;;
- -version | --version | --versio | --versi | --vers | --ver | --ve | --v)
- echo "$CONFIG_STATUS generated by autoconf version 2.12"
- exit 0 ;;
- -help | --help | --hel | --he | --h)
- echo "\$ac_cs_usage"; exit 0 ;;
- *) echo "\$ac_cs_usage"; exit 1 ;;
- esac
-done
-
-ac_given_srcdir=$srcdir
-ac_given_INSTALL="$INSTALL"
-
-trap 'rm -fr `echo "Makefile:makefile.cfg jconfig.h:jconfig.cfg" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15
-EOF
-cat >> $CONFIG_STATUS <<EOF
-
-# Protect against being on the right side of a sed subst in config.status.
-sed 's/%@/@@/; s/@%/@@/; s/%g\$/@g/; /@g\$/s/[\\\\&%]/\\\\&/g;
- s/@@/%@/; s/@@/@%/; s/@g\$/%g/' > conftest.subs <<\\CEOF
-$ac_vpsub
-$extrasub
-s%@CFLAGS@%$CFLAGS%g
-s%@CPPFLAGS@%$CPPFLAGS%g
-s%@CXXFLAGS@%$CXXFLAGS%g
-s%@DEFS@%$DEFS%g
-s%@LDFLAGS@%$LDFLAGS%g
-s%@LIBS@%$LIBS%g
-s%@exec_prefix@%$exec_prefix%g
-s%@prefix@%$prefix%g
-s%@program_transform_name@%$program_transform_name%g
-s%@bindir@%$bindir%g
-s%@sbindir@%$sbindir%g
-s%@libexecdir@%$libexecdir%g
-s%@datadir@%$datadir%g
-s%@sysconfdir@%$sysconfdir%g
-s%@sharedstatedir@%$sharedstatedir%g
-s%@localstatedir@%$localstatedir%g
-s%@libdir@%$libdir%g
-s%@includedir@%$includedir%g
-s%@oldincludedir@%$oldincludedir%g
-s%@infodir@%$infodir%g
-s%@mandir@%$mandir%g
-s%@CC@%$CC%g
-s%@CPP@%$CPP%g
-s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%g
-s%@INSTALL_DATA@%$INSTALL_DATA%g
-s%@RANLIB@%$RANLIB%g
-s%@LIBTOOL@%$LIBTOOL%g
-s%@O@%$O%g
-s%@A@%$A%g
-s%@LN@%$LN%g
-s%@INSTALL_LIB@%$INSTALL_LIB%g
-s%@MEMORYMGR@%$MEMORYMGR%g
-s%@JPEG_LIB_VERSION@%$JPEG_LIB_VERSION%g
-s%@A2K_DEPS@%$A2K_DEPS%g
-s%@COM_A2K@%$COM_A2K%g
-s%@ANSI2KNRFLAGS@%$ANSI2KNRFLAGS%g
-s%@COM_LT@%$COM_LT%g
-s%@FORCE_INSTALL_LIB@%$FORCE_INSTALL_LIB%g
-s%@INCLUDEFLAGS@%$INCLUDEFLAGS%g
-
-CEOF
-EOF
-
-cat >> $CONFIG_STATUS <<\EOF
-
-# Split the substitutions into bite-sized pieces for seds with
-# small command number limits, like on Digital OSF/1 and HP-UX.
-ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script.
-ac_file=1 # Number of current file.
-ac_beg=1 # First line for current file.
-ac_end=$ac_max_sed_cmds # Line after last line for current file.
-ac_more_lines=:
-ac_sed_cmds=""
-while $ac_more_lines; do
- if test $ac_beg -gt 1; then
- sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file
- else
- sed "${ac_end}q" conftest.subs > conftest.s$ac_file
- fi
- if test ! -s conftest.s$ac_file; then
- ac_more_lines=false
- rm -f conftest.s$ac_file
- else
- if test -z "$ac_sed_cmds"; then
- ac_sed_cmds="sed -f conftest.s$ac_file"
- else
- ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file"
- fi
- ac_file=`expr $ac_file + 1`
- ac_beg=$ac_end
- ac_end=`expr $ac_end + $ac_max_sed_cmds`
- fi
-done
-if test -z "$ac_sed_cmds"; then
- ac_sed_cmds=cat
-fi
-EOF
-
-cat >> $CONFIG_STATUS <<EOF
-
-CONFIG_FILES=\${CONFIG_FILES-"Makefile:makefile.cfg"}
-EOF
-cat >> $CONFIG_STATUS <<\EOF
-for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then
- # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
- case "$ac_file" in
- *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'`
- ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;;
- *) ac_file_in="${ac_file}.in" ;;
- esac
-
- # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories.
-
- # Remove last slash and all that follows it. Not all systems have dirname.
- ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'`
- if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
- # The file is in a subdirectory.
- test ! -d "$ac_dir" && mkdir "$ac_dir"
- ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`"
- # A "../" for each directory in $ac_dir_suffix.
- ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'`
- else
- ac_dir_suffix= ac_dots=
- fi
-
- case "$ac_given_srcdir" in
- .) srcdir=.
- if test -z "$ac_dots"; then top_srcdir=.
- else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;;
- /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;;
- *) # Relative path.
- srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix"
- top_srcdir="$ac_dots$ac_given_srcdir" ;;
- esac
-
- case "$ac_given_INSTALL" in
- [/$]*) INSTALL="$ac_given_INSTALL" ;;
- *) INSTALL="$ac_dots$ac_given_INSTALL" ;;
- esac
-
- echo creating "$ac_file"
- rm -f "$ac_file"
- configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure."
- case "$ac_file" in
- *Makefile*) ac_comsub="1i\\
-# $configure_input" ;;
- *) ac_comsub= ;;
- esac
-
- ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"`
- sed -e "$ac_comsub
-s%@configure_input@%$configure_input%g
-s%@srcdir@%$srcdir%g
-s%@top_srcdir@%$top_srcdir%g
-s%@INSTALL@%$INSTALL%g
-" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file
-fi; done
-rm -f conftest.s*
-
-# These sed commands are passed to sed as "A NAME B NAME C VALUE D", where
-# NAME is the cpp macro being defined and VALUE is the value it is being given.
-#
-# ac_d sets the value in "#define NAME VALUE" lines.
-ac_dA='s%^\([ ]*\)#\([ ]*define[ ][ ]*\)'
-ac_dB='\([ ][ ]*\)[^ ]*%\1#\2'
-ac_dC='\3'
-ac_dD='%g'
-# ac_u turns "#undef NAME" with trailing blanks into "#define NAME VALUE".
-ac_uA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)'
-ac_uB='\([ ]\)%\1#\2define\3'
-ac_uC=' '
-ac_uD='\4%g'
-# ac_e turns "#undef NAME" without trailing blanks into "#define NAME VALUE".
-ac_eA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)'
-ac_eB='$%\1#\2define\3'
-ac_eC=' '
-ac_eD='%g'
-
-if test "${CONFIG_HEADERS+set}" != set; then
-EOF
-cat >> $CONFIG_STATUS <<EOF
- CONFIG_HEADERS="jconfig.h:jconfig.cfg"
-EOF
-cat >> $CONFIG_STATUS <<\EOF
-fi
-for ac_file in .. $CONFIG_HEADERS; do if test "x$ac_file" != x..; then
- # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
- case "$ac_file" in
- *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'`
- ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;;
- *) ac_file_in="${ac_file}.in" ;;
- esac
-
- echo creating $ac_file
-
- rm -f conftest.frag conftest.in conftest.out
- ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"`
- cat $ac_file_inputs > conftest.in
-
-EOF
-
-# Transform confdefs.h into a sed script conftest.vals that substitutes
-# the proper values into config.h.in to produce config.h. And first:
-# Protect against being on the right side of a sed subst in config.status.
-# Protect against being in an unquoted here document in config.status.
-rm -f conftest.vals
-cat > conftest.hdr <<\EOF
-s/[\\&%]/\\&/g
-s%[\\$`]%\\&%g
-s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD}%gp
-s%ac_d%ac_u%gp
-s%ac_u%ac_e%gp
-EOF
-sed -n -f conftest.hdr confdefs.h > conftest.vals
-rm -f conftest.hdr
-
-# This sed command replaces #undef with comments. This is necessary, for
-# example, in the case of _POSIX_SOURCE, which is predefined and required
-# on some systems where configure will not decide to define it.
-cat >> conftest.vals <<\EOF
-EOF
-
-# Break up conftest.vals because some shells have a limit on
-# the size of here documents, and old seds have small limits too.
-
-rm -f conftest.tail
-while :
-do
- ac_lines=`grep -c . conftest.vals`
- # grep -c gives empty output for an empty file on some AIX systems.
- if test -z "$ac_lines" || test "$ac_lines" -eq 0; then break; fi
- # Write a limited-size here document to conftest.frag.
- echo ' cat > conftest.frag <<CEOF' >> $CONFIG_STATUS
- sed ${ac_max_here_lines}q conftest.vals >> $CONFIG_STATUS
- echo 'CEOF
- sed -f conftest.frag conftest.in > conftest.out
- rm -f conftest.in
- mv conftest.out conftest.in
-' >> $CONFIG_STATUS
- sed 1,${ac_max_here_lines}d conftest.vals > conftest.tail
- rm -f conftest.vals
- mv conftest.tail conftest.vals
-done
-rm -f conftest.vals
-
-cat >> $CONFIG_STATUS <<\EOF
- rm -f conftest.frag conftest.h
- echo "/* $ac_file. Generated automatically by configure. */" > conftest.h
- cat conftest.in >> conftest.h
- rm -f conftest.in
- if cmp -s $ac_file conftest.h 2>/dev/null; then
- echo "$ac_file is unchanged"
- rm -f conftest.h
- else
- # Remove last slash and all that follows it. Not all systems have dirname.
- ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'`
- if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
- # The file is in a subdirectory.
- test ! -d "$ac_dir" && mkdir "$ac_dir"
- fi
- rm -f $ac_file
- mv conftest.h $ac_file
- fi
-fi; done
-
-EOF
-cat >> $CONFIG_STATUS <<EOF
-
-EOF
-cat >> $CONFIG_STATUS <<\EOF
-
-exit 0
-EOF
-chmod +x $CONFIG_STATUS
-rm -fr confdefs* $ac_clean_files
-test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1
-
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..cd32445
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,124 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ([2.57])
+AC_INIT([libjpeg], [6.b])
+
+AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
+
+# Always build with prototypes
+AC_DEFINE([HAVE_PROTOTYPES], 1, [Define if your compiler supports prototypes])
+# Don't use undefined types
+AC_DEFINE([INCOMPLETE_TYPES_BROKEN], 1, [Define if you want to use complete types])
+
+# Checks for programs.
+AC_PROG_CPP
+AC_PROG_CC
+AC_PROG_CXX
+AC_PROG_INSTALL
+AC_PROG_LIBTOOL
+AC_PROG_LN_S
+
+# Checks for libraries.
+
+# Checks for header files.
+AC_HEADER_STDC
+AC_CHECK_HEADERS([stddef.h stdlib.h string.h])
+AC_CHECK_HEADER([sys/types.h], AC_DEFINE([NEED_SYS_TYPES_H], 1, [Define if you have sys/types.h]))
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_C_CHAR_UNSIGNED
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_CHECK_TYPES([unsigned char, unsigned short])
+
+AC_MSG_CHECKING([if right shift is signed])
+AC_TRY_RUN(
+ [#include <stdio.h>
+ int is_shifting_signed (long arg) {
+ long res = arg >> 4;
+
+ if (res == -0x7F7E80CL)
+ return 1; /* right shift is signed */
+
+ /* see if unsigned-shift hack will fix it. */
+ /* we can't just test exact value since it depends on width of long... */
+ res |= (~0L) << (32-4);
+ if (res == -0x7F7E80CL)
+ return 0; /* right shift is unsigned */
+
+ printf("Right shift isn't acting as I expect it to.\n");
+ printf("I fear the JPEG software will not work at all.\n\n");
+ return 0; /* try it with unsigned anyway */
+ }
+ int main (void) {
+ return is_shifting_signed(-0x7F7E80B1L); /* status 1: signed, 0: unsigned; return avoids an undeclared exit() */
+ }],
+ [AC_MSG_RESULT(no)
+ AC_DEFINE([RIGHT_SHIFT_IS_UNSIGNED], 1, [Define if shift is unsigned])],
+ [AC_MSG_RESULT(yes)],
+ [AC_MSG_RESULT(Assuming that right shift is signed on target machine.)])
+
+# test whether global names are unique to at least 15 chars
+AC_MSG_CHECKING([for short external names])
+AC_TRY_LINK(
+ [int possibly_duplicate_function () { return 0; }
+ int possibly_dupli_function () { return 1; }], [ ],
+ [AC_MSG_RESULT(ok)],
+ [AC_MSG_RESULT(short)
+ AC_DEFINE([NEED_SHORT_EXTERNAL_NAMES], 1, [Define if you need short function names])])
+
+# Checks for library functions.
+AC_CHECK_FUNCS([memset memcpy], [],
+ [AC_DEFINE([NEED_BSD_STRINGS], 1,
+ [Define if you have BSD-like bzero and bcopy])])
+
+# Set flags to indicate platform
+case "$host_os" in
+ cygwin* | mingw* | pw32* | interix*)
+ is_win32=1
+ ;;
+esac
+AM_CONDITIONAL([IS_WIN32], [test "x$is_win32" = "x1"])
+
+# SIMD is optional
+AC_ARG_WITH([simd],
+ AC_HELP_STRING([--without-simd],[Omit accelerated SIMD routines.]))
+if test "x${with_simd}" != "xno"; then
+ # Check if we're on a supported CPU
+ AC_MSG_CHECKING([if we have SIMD optimisations for cpu type])
+ case "$host_cpu" in
+ x86_64)
+ AC_MSG_RESULT([yes (x86_64)])
+ AC_PROG_NASM
+ simd_arch=x86_64
+ ;;
+ i*86 | x86 | ia32)
+ AC_MSG_RESULT([yes (i386)])
+ AC_PROG_NASM
+ simd_arch=i386
+ ;;
+ *)
+ AC_MSG_RESULT([no ("$host_cpu")])
+ AC_MSG_ERROR([CPU is not supported])
+ ;;
+ esac
+
+ if test "x${with_simd}" != "xno"; then
+ AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
+ fi
+fi
+
+AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
+AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
+AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
+
+# jconfig.h is the file we use, but we list another header (config.h)
+# before it to fool autoheader.  The reason is that we include jconfig.h in
+# our API headers, which can screw things up for users of the library.
+# jconfig.h is a minimal version that allows this package to be built.
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_HEADERS([jconfig.h])
+AC_CONFIG_FILES([Makefile simd/Makefile])
+AC_OUTPUT
diff --git a/djpeg.1 b/djpeg.1
deleted file mode 100644
index 11beb6a..0000000
--- a/djpeg.1
+++ /dev/null
@@ -1,253 +0,0 @@
-.TH DJPEG 1 "22 August 1997"
-.SH NAME
-djpeg \- decompress a JPEG file to an image file
-.SH SYNOPSIS
-.B djpeg
-[
-.I options
-]
-[
-.I filename
-]
-.LP
-.SH DESCRIPTION
-.LP
-.B djpeg
-decompresses the named JPEG file, or the standard input if no file is named,
-and produces an image file on the standard output. PBMPLUS (PPM/PGM), BMP,
-GIF, Targa, or RLE (Utah Raster Toolkit) output format can be selected.
-(RLE is supported only if the URT library is available.)
-.SH OPTIONS
-All switch names may be abbreviated; for example,
-.B \-grayscale
-may be written
-.B \-gray
-or
-.BR \-gr .
-Most of the "basic" switches can be abbreviated to as little as one letter.
-Upper and lower case are equivalent (thus
-.B \-BMP
-is the same as
-.BR \-bmp ).
-British spellings are also accepted (e.g.,
-.BR \-greyscale ),
-though for brevity these are not mentioned below.
-.PP
-The basic switches are:
-.TP
-.BI \-colors " N"
-Reduce image to at most N colors. This reduces the number of colors used in
-the output image, so that it can be displayed on a colormapped display or
-stored in a colormapped file format. For example, if you have an 8-bit
-display, you'd need to reduce to 256 or fewer colors.
-.TP
-.BI \-quantize " N"
-Same as
-.BR \-colors .
-.B \-colors
-is the recommended name,
-.B \-quantize
-is provided only for backwards compatibility.
-.TP
-.B \-fast
-Select recommended processing options for fast, low quality output. (The
-default options are chosen for highest quality output.) Currently, this is
-equivalent to \fB\-dct fast \-nosmooth \-onepass \-dither ordered\fR.
-.TP
-.B \-grayscale
-Force gray-scale output even if JPEG file is color. Useful for viewing on
-monochrome displays; also,
-.B djpeg
-runs noticeably faster in this mode.
-.TP
-.BI \-scale " M/N"
-Scale the output image by a factor M/N. Currently the scale factor must be
-1/1, 1/2, 1/4, or 1/8. Scaling is handy if the image is larger than your
-screen; also,
-.B djpeg
-runs much faster when scaling down the output.
-.TP
-.B \-bmp
-Select BMP output format (Windows flavor). 8-bit colormapped format is
-emitted if
-.B \-colors
-or
-.B \-grayscale
-is specified, or if the JPEG file is gray-scale; otherwise, 24-bit full-color
-format is emitted.
-.TP
-.B \-gif
-Select GIF output format. Since GIF does not support more than 256 colors,
-.B \-colors 256
-is assumed (unless you specify a smaller number of colors).
-.TP
-.B \-os2
-Select BMP output format (OS/2 1.x flavor). 8-bit colormapped format is
-emitted if
-.B \-colors
-or
-.B \-grayscale
-is specified, or if the JPEG file is gray-scale; otherwise, 24-bit full-color
-format is emitted.
-.TP
-.B \-pnm
-Select PBMPLUS (PPM/PGM) output format (this is the default format).
-PGM is emitted if the JPEG file is gray-scale or if
-.B \-grayscale
-is specified; otherwise PPM is emitted.
-.TP
-.B \-rle
-Select RLE output format. (Requires URT library.)
-.TP
-.B \-targa
-Select Targa output format. Gray-scale format is emitted if the JPEG file is
-gray-scale or if
-.B \-grayscale
-is specified; otherwise, colormapped format is emitted if
-.B \-colors
-is specified; otherwise, 24-bit full-color format is emitted.
-.PP
-Switches for advanced users:
-.TP
-.B \-dct int
-Use integer DCT method (default).
-.TP
-.B \-dct fast
-Use fast integer DCT (less accurate).
-.TP
-.B \-dct float
-Use floating-point DCT method.
-The float method is very slightly more accurate than the int method, but is
-much slower unless your machine has very fast floating-point hardware. Also
-note that results of the floating-point method may vary slightly across
-machines, while the integer methods should give the same results everywhere.
-The fast integer method is much less accurate than the other two.
-.TP
-.B \-dither fs
-Use Floyd-Steinberg dithering in color quantization.
-.TP
-.B \-dither ordered
-Use ordered dithering in color quantization.
-.TP
-.B \-dither none
-Do not use dithering in color quantization.
-By default, Floyd-Steinberg dithering is applied when quantizing colors; this
-is slow but usually produces the best results. Ordered dither is a compromise
-between speed and quality; no dithering is fast but usually looks awful. Note
-that these switches have no effect unless color quantization is being done.
-Ordered dither is only available in
-.B \-onepass
-mode.
-.TP
-.BI \-map " file"
-Quantize to the colors used in the specified image file. This is useful for
-producing multiple files with identical color maps, or for forcing a
-predefined set of colors to be used. The
-.I file
-must be a GIF or PPM file. This option overrides
-.B \-colors
-and
-.BR \-onepass .
-.TP
-.B \-nosmooth
-Use a faster, lower-quality upsampling routine.
-.TP
-.B \-onepass
-Use one-pass instead of two-pass color quantization. The one-pass method is
-faster and needs less memory, but it produces a lower-quality image.
-.B \-onepass
-is ignored unless you also say
-.B \-colors
-.IR N .
-Also, the one-pass method is always used for gray-scale output (the two-pass
-method is no improvement then).
-.TP
-.BI \-maxmemory " N"
-Set limit for amount of memory to use in processing large images. Value is
-in thousands of bytes, or millions of bytes if "M" is attached to the
-number. For example,
-.B \-max 4m
-selects 4000000 bytes. If more space is needed, temporary files will be used.
-.TP
-.BI \-outfile " name"
-Send output image to the named file, not to standard output.
-.TP
-.B \-verbose
-Enable debug printout. More
-.BR \-v 's
-give more output. Also, version information is printed at startup.
-.TP
-.B \-debug
-Same as
-.BR \-verbose .
-.SH EXAMPLES
-.LP
-This example decompresses the JPEG file foo.jpg, quantizes it to
-256 colors, and saves the output in 8-bit BMP format in foo.bmp:
-.IP
-.B djpeg \-colors 256 \-bmp
-.I foo.jpg
-.B >
-.I foo.bmp
-.SH HINTS
-To get a quick preview of an image, use the
-.B \-grayscale
-and/or
-.B \-scale
-switches.
-.B \-grayscale \-scale 1/8
-is the fastest case.
-.PP
-Several options are available that trade off image quality to gain speed.
-.B \-fast
-turns on the recommended settings.
-.PP
-.B \-dct fast
-and/or
-.B \-nosmooth
-gain speed at a small sacrifice in quality.
-When producing a color-quantized image,
-.B \-onepass \-dither ordered
-is fast but much lower quality than the default behavior.
-.B \-dither none
-may give acceptable results in two-pass mode, but is seldom tolerable in
-one-pass mode.
-.PP
-If you are fortunate enough to have very fast floating point hardware,
-\fB\-dct float\fR may be even faster than \fB\-dct fast\fR. But on most
-machines \fB\-dct float\fR is slower than \fB\-dct int\fR; in this case it is
-not worth using, because its theoretical accuracy advantage is too small to be
-significant in practice.
-.SH ENVIRONMENT
-.TP
-.B JPEGMEM
-If this environment variable is set, its value is the default memory limit.
-The value is specified as described for the
-.B \-maxmemory
-switch.
-.B JPEGMEM
-overrides the default value specified when the program was compiled, and
-itself is overridden by an explicit
-.BR \-maxmemory .
-.SH SEE ALSO
-.BR cjpeg (1),
-.BR jpegtran (1),
-.BR rdjpgcom (1),
-.BR wrjpgcom (1)
-.br
-.BR ppm (5),
-.BR pgm (5)
-.br
-Wallace, Gregory K. "The JPEG Still Picture Compression Standard",
-Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
-.SH AUTHOR
-Independent JPEG Group
-.SH BUGS
-Arithmetic coding is not supported for legal reasons.
-.PP
-To avoid the Unisys LZW patent,
-.B djpeg
-produces uncompressed GIF files. These are larger than they should be, but
-are readable by standard GIF decoders.
-.PP
-Still not as fast as we'd like.
diff --git a/install-sh b/install-sh
deleted file mode 100755
index e843669..0000000
--- a/install-sh
+++ /dev/null
@@ -1,250 +0,0 @@
-#!/bin/sh
-#
-# install - install a program, script, or datafile
-# This comes from X11R5 (mit/util/scripts/install.sh).
-#
-# Copyright 1991 by the Massachusetts Institute of Technology
-#
-# Permission to use, copy, modify, distribute, and sell this software and its
-# documentation for any purpose is hereby granted without fee, provided that
-# the above copyright notice appear in all copies and that both that
-# copyright notice and this permission notice appear in supporting
-# documentation, and that the name of M.I.T. not be used in advertising or
-# publicity pertaining to distribution of the software without specific,
-# written prior permission. M.I.T. makes no representations about the
-# suitability of this software for any purpose. It is provided "as is"
-# without express or implied warranty.
-#
-# Calling this script install-sh is preferred over install.sh, to prevent
-# `make' implicit rules from creating a file called install from it
-# when there is no Makefile.
-#
-# This script is compatible with the BSD install script, but was written
-# from scratch. It can only install one file at a time, a restriction
-# shared with many OS's install programs.
-
-
-# set DOITPROG to echo to test this script
-
-# Don't use :- since 4.3BSD and earlier shells don't like it.
-doit="${DOITPROG-}"
-
-
-# put in absolute paths if you don't have them in your path; or use env. vars.
-
-mvprog="${MVPROG-mv}"
-cpprog="${CPPROG-cp}"
-chmodprog="${CHMODPROG-chmod}"
-chownprog="${CHOWNPROG-chown}"
-chgrpprog="${CHGRPPROG-chgrp}"
-stripprog="${STRIPPROG-strip}"
-rmprog="${RMPROG-rm}"
-mkdirprog="${MKDIRPROG-mkdir}"
-
-transformbasename=""
-transform_arg=""
-instcmd="$mvprog"
-chmodcmd="$chmodprog 0755"
-chowncmd=""
-chgrpcmd=""
-stripcmd=""
-rmcmd="$rmprog -f"
-mvcmd="$mvprog"
-src=""
-dst=""
-dir_arg=""
-
-while [ x"$1" != x ]; do
- case $1 in
- -c) instcmd="$cpprog"
- shift
- continue;;
-
- -d) dir_arg=true
- shift
- continue;;
-
- -m) chmodcmd="$chmodprog $2"
- shift
- shift
- continue;;
-
- -o) chowncmd="$chownprog $2"
- shift
- shift
- continue;;
-
- -g) chgrpcmd="$chgrpprog $2"
- shift
- shift
- continue;;
-
- -s) stripcmd="$stripprog"
- shift
- continue;;
-
- -t=*) transformarg=`echo $1 | sed 's/-t=//'`
- shift
- continue;;
-
- -b=*) transformbasename=`echo $1 | sed 's/-b=//'`
- shift
- continue;;
-
- *) if [ x"$src" = x ]
- then
- src=$1
- else
- # this colon is to work around a 386BSD /bin/sh bug
- :
- dst=$1
- fi
- shift
- continue;;
- esac
-done
-
-if [ x"$src" = x ]
-then
- echo "install: no input file specified"
- exit 1
-else
- true
-fi
-
-if [ x"$dir_arg" != x ]; then
- dst=$src
- src=""
-
- if [ -d $dst ]; then
- instcmd=:
- else
- instcmd=mkdir
- fi
-else
-
-# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
-# might cause directories to be created, which would be especially bad
-# if $src (and thus $dsttmp) contains '*'.
-
- if [ -f $src -o -d $src ]
- then
- true
- else
- echo "install: $src does not exist"
- exit 1
- fi
-
- if [ x"$dst" = x ]
- then
- echo "install: no destination specified"
- exit 1
- else
- true
- fi
-
-# If destination is a directory, append the input filename; if your system
-# does not like double slashes in filenames, you may need to add some logic
-
- if [ -d $dst ]
- then
- dst="$dst"/`basename $src`
- else
- true
- fi
-fi
-
-## this sed command emulates the dirname command
-dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
-
-# Make sure that the destination directory exists.
-# this part is taken from Noah Friedman's mkinstalldirs script
-
-# Skip lots of stat calls in the usual case.
-if [ ! -d "$dstdir" ]; then
-defaultIFS='
-'
-IFS="${IFS-${defaultIFS}}"
-
-oIFS="${IFS}"
-# Some sh's can't handle IFS=/ for some reason.
-IFS='%'
-set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
-IFS="${oIFS}"
-
-pathcomp=''
-
-while [ $# -ne 0 ] ; do
- pathcomp="${pathcomp}${1}"
- shift
-
- if [ ! -d "${pathcomp}" ] ;
- then
- $mkdirprog "${pathcomp}"
- else
- true
- fi
-
- pathcomp="${pathcomp}/"
-done
-fi
-
-if [ x"$dir_arg" != x ]
-then
- $doit $instcmd $dst &&
-
- if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
- if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
- if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
- if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
-else
-
-# If we're going to rename the final executable, determine the name now.
-
- if [ x"$transformarg" = x ]
- then
- dstfile=`basename $dst`
- else
- dstfile=`basename $dst $transformbasename |
- sed $transformarg`$transformbasename
- fi
-
-# don't allow the sed command to completely eliminate the filename
-
- if [ x"$dstfile" = x ]
- then
- dstfile=`basename $dst`
- else
- true
- fi
-
-# Make a temp file name in the proper directory.
-
- dsttmp=$dstdir/#inst.$$#
-
-# Move or copy the file name to the temp name
-
- $doit $instcmd $src $dsttmp &&
-
- trap "rm -f ${dsttmp}" 0 &&
-
-# and set any options; do chmod last to preserve setuid bits
-
-# If any of these fail, we abort the whole thing. If we want to
-# ignore errors from any of these, just make sure not to ignore
-# errors from the above "$doit $instcmd $src $dsttmp" command.
-
- if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
- if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
- if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
- if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
-
-# Now rename the file to the real destination.
-
- $doit $rmcmd -f $dstdir/$dstfile &&
- $doit $mvcmd $dsttmp $dstdir/$dstfile
-
-fi &&
-
-
-exit 0
diff --git a/jccolor.c b/jccolor.c
index 0a8a4b5..2e2bfd2 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -2,6 +2,8 @@
* jccolor.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -11,6 +13,7 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jsimd.h"
/* Private subobject */
@@ -78,6 +81,74 @@
#define TABLE_SIZE (8*(MAXJSAMPLE+1))
+#if BITS_IN_JSAMPLE == 8
+
+const unsigned char red_lut[256] = { /* red term of the 8-bit luma sum; entries track round(0.299 * i) */
+ 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
+ 5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 ,
+ 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14,
+ 14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
+ 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24,
+ 24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28,
+ 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33,
+ 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38,
+ 38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43,
+ 43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48,
+ 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52,
+ 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57,
+ 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62,
+ 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67,
+ 67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71,
+ 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76
+};
+
+const unsigned char green_lut[256] = { /* green term of the 8-bit luma sum; entries track round(0.587 * i), non-decreasing */
+ 0 , 1 , 1 , 2 , 2 , 3 , 4 , 4 , 5 , 5 , 6 , 6 ,
+ 7 , 8 , 8 , 9 , 9 , 10 , 11 , 11 , 12 , 12 , 13 , 14 ,
+ 14 , 15 , 15 , 16 , 16 , 17 , 18 , 18 , 19 , 19 , 20 , 21 ,
+ 21 , 22 , 22 , 23 , 23 , 24 , 25 , 25 , 26 , 26 , 27 , 28 ,
+ 28 , 29 , 29 , 30 , 31 , 31 , 32 , 32 , 33 , 33 , 34 , 35 ,
+ 35 , 36 , 36 , 37 , 38 , 38 , 39 , 39 , 40 , 41 , 41 , 42 ,
+ 42 , 43 , 43 , 44 , 45 , 45 , 46 , 46 , 47 , 48 , 48 , 49 ,
+ 49 , 50 , 50 , 51 , 52 , 52 , 53 , 53 , 54 , 55 , 55 , 56 ,
+ 56 , 57 , 58 , 58 , 59 , 59 , 60 , 60 , 61 , 62 , 62 , 63 ,
+ 63 , 64 , 65 , 65 , 66 , 66 , 67 , 68 , 68 , 69 , 69 , 70 ,
+ 70 , 71 , 72 , 72 , 73 , 73 , 74 , 75 , 75 , 76 , 76 , 77 ,
+ 77 , 78 , 79 , 79 , 80 , 80 , 81 , 82 , 82 , 83 , 83 , 84 ,
+ 85 , 85 , 86 , 86 , 87 , 87 , 88 , 89 , 89 , 90 , 90 , 91 ,
+ 92 , 92 , 93 , 93 , 94 , 95 , 95 , 96 , 96 , 97 , 97 , 98 ,
+ 99 , 99 , 100, 100, 101, 102, 102, 103, 103, 104, 104, 105,
+ 106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112,
+ 113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119,
+ 120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126,
+ 127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133,
+ 134, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140,
+ 141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147,
+ 148, 149, 149, 150
+};
+
+const unsigned char blue_lut[256] = { /* blue term of the 8-bit luma sum; entries track round(0.114 * i) */
+ 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
+ 2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 ,
+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
+ 5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 ,
+ 7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 ,
+ 9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+ 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+ 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+ 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24,
+ 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29
+};
+
+#endif
+
+
/*
* Initialize for RGB->YCC colorspace conversion.
*/
@@ -146,10 +217,10 @@
outptr2 = output_buf[2][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr[RGB_RED]);
- g = GETJSAMPLE(inptr[RGB_GREEN]);
- b = GETJSAMPLE(inptr[RGB_BLUE]);
- inptr += RGB_PIXELSIZE;
+ r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]);
+ g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]);
+ b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]);
+ inptr += rgb_pixelsize[cinfo->in_color_space];
/* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
* must be too; we do not need an explicit range-limiting operation.
* Hence the value being shifted is never negative, and we don't
@@ -188,26 +259,36 @@
JDIMENSION output_row, int num_rows)
{
my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
- register int r, g, b;
+ #if BITS_IN_JSAMPLE != 8
register INT32 * ctab = cconvert->rgb_ycc_tab;
+ #endif
register JSAMPROW inptr;
register JSAMPROW outptr;
+ JSAMPLE *maxoutptr;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->image_width;
+ int rindex = rgb_red[cinfo->in_color_space];
+ int gindex = rgb_green[cinfo->in_color_space];
+ int bindex = rgb_blue[cinfo->in_color_space];
+ int rgbstride = rgb_pixelsize[cinfo->in_color_space];
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr = output_buf[0][output_row];
+ maxoutptr = &outptr[num_cols];
output_row++;
- for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr[RGB_RED]);
- g = GETJSAMPLE(inptr[RGB_GREEN]);
- b = GETJSAMPLE(inptr[RGB_BLUE]);
- inptr += RGB_PIXELSIZE;
+ for (; outptr < maxoutptr; outptr++, inptr += rgbstride) {
/* Y */
- outptr[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ #if BITS_IN_JSAMPLE == 8
+ *outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]]
+ + blue_lut[inptr[bindex]];
+ #else
+ *outptr = (JSAMPLE)
+ ((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF]
+ + ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF]
+ + ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF])
+ >> SCALEBITS);
+ #endif
}
}
}
@@ -368,11 +449,15 @@
break;
case JCS_RGB:
-#if RGB_PIXELSIZE != 3
- if (cinfo->input_components != RGB_PIXELSIZE)
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
+ if (cinfo->input_components != rgb_pixelsize[cinfo->in_color_space])
ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
break;
-#endif /* else share code with YCbCr */
case JCS_YCbCr:
if (cinfo->input_components != 3)
@@ -398,7 +483,13 @@
ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
if (cinfo->in_color_space == JCS_GRAYSCALE)
cconvert->pub.color_convert = grayscale_convert;
- else if (cinfo->in_color_space == JCS_RGB) {
+ else if (cinfo->in_color_space == JCS_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGBX ||
+ cinfo->in_color_space == JCS_EXT_BGR ||
+ cinfo->in_color_space == JCS_EXT_BGRX ||
+ cinfo->in_color_space == JCS_EXT_XBGR ||
+ cinfo->in_color_space == JCS_EXT_XRGB) {
cconvert->pub.start_pass = rgb_ycc_start;
cconvert->pub.color_convert = rgb_gray_convert;
} else if (cinfo->in_color_space == JCS_YCbCr)
@@ -408,9 +499,16 @@
break;
case JCS_RGB:
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
if (cinfo->num_components != 3)
ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
- if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3)
+ if (cinfo->in_color_space == cinfo->jpeg_color_space &&
+ rgb_pixelsize[cinfo->in_color_space] == 3)
cconvert->pub.color_convert = null_convert;
else
ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
@@ -419,9 +517,19 @@
case JCS_YCbCr:
if (cinfo->num_components != 3)
ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
- if (cinfo->in_color_space == JCS_RGB) {
- cconvert->pub.start_pass = rgb_ycc_start;
- cconvert->pub.color_convert = rgb_ycc_convert;
+ if (cinfo->in_color_space == JCS_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGBX ||
+ cinfo->in_color_space == JCS_EXT_BGR ||
+ cinfo->in_color_space == JCS_EXT_BGRX ||
+ cinfo->in_color_space == JCS_EXT_XBGR ||
+ cinfo->in_color_space == JCS_EXT_XRGB) {
+ if (jsimd_can_rgb_ycc())
+ cconvert->pub.color_convert = jsimd_rgb_ycc_convert;
+ else {
+ cconvert->pub.start_pass = rgb_ycc_start;
+ cconvert->pub.color_convert = rgb_ycc_convert;
+ }
} else if (cinfo->in_color_space == JCS_YCbCr)
cconvert->pub.color_convert = null_convert;
else
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 61fa79b..156957a 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -2,6 +2,8 @@
* jcdctmgr.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -15,15 +17,35 @@
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
+#include "jsimddct.h"
/* Private subobject for this module */
+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data)); /* integer DCT routine, called on the workspace */
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data)); /* floating-point DCT routine, called on the workspace */
+
+typedef JMETHOD(void, convsamp_method_ptr,
+ (JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM * workspace)); /* loads samples into workspace ahead of the integer DCT */
+typedef JMETHOD(void, float_convsamp_method_ptr,
+ (JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)); /* loads samples into workspace ahead of the float DCT */
+
+typedef JMETHOD(void, quantize_method_ptr,
+ (JCOEFPTR coef_block, DCTELEM * divisors,
+ DCTELEM * workspace)); /* turns integer DCT output into coef_block entries */
+typedef JMETHOD(void, float_quantize_method_ptr,
+ (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace)); /* turns float DCT output into coef_block entries */
+
typedef struct {
struct jpeg_forward_dct pub; /* public fields */
/* Pointer to the DCT routine actually in use */
- forward_DCT_method_ptr do_dct;
+ forward_DCT_method_ptr dct;
+ convsamp_method_ptr convsamp;
+ quantize_method_ptr quantize;
/* The actual post-DCT divisors --- not identical to the quant table
* entries, because of scaling (especially for an unnormalized DCT).
@@ -31,10 +53,16 @@
*/
DCTELEM * divisors[NUM_QUANT_TBLS];
+ /* work area for FDCT subroutine */
+ DCTELEM * workspace;
+
#ifdef DCT_FLOAT_SUPPORTED
/* Same as above for the floating-point case. */
- float_DCT_method_ptr do_float_dct;
+ float_DCT_method_ptr float_dct;
+ float_convsamp_method_ptr float_convsamp;
+ float_quantize_method_ptr float_quantize;
FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
+ FAST_FLOAT * float_workspace;
#endif
} my_fdct_controller;
@@ -42,6 +70,128 @@
/*
+ * Find the highest bit in an integer through binary search.
+ */
+LOCAL(int)
+flss (UINT16 val)
+{
+ int bit;
+
+ bit = 16; /* start at the top position; narrowed down by the tests below */
+
+ if (!val)
+ return 0; /* no bit set at all */
+
+ if (!(val & 0xff00)) { /* top 8 bits clear: the answer lies in the low byte */
+ bit -= 8;
+ val <<= 8; /* slide the untested bits up so the next mask sees them */
+ }
+ if (!(val & 0xf000)) { /* same test on the top 4 bits */
+ bit -= 4;
+ val <<= 4;
+ }
+ if (!(val & 0xc000)) { /* ... on the top 2 bits */
+ bit -= 2;
+ val <<= 2;
+ }
+ if (!(val & 0x8000)) { /* ... on the top bit */
+ bit -= 1;
+ val <<= 1;
+ }
+
+ return bit; /* 1-based position of the highest set bit (1 for val == 1, 16 for 0x8000) */
+}
+
+/*
+ * Compute values to do a division using reciprocal.
+ *
+ * This implementation is based on an algorithm described in
+ * "How to optimize for the Pentium family of microprocessors"
+ * (http://www.agner.org/assem/).
+ * More information about the basic algorithm can be found in
+ * the paper "Integer Division Using Reciprocals" by Robert Alverson.
+ *
+ * The basic idea is to replace x/d by x * d^-1. In order to store
+ * d^-1 with enough precision we shift it left a few places. It turns
+ * out that this algorithm gives just enough precision, and also fits
+ * into DCTELEM:
+ *
+ * b = (the number of significant bits in divisor) - 1
+ * r = (word size) + b
+ * f = 2^r / divisor
+ *
+ * f will not be an integer for most cases, so we need to compensate
+ * for the rounding error introduced:
+ *
+ * no fractional part:
+ *
+ * result = input >> r
+ *
+ * fractional part of f < 0.5:
+ *
+ * round f down to nearest integer
+ * result = ((input + 1) * f) >> r
+ *
+ * fractional part of f > 0.5:
+ *
+ * round f up to nearest integer
+ * result = (input * f) >> r
+ *
+ * This is the original algorithm that gives truncated results. But we
+ * want properly rounded results, so we replace "input" with
+ * "input + divisor/2".
+ *
+ * In order to allow SIMD implementations we also tweak the values to
+ * allow the same calculation to be made at all times:
+ *
+ * dctbl[0] = f rounded to nearest integer
+ * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
+ * dctbl[2] = 1 << ((word size) * 2 - r)
+ * dctbl[3] = r - (word size)
+ *
+ * dctbl[2] is for stupid instruction sets where the shift operation
+ * isn't member wise (e.g. MMX).
+ *
+ * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
+ * is that most SIMD implementations have a "multiply and store top
+ * half" operation.
+ *
+ * Lastly, we store each of the values in their own table instead
+ * of in a consecutive manner, yet again in order to allow SIMD
+ * routines.
+ */
+LOCAL(void)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+{
+ UDCTELEM2 fq, fr;
+ UDCTELEM c;
+ int b, r;
+
+ b = flss(divisor) - 1; /* zero-based index of divisor's highest set bit */
+ r = sizeof(DCTELEM) * 8 + b; /* (word size) + b */
+
+ fq = ((UDCTELEM2)1 << r) / divisor; /* integer part of f = 2^r / divisor; NOTE(review): divisor == 0 would divide by zero - confirm callers never pass 0 */
+ fr = ((UDCTELEM2)1 << r) % divisor; /* remainder, used below to judge f's fractional part */
+
+ c = divisor / 2; /* for rounding */
+
+ if (fr == 0) { /* divisor is power of two */
+ /* fq will be one bit too large to fit in DCTELEM, so adjust */
+ fq >>= 1;
+ r--; /* halving fq and dropping one shift bit keeps fq / 2^r unchanged */
+ } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
+ c++; /* f is left rounded down; the extra 1 in the correction compensates */
+ } else { /* fractional part is > 0.5 */
+ fq++; /* round f up */
+ }
+
+ dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
+ dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
+ dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift; NOTE(review): the cast binds to r only, the subtraction happens in size_t before the store */
+}
+
+/*
* Initialize for a processing pass.
* Verify that all referenced Q-tables are present, and set up
* the divisor table for each one.
@@ -78,11 +228,11 @@
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- DCTSIZE2 * SIZEOF(DCTELEM));
+ (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
- dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+ compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
}
break;
#endif
@@ -112,14 +262,14 @@
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- DCTSIZE2 * SIZEOF(DCTELEM));
+ (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
- dtbl[i] = (DCTELEM)
+ compute_reciprocal(
DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
(INT32) aanscales[i]),
- CONST_BITS-3);
+ CONST_BITS-3), &dtbl[i]);
}
}
break;
@@ -169,6 +319,77 @@
/*
+ * Load data into workspace, applying unsigned->signed conversion.
+ */
+
+METHODDEF(void)
+convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
+{
+ register DCTELEM *workspaceptr; /* write cursor, advances linearly through workspace */
+ register JSAMPROW elemptr; /* read cursor within the current sample row */
+ register int elemr; /* row counter, 0..DCTSIZE-1 */
+
+ workspaceptr = workspace;
+ for (elemr = 0; elemr < DCTSIZE; elemr++) {
+ elemptr = sample_data[elemr] + start_col; /* row elemr, beginning at column start_col */
+
+#if DCTSIZE == 8 /* unroll the inner loop */
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; /* subtract CENTERJSAMPLE from each sample */
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+#else
+ {
+ register int elemc; /* counts down the DCTSIZE samples of this row */
+ for (elemc = DCTSIZE; elemc > 0; elemc--)
+ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ }
+#endif
+ }
+}
+
+
+/*
+ * Quantize/descale the coefficients, and store into coef_blocks[].
+ */
+
+METHODDEF(void)
+quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
+{
+ int i;
+ DCTELEM temp;
+ UDCTELEM recip, corr, shift;
+ UDCTELEM2 product; /* double-width so the multiply below keeps all its bits */
+ JCOEFPTR output_ptr = coef_block;
+
+ for (i = 0; i < DCTSIZE2; i++) {
+ temp = workspace[i];
+ recip = divisors[i + DCTSIZE2 * 0]; /* sub-table 0: reciprocal */
+ corr = divisors[i + DCTSIZE2 * 1]; /* sub-table 1: correction + rounding term */
+ shift = divisors[i + DCTSIZE2 * 3]; /* sub-table 3: shift; sub-table 2 is not read here */
+
+ if (temp < 0) { /* operate on the magnitude, restore the sign afterwards */
+ temp = -temp;
+ product = (UDCTELEM2)(temp + corr) * recip;
+ product >>= shift + sizeof(DCTELEM)*8; /* add the word size on top of the stored shift */
+ temp = product;
+ temp = -temp;
+ } else {
+ product = (UDCTELEM2)(temp + corr) * recip;
+ product >>= shift + sizeof(DCTELEM)*8;
+ temp = product;
+ }
+
+ output_ptr[i] = (JCOEF) temp;
+ }
+}
+
+
+/*
* Perform forward DCT on one or more blocks of a component.
*
* The input samples are taken from the sample_data[] array starting at
@@ -185,87 +406,87 @@
{
/* This routine is heavily used, so it's worth coding it tightly. */
my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
- forward_DCT_method_ptr do_dct = fdct->do_dct;
DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
- DCTELEM workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+ DCTELEM * workspace;
JDIMENSION bi;
+ /* Make sure the compiler doesn't look up these every pass */
+ forward_DCT_method_ptr do_dct = fdct->dct;
+ convsamp_method_ptr do_convsamp = fdct->convsamp;
+ quantize_method_ptr do_quantize = fdct->quantize;
+ workspace = fdct->workspace;
+
sample_data += start_row; /* fold in the vertical offset once */
for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
/* Load data into workspace, applying unsigned->signed conversion */
- { register DCTELEM *workspaceptr;
- register JSAMPROW elemptr;
- register int elemr;
-
- workspaceptr = workspace;
- for (elemr = 0; elemr < DCTSIZE; elemr++) {
- elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
- { register int elemc;
- for (elemc = DCTSIZE; elemc > 0; elemc--) {
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- }
- }
-#endif
- }
- }
+ (*do_convsamp) (sample_data, start_col, workspace);
/* Perform the DCT */
(*do_dct) (workspace);
/* Quantize/descale the coefficients, and store into coef_blocks[] */
- { register DCTELEM temp, qval;
- register int i;
- register JCOEFPTR output_ptr = coef_blocks[bi];
-
- for (i = 0; i < DCTSIZE2; i++) {
- qval = divisors[i];
- temp = workspace[i];
- /* Divide the coefficient value by qval, ensuring proper rounding.
- * Since C does not specify the direction of rounding for negative
- * quotients, we have to force the dividend positive for portability.
- *
- * In most files, at least half of the output values will be zero
- * (at default quantization settings, more like three-quarters...)
- * so we should ensure that this case is fast. On many machines,
- * a comparison is enough cheaper than a divide to make a special test
- * a win. Since both inputs will be nonnegative, we need only test
- * for a < b to discover whether a/b is 0.
- * If your machine's division is fast enough, define FAST_DIVIDE.
- */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b) a /= b
-#else
-#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
-#endif
- if (temp < 0) {
- temp = -temp;
- temp += qval>>1; /* for rounding */
- DIVIDE_BY(temp, qval);
- temp = -temp;
- } else {
- temp += qval>>1; /* for rounding */
- DIVIDE_BY(temp, qval);
- }
- output_ptr[i] = (JCOEF) temp;
- }
- }
+ (*do_quantize) (coef_blocks[bi], divisors, workspace);
}
}
#ifdef DCT_FLOAT_SUPPORTED
+
+METHODDEF(void) /* floating-point sample loader: fills workspace with DCTSIZE rows of samples minus CENTERJSAMPLE */
+convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
+{
+ register FAST_FLOAT *workspaceptr; /* write cursor, advances linearly through workspace */
+ register JSAMPROW elemptr; /* read cursor within the current sample row */
+ register int elemr; /* row counter, 0..DCTSIZE-1 */
+
+ workspaceptr = workspace;
+ for (elemr = 0; elemr < DCTSIZE; elemr++) {
+ elemptr = sample_data[elemr] + start_col; /* row elemr, beginning at column start_col */
+#if DCTSIZE == 8 /* unroll the inner loop */
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); /* subtraction is done on integers, then converted */
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+#else
+ {
+ register int elemc; /* counts down the DCTSIZE samples of this row */
+ for (elemc = DCTSIZE; elemc > 0; elemc--)
+ *workspaceptr++ = (FAST_FLOAT)
+ (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ }
+#endif
+ }
+}
+
+
+METHODDEF(void) /* floating-point quantizer: scales each workspace value and stores it, rounded, in coef_block */
+quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
+{
+ register FAST_FLOAT temp;
+ register int i;
+ register JCOEFPTR output_ptr = coef_block;
+
+ for (i = 0; i < DCTSIZE2; i++) {
+ /* Apply the quantization and scaling factor */
+ temp = workspace[i] * divisors[i]; /* a multiply, not a divide: divisors[] is used as a per-coefficient factor */
+
+ /* Round to nearest integer.
+ * Since C does not specify the direction of rounding for negative
+ * quotients, we have to force the dividend positive for portability.
+ * The maximum coefficient size is +-16K (for 12-bit data), so this
+ * code should work for either 16-bit or 32-bit ints.
+ */
+ output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384); /* bias by 16384 (+0.5 to round), truncate, then remove the bias */
+ }
+}
+
+
METHODDEF(void)
forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
@@ -275,62 +496,28 @@
{
/* This routine is heavily used, so it's worth coding it tightly. */
my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
- float_DCT_method_ptr do_dct = fdct->do_float_dct;
FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
- FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+ FAST_FLOAT * workspace;
JDIMENSION bi;
+
+ /* Make sure the compiler doesn't look up these every pass */
+ float_DCT_method_ptr do_dct = fdct->float_dct;
+ float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
+ float_quantize_method_ptr do_quantize = fdct->float_quantize;
+ workspace = fdct->float_workspace;
+
sample_data += start_row; /* fold in the vertical offset once */
for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
/* Load data into workspace, applying unsigned->signed conversion */
- { register FAST_FLOAT *workspaceptr;
- register JSAMPROW elemptr;
- register int elemr;
-
- workspaceptr = workspace;
- for (elemr = 0; elemr < DCTSIZE; elemr++) {
- elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
- { register int elemc;
- for (elemc = DCTSIZE; elemc > 0; elemc--) {
- *workspaceptr++ = (FAST_FLOAT)
- (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- }
- }
-#endif
- }
- }
+ (*do_convsamp) (sample_data, start_col, workspace);
/* Perform the DCT */
(*do_dct) (workspace);
/* Quantize/descale the coefficients, and store into coef_blocks[] */
- { register FAST_FLOAT temp;
- register int i;
- register JCOEFPTR output_ptr = coef_blocks[bi];
-
- for (i = 0; i < DCTSIZE2; i++) {
- /* Apply the quantization and scaling factor */
- temp = workspace[i] * divisors[i];
- /* Round to nearest integer.
- * Since C does not specify the direction of rounding for negative
- * quotients, we have to force the dividend positive for portability.
- * The maximum coefficient size is +-16K (for 12-bit data), so this
- * code should work for either 16-bit or 32-bit ints.
- */
- output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
- }
- }
+ (*do_quantize) (coef_blocks[bi], divisors, workspace);
}
}
@@ -353,23 +540,33 @@
cinfo->fdct = (struct jpeg_forward_dct *) fdct;
fdct->pub.start_pass = start_pass_fdctmgr;
+ /* First determine the DCT... */
switch (cinfo->dct_method) {
#ifdef DCT_ISLOW_SUPPORTED
case JDCT_ISLOW:
fdct->pub.forward_DCT = forward_DCT;
- fdct->do_dct = jpeg_fdct_islow;
+ if (jsimd_can_fdct_islow())
+ fdct->dct = jsimd_fdct_islow;
+ else
+ fdct->dct = jpeg_fdct_islow;
break;
#endif
#ifdef DCT_IFAST_SUPPORTED
case JDCT_IFAST:
fdct->pub.forward_DCT = forward_DCT;
- fdct->do_dct = jpeg_fdct_ifast;
+ if (jsimd_can_fdct_ifast())
+ fdct->dct = jsimd_fdct_ifast;
+ else
+ fdct->dct = jpeg_fdct_ifast;
break;
#endif
#ifdef DCT_FLOAT_SUPPORTED
case JDCT_FLOAT:
fdct->pub.forward_DCT = forward_DCT_float;
- fdct->do_float_dct = jpeg_fdct_float;
+ if (jsimd_can_fdct_float())
+ fdct->float_dct = jsimd_fdct_float;
+ else
+ fdct->float_dct = jpeg_fdct_float;
break;
#endif
default:
@@ -377,6 +574,54 @@
break;
}
+ /* ...then the supporting stages. */
+ switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+ case JDCT_ISLOW:
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+ case JDCT_IFAST:
+#endif
+#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
+ if (jsimd_can_convsamp())
+ fdct->convsamp = jsimd_convsamp;
+ else
+ fdct->convsamp = convsamp;
+ if (jsimd_can_quantize())
+ fdct->quantize = jsimd_quantize;
+ else
+ fdct->quantize = quantize;
+ break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+ case JDCT_FLOAT:
+ if (jsimd_can_convsamp_float())
+ fdct->float_convsamp = jsimd_convsamp_float;
+ else
+ fdct->float_convsamp = convsamp_float;
+ if (jsimd_can_quantize_float())
+ fdct->float_quantize = jsimd_quantize_float;
+ else
+ fdct->float_quantize = quantize_float;
+ break;
+#endif
+ default:
+ ERREXIT(cinfo, JERR_NOT_COMPILED);
+ break;
+ }
+
+ /* Allocate workspace memory */
+#ifdef DCT_FLOAT_SUPPORTED
+ if (cinfo->dct_method == JDCT_FLOAT)
+ fdct->float_workspace = (FAST_FLOAT *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ SIZEOF(FAST_FLOAT) * DCTSIZE2);
+ else
+#endif
+ fdct->workspace = (DCTELEM *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ SIZEOF(DCTELEM) * DCTSIZE2);
+
/* Mark divisor tables unallocated */
for (i = 0; i < NUM_QUANT_TBLS; i++) {
fdct->divisors[i] = NULL;
diff --git a/jchuff.c b/jchuff.c
index f235250..daf0736 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -14,11 +14,33 @@
* permanent JPEG objects only upon successful completion of an MCU.
*/
+/* Modifications:
+ * Copyright (C)2007 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jchuff.h" /* Declarations shared with jcphuff.c */
+#include <limits.h>
+static unsigned char jpeg_first_bit_table[65536];
+int jpeg_first_bit_table_init=0;
+
+#ifndef min
+ #define min(a,b) ((a)<(b)?(a):(b))
+#endif
/* Expanded entropy encoder object for Huffman encoding.
*
@@ -27,7 +49,7 @@
*/
typedef struct {
- INT32 put_buffer; /* current bit-accumulation buffer */
+ long put_buffer; /* current bit-accumulation buffer */
int put_bits; /* # of bits now in it */
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
@@ -159,6 +181,7 @@
}
/* Initialize bit buffer to empty */
+
entropy->saved.put_buffer = 0;
entropy->saved.put_bits = 0;
@@ -261,6 +284,15 @@
dtbl->ehufco[i] = huffcode[p];
dtbl->ehufsi[i] = huffsize[p];
}
+
+ if(!jpeg_first_bit_table_init) {
+ for(i = 0; i < 65536; i++) {
+ int bit = 0, val = i;
+ while (val) {val >>= 1; bit++;}
+ jpeg_first_bit_table[i] = bit;
+ }
+ jpeg_first_bit_table_init = 1;
+ }
}
@@ -280,6 +312,8 @@
{
struct jpeg_destination_mgr * dest = state->cinfo->dest;
+ dest->free_in_buffer = state->free_in_buffer;
+
if (! (*dest->empty_output_buffer) (state->cinfo))
return FALSE;
/* After a successful buffer dump, must reset buffer pointers */
@@ -297,147 +331,250 @@
* between calls, so 24 bits are sufficient.
*/
-INLINE
-LOCAL(boolean)
-emit_bits (working_state * state, unsigned int code, int size)
-/* Emit some bits; return TRUE if successful, FALSE if must suspend */
-{
- /* This routine is heavily used, so it's worth coding tightly. */
- register INT32 put_buffer = (INT32) code;
- register int put_bits = state->cur.put_bits;
+/***************************************************************/
- /* if size is 0, caller used an invalid Huffman table entry */
- if (size == 0)
- ERREXIT(state->cinfo, JERR_HUFF_MISSING_CODE);
+#define EMIT_BYTE() { \
+ if (0xFF == (*buffer++ = (unsigned char)(put_buffer >> (put_bits -= 8)))) \
+ *buffer++ = 0; \
+ }
- put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
-
- put_bits += size; /* new number of bits in buffer */
-
- put_buffer <<= 24 - put_bits; /* align incoming bits */
+/***************************************************************/
- put_buffer |= state->cur.put_buffer; /* and merge with old buffer contents */
-
- while (put_bits >= 8) {
- int c = (int) ((put_buffer >> 16) & 0xFF);
-
- emit_byte(state, c, return FALSE);
- if (c == 0xFF) { /* need to stuff a zero byte? */
- emit_byte(state, 0, return FALSE);
- }
- put_buffer <<= 8;
- put_bits -= 8;
- }
+#define DUMP_BITS_(code, size) { \
+ put_bits += size; \
+ put_buffer = (put_buffer << size) | code; \
+ if (put_bits > 7) \
+ while(put_bits > 7) \
+ EMIT_BYTE() \
+ }
- state->cur.put_buffer = put_buffer; /* update state variables */
- state->cur.put_bits = put_bits;
+/***************************************************************/
- return TRUE;
+#define CHECKBUF15() { \
+ if (put_bits > 15) { \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ } \
}
+#define CHECKBUF47() { \
+ if (put_bits > 47) { \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ } \
+}
+
+#define CHECKBUF31() { \
+ if (put_bits > 31) { \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ EMIT_BYTE() \
+ } \
+}
+
+/***************************************************************/
+
+#define DUMP_BITS_NOCHECK(code, size) { \
+ put_bits += size; \
+ put_buffer = (put_buffer << size) | code; \
+ }
+
+#if __WORDSIZE==64
+
+#define DUMP_BITS(code, size) { \
+ CHECKBUF47() \
+ put_bits += size; \
+ put_buffer = (put_buffer << size) | code; \
+ }
+
+#else
+
+#define DUMP_BITS(code, size) { \
+ put_bits += size; \
+ put_buffer = (put_buffer << size) | code; \
+ CHECKBUF15() \
+ }
+
+#endif
+
+/***************************************************************/
+
+#define DUMP_SINGLE_VALUE(ht, codevalue) { \
+ size = ht->ehufsi[codevalue]; \
+ code = ht->ehufco[codevalue]; \
+ \
+ DUMP_BITS(code, size) \
+ }
+
+/***************************************************************/
+
+#define DUMP_VALUE_SLOW(ht, codevalue, t, nbits) { \
+ size = ht->ehufsi[codevalue]; \
+ code = ht->ehufco[codevalue]; \
+ t &= ~(-1 << nbits); \
+ DUMP_BITS_NOCHECK(code, size) \
+ CHECKBUF15() \
+ DUMP_BITS_NOCHECK(t, nbits) \
+ CHECKBUF15() \
+ }
+
+int _max=0;
+
+#if __WORDSIZE==64
+
+#define DUMP_VALUE(ht, codevalue, t, nbits) { \
+ size = ht->ehufsi[codevalue]; \
+ code = ht->ehufco[codevalue]; \
+ t &= ~(-1 << nbits); \
+ CHECKBUF31() \
+ DUMP_BITS_NOCHECK(code, size) \
+ DUMP_BITS_NOCHECK(t, nbits) \
+ }
+
+#else
+
+#define DUMP_VALUE(ht, codevalue, t, nbits) { \
+ size = ht->ehufsi[codevalue]; \
+ code = ht->ehufco[codevalue]; \
+ t &= ~(-1 << nbits); \
+ DUMP_BITS_NOCHECK(code, size) \
+ CHECKBUF15() \
+ DUMP_BITS_NOCHECK(t, nbits) \
+ CHECKBUF15() \
+ }
+
+#endif
+
+/***************************************************************/
+
+#define BUFSIZE (DCTSIZE2 * 2)
+
+#define LOAD_BUFFER() { \
+ if (state->free_in_buffer < BUFSIZE) { \
+ localbuf = 1; \
+ buffer = _buffer; \
+ } \
+ else buffer = state->next_output_byte; \
+ }
+
+#define STORE_BUFFER() { \
+ if (localbuf) { \
+ bytes = buffer - _buffer; \
+ buffer = _buffer; \
+ while (bytes > 0) { \
+ bytestocopy = min(bytes, state->free_in_buffer); \
+ MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
+ state->next_output_byte += bytestocopy; \
+ buffer += bytestocopy; \
+ state->free_in_buffer -= bytestocopy; \
+ if (state->free_in_buffer == 0) \
+ if (! dump_buffer(state)) return FALSE; \
+ bytes -= bytestocopy; \
+ } \
+ } \
+ else { \
+ state->free_in_buffer -= (buffer - state->next_output_byte); \
+ state->next_output_byte = buffer; \
+ } \
+ }
+
+/***************************************************************/
LOCAL(boolean)
flush_bits (working_state * state)
{
- if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
- return FALSE;
+ unsigned char _buffer[BUFSIZE], *buffer;
+ long put_buffer; int put_bits;
+ int bytes, bytestocopy, localbuf = 0;
+
+ put_buffer = state->cur.put_buffer;
+ put_bits = state->cur.put_bits;
+ LOAD_BUFFER()
+
+ DUMP_BITS_(0x7F, 7)
+
state->cur.put_buffer = 0; /* and reset bit-buffer to empty */
state->cur.put_bits = 0;
+ STORE_BUFFER()
+
return TRUE;
}
-
/* Encode a single block's worth of coefficients */
LOCAL(boolean)
encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl, c_derived_tbl *actbl)
{
- register int temp, temp2;
- register int nbits;
- register int k, r, i;
-
+ int temp, temp2;
+ int nbits;
+ int r, sflag, size, code;
+ unsigned char _buffer[BUFSIZE], *buffer;
+ long put_buffer; int put_bits;
+ int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
+ int bytes, bytestocopy, localbuf = 0;
+
+ put_buffer = state->cur.put_buffer;
+ put_bits = state->cur.put_bits;
+ LOAD_BUFFER()
+
/* Encode the DC coefficient difference per section F.1.2.1 */
temp = temp2 = block[0] - last_dc_val;
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- /* For a negative input, want temp2 = bitwise complement of abs(input) */
- /* This code assumes we are on a two's complement machine */
- temp2--;
- }
-
- /* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 0;
- while (temp) {
- nbits++;
- temp >>= 1;
- }
- /* Check for out-of-range coefficient values.
- * Since we're encoding a difference, the range limit is twice as much.
- */
- if (nbits > MAX_COEF_BITS+1)
- ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-
- /* Emit the Huffman-coded symbol for the number of bits */
- if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
- return FALSE;
-
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- if (nbits) /* emit_bits rejects calls with size 0 */
- if (! emit_bits(state, (unsigned int) temp2, nbits))
- return FALSE;
+ sflag = temp >> 31;
+ temp -= ((temp + temp) & sflag);
+ temp2 += sflag;
+ nbits = jpeg_first_bit_table[temp];
+ DUMP_VALUE_SLOW(dctbl, nbits, temp2, nbits)
/* Encode the AC coefficients per section F.1.2.2 */
r = 0; /* r = run length of zeros */
-
- for (k = 1; k < DCTSIZE2; k++) {
- if ((temp = block[jpeg_natural_order[k]]) == 0) {
- r++;
- } else {
- /* if run length > 15, must emit special run-length-16 codes (0xF0) */
- while (r > 15) {
- if (! emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
- return FALSE;
- r -= 16;
- }
- temp2 = temp;
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- /* This code assumes we are on a two's complement machine */
- temp2--;
- }
-
- /* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 1; /* there must be at least one 1 bit */
- while ((temp >>= 1))
- nbits++;
- /* Check for out-of-range coefficient values */
- if (nbits > MAX_COEF_BITS)
- ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-
- /* Emit Huffman symbol for run length / number of bits */
- i = (r << 4) + nbits;
- if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i]))
- return FALSE;
+#define innerloop(order) { \
+ temp2 = *(JCOEF*)((unsigned char*)block + order); \
+ if(temp2 == 0) r++; \
+ else { \
+ temp = (JCOEF)temp2; \
+ sflag = temp >> 31; \
+ temp = (temp ^ sflag) - sflag; \
+ temp2 += sflag; \
+ nbits = jpeg_first_bit_table[temp]; \
+ for(; r > 15; r -= 16) DUMP_BITS(code_0xf0, size_0xf0) \
+ sflag = (r << 4) + nbits; \
+ DUMP_VALUE(actbl, sflag, temp2, nbits) \
+ r = 0; \
+ }}
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- if (! emit_bits(state, (unsigned int) temp2, nbits))
- return FALSE;
-
- r = 0;
- }
- }
+ innerloop(2*1); innerloop(2*8); innerloop(2*16); innerloop(2*9);
+ innerloop(2*2); innerloop(2*3); innerloop(2*10); innerloop(2*17);
+ innerloop(2*24); innerloop(2*32); innerloop(2*25); innerloop(2*18);
+ innerloop(2*11); innerloop(2*4); innerloop(2*5); innerloop(2*12);
+ innerloop(2*19); innerloop(2*26); innerloop(2*33); innerloop(2*40);
+ innerloop(2*48); innerloop(2*41); innerloop(2*34); innerloop(2*27);
+ innerloop(2*20); innerloop(2*13); innerloop(2*6); innerloop(2*7);
+ innerloop(2*14); innerloop(2*21); innerloop(2*28); innerloop(2*35);
+ innerloop(2*42); innerloop(2*49); innerloop(2*56); innerloop(2*57);
+ innerloop(2*50); innerloop(2*43); innerloop(2*36); innerloop(2*29);
+ innerloop(2*22); innerloop(2*15); innerloop(2*23); innerloop(2*30);
+ innerloop(2*37); innerloop(2*44); innerloop(2*51); innerloop(2*58);
+ innerloop(2*59); innerloop(2*52); innerloop(2*45); innerloop(2*38);
+ innerloop(2*31); innerloop(2*39); innerloop(2*46); innerloop(2*53);
+ innerloop(2*60); innerloop(2*61); innerloop(2*54); innerloop(2*47);
+ innerloop(2*55); innerloop(2*62); innerloop(2*63);
/* If the last coef(s) were zero, emit an end-of-block code */
- if (r > 0)
- if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0]))
- return FALSE;
+ if (r > 0) DUMP_SINGLE_VALUE(actbl, 0x0)
+
+ state->cur.put_buffer = put_buffer;
+ state->cur.put_bits = put_bits;
+ STORE_BUFFER()
return TRUE;
}
diff --git a/jconfig.bcc b/jconfig.bcc
deleted file mode 100644
index c6c53ff..0000000
--- a/jconfig.bcc
+++ /dev/null
@@ -1,48 +0,0 @@
-/* jconfig.bcc --- jconfig.h for Borland C (Turbo C) on MS-DOS or OS/2. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#ifdef __MSDOS__
-#define NEED_FAR_POINTERS /* for small or medium memory model */
-#endif
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN /* this assumes you have -w-stu in CFLAGS */
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#ifdef __MSDOS__
-#define USE_MSDOS_MEMMGR /* Define this if you use jmemdos.c */
-#define MAX_ALLOC_CHUNK 65520L /* Maximum request to malloc() */
-#define USE_FMEM /* Borland has _fmemcpy() and _fmemset() */
-#endif
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define USE_SETMODE /* Borland has setmode() */
-#ifdef __MSDOS__
-#define NEED_SIGNAL_CATCHER /* Define this if you use jmemdos.c */
-#endif
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.cfg b/jconfig.cfg
deleted file mode 100644
index 36a04fa..0000000
--- a/jconfig.cfg
+++ /dev/null
@@ -1,44 +0,0 @@
-/* jconfig.cfg --- source file edited by configure script */
-/* see jconfig.doc for explanations */
-
-#undef HAVE_PROTOTYPES
-#undef HAVE_UNSIGNED_CHAR
-#undef HAVE_UNSIGNED_SHORT
-#undef void
-#undef const
-#undef CHAR_IS_UNSIGNED
-#undef HAVE_STDDEF_H
-#undef HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-/* Define this if you get warnings about undefined structures. */
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-#undef INLINE
-/* These are for configuring the JPEG memory manager. */
-#undef DEFAULT_MAX_MEM
-#undef NO_MKTEMP
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-
-/* Define this if you want percent-done progress reports from cjpeg/djpeg. */
-#undef PROGRESS_REPORT
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.dj b/jconfig.dj
deleted file mode 100644
index f759a9d..0000000
--- a/jconfig.dj
+++ /dev/null
@@ -1,38 +0,0 @@
-/* jconfig.dj --- jconfig.h for DJGPP (Delorie's GNU C port) on MS-DOS. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS /* DJGPP uses flat 32-bit addressing */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE /* optional */
-#define USE_SETMODE /* Needed to make one-file style work in DJGPP */
-#undef NEED_SIGNAL_CATCHER /* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.h.in b/jconfig.h.in
new file mode 100644
index 0000000..4e5e80e
--- /dev/null
+++ b/jconfig.h.in
@@ -0,0 +1,49 @@
+/* Define if your compiler supports prototypes */
+#undef HAVE_PROTOTYPES
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#undef HAVE_STDDEF_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#undef HAVE_UNSIGNED_CHAR
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#undef HAVE_UNSIGNED_SHORT
+
+/* Define if you get warnings about undefined structures (incomplete types) */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define if you have BSD-like bzero and bcopy */
+#undef NEED_BSD_STRINGS
+
+/* Define if you need short function names */
+#undef NEED_SHORT_EXTERNAL_NAMES
+
+/* Define if you need to include sys/types.h */
+#undef NEED_SYS_TYPES_H
+
+/* Define if right shift of a signed value is unsigned */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+/* Use accelerated SIMD routines. */
+#undef WITH_SIMD
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc. */
+#ifndef __CHAR_UNSIGNED__
+# undef __CHAR_UNSIGNED__
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+ calls it, or to nothing if 'inline' is not supported under any name. */
+#ifndef __cplusplus
+#undef inline
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t
diff --git a/jconfig.mac b/jconfig.mac
deleted file mode 100644
index 0de3efe..0000000
--- a/jconfig.mac
+++ /dev/null
@@ -1,43 +0,0 @@
-/* jconfig.mac --- jconfig.h for CodeWarrior on Apple Macintosh */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define USE_MAC_MEMMGR /* Define this if you use jmemmac.c */
-
-#define ALIGN_TYPE long /* Needed for 680x0 Macs */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#define USE_CCOMMAND /* Command line reader for Macintosh */
-#define TWO_FILE_COMMANDLINE /* Binary I/O thru stdin/stdout doesn't work */
-
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.manx b/jconfig.manx
deleted file mode 100644
index 6dd0d00..0000000
--- a/jconfig.manx
+++ /dev/null
@@ -1,43 +0,0 @@
-/* jconfig.manx --- jconfig.h for Amiga systems using Manx Aztec C ver 5.x. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define TEMP_DIRECTORY "JPEGTMP:" /* recommended setting for Amiga */
-
-#define SHORTxSHORT_32 /* produces better DCT code with Aztec C */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#define signal_catcher _abort /* hack for Aztec C naming requirements */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.mc6 b/jconfig.mc6
deleted file mode 100644
index c55082d..0000000
--- a/jconfig.mc6
+++ /dev/null
@@ -1,52 +0,0 @@
-/* jconfig.mc6 --- jconfig.h for Microsoft C on MS-DOS, version 6.00A & up. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#define NEED_FAR_POINTERS /* for small or medium memory model */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define USE_MSDOS_MEMMGR /* Define this if you use jmemdos.c */
-
-#define MAX_ALLOC_CHUNK 65520L /* Maximum request to malloc() */
-
-#define USE_FMEM /* Microsoft has _fmemcpy() and _fmemset() */
-
-#define NEED_FHEAPMIN /* far heap management routines are broken */
-
-#define SHORTxLCONST_32 /* enable compiler-specific DCT optimization */
-/* Note: the above define is known to improve the code with Microsoft C 6.00A.
- * I do not know whether it is good for later compiler versions.
- * Please report any info on this point to jpeg-info@uunet.uu.net.
- */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define USE_SETMODE /* Microsoft has setmode() */
-#define NEED_SIGNAL_CATCHER /* Define this if you use jmemdos.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.sas b/jconfig.sas
deleted file mode 100644
index efdac22..0000000
--- a/jconfig.sas
+++ /dev/null
@@ -1,43 +0,0 @@
-/* jconfig.sas --- jconfig.h for Amiga systems using SAS C 6.0 and up. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define TEMP_DIRECTORY "JPEGTMP:" /* recommended setting for Amiga */
-
-#define NO_MKTEMP /* SAS C doesn't have mktemp() */
-
-#define SHORTxSHORT_32 /* produces better DCT code with SAS C */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.st b/jconfig.st
deleted file mode 100644
index 4421b7a..0000000
--- a/jconfig.st
+++ /dev/null
@@ -1,42 +0,0 @@
-/* jconfig.st --- jconfig.h for Atari ST/STE/TT using Pure C or Turbo C. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#define INCOMPLETE_TYPES_BROKEN /* suppress undefined-structure warnings */
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define ALIGN_TYPE long /* apparently double is a weird size? */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE /* optional -- undef if you like Unix style */
-/* Note: if you undef TWO_FILE_COMMANDLINE, you may need to define
- * USE_SETMODE. Some Atari compilers require it, some do not.
- */
-#define NEED_SIGNAL_CATCHER /* needed if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.vms b/jconfig.vms
deleted file mode 100644
index 55a6ffb..0000000
--- a/jconfig.vms
+++ /dev/null
@@ -1,37 +0,0 @@
-/* jconfig.vms --- jconfig.h for use on Digital VMS. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE /* Needed on VMS */
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.wat b/jconfig.wat
deleted file mode 100644
index 6cc545b..0000000
--- a/jconfig.wat
+++ /dev/null
@@ -1,38 +0,0 @@
-/* jconfig.wat --- jconfig.h for Watcom C/C++ on MS-DOS or OS/2. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#define CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS /* Watcom uses flat 32-bit addressing */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED /* BMP image file format */
-#define GIF_SUPPORTED /* GIF image file format */
-#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED /* Utah RLE image file format */
-#define TARGA_SUPPORTED /* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE /* optional */
-#define USE_SETMODE /* Needed to make one-file style work in Watcom */
-#undef NEED_SIGNAL_CATCHER /* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT /* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jcparam.c b/jcparam.c
index 6fc48f5..a1d49d9 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -2,6 +2,7 @@
* jcparam.c
*
* Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -363,6 +364,12 @@
jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
break;
case JCS_RGB:
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
jpeg_set_colorspace(cinfo, JCS_YCbCr);
break;
case JCS_YCbCr:
diff --git a/jcphuff.c b/jcphuff.c
index 07f9178..3102871 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -223,7 +223,6 @@
* between calls, so 24 bits are sufficient.
*/
-INLINE
LOCAL(void)
emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
/* Emit some bits, unless we are in gather mode */
@@ -276,7 +275,6 @@
* Emit (or just count) a Huffman symbol.
*/
-INLINE
LOCAL(void)
emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
{
diff --git a/jcsample.c b/jcsample.c
index 212ec87..eea376f 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -2,6 +2,7 @@
* jcsample.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -48,6 +49,7 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jsimd.h"
/* Pointer to routine to downsample a single component */
@@ -494,7 +496,10 @@
} else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
compptr->v_samp_factor == cinfo->max_v_samp_factor) {
smoothok = FALSE;
- downsample->methods[ci] = h2v1_downsample;
+ if (jsimd_can_h2v1_downsample())
+ downsample->methods[ci] = jsimd_h2v1_downsample;
+ else
+ downsample->methods[ci] = h2v1_downsample;
} else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
#ifdef INPUT_SMOOTHING_SUPPORTED
@@ -503,7 +508,10 @@
downsample->pub.need_context_rows = TRUE;
} else
#endif
- downsample->methods[ci] = h2v2_downsample;
+ if (jsimd_can_h2v2_downsample())
+ downsample->methods[ci] = jsimd_h2v2_downsample;
+ else
+ downsample->methods[ci] = h2v2_downsample;
} else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
(cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
smoothok = FALSE;
diff --git a/jdcoefct.c b/jdcoefct.c
index 4938d20..f56af5f 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -47,6 +47,9 @@
*/
JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];
+ /* Temporary workspace for one MCU */
+ JCOEF * workspace;
+
#ifdef D_MULTISCAN_FILES_SUPPORTED
/* In multi-pass modes, we need a virtual block array for each component. */
jvirt_barray_ptr whole_image[MAX_COMPONENTS];
@@ -471,13 +474,16 @@
jpeg_component_info *compptr;
inverse_DCT_method_ptr inverse_DCT;
boolean first_row, last_row;
- JBLOCK workspace;
+ JCOEF * workspace;
int *coef_bits;
JQUANT_TBL *quanttbl;
INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
int Al, pred;
+ /* Keep a local variable to avoid looking it up more than once */
+ workspace = coef->workspace;
+
/* Force some input to be done if we are getting ahead of the input. */
while (cinfo->input_scan_number <= cinfo->output_scan_number &&
! cinfo->inputctl->eoi_reached) {
@@ -733,4 +739,9 @@
coef->pub.decompress_data = decompress_onepass;
coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
}
+
+ /* Allocate the workspace buffer */
+ coef->workspace = (JCOEF *)
+ (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ SIZEOF(JCOEF) * DCTSIZE2);
}
diff --git a/jdcolor.c b/jdcolor.c
index 6c04dfe..e02ea4f 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -2,6 +2,8 @@
* jdcolor.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -11,6 +13,7 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jsimd.h"
/* Private subobject */
@@ -146,12 +149,12 @@
cb = GETJSAMPLE(inptr1[col]);
cr = GETJSAMPLE(inptr2[col]);
/* Range-limiting is essential due to noise introduced by DCT losses. */
- outptr[RGB_RED] = range_limit[y + Crrtab[cr]];
- outptr[RGB_GREEN] = range_limit[y +
+ outptr[rgb_red[cinfo->out_color_space]] = range_limit[y + Crrtab[cr]];
+ outptr[rgb_green[cinfo->out_color_space]] = range_limit[y +
((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
- outptr[RGB_BLUE] = range_limit[y + Cbbtab[cb]];
- outptr += RGB_PIXELSIZE;
+ outptr[rgb_blue[cinfo->out_color_space]] = range_limit[y + Cbbtab[cb]];
+ outptr += rgb_pixelsize[cinfo->out_color_space];
}
}
}
@@ -219,16 +222,21 @@
JSAMPARRAY output_buf, int num_rows)
{
register JSAMPROW inptr, outptr;
+ JSAMPLE *maxinptr;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->output_width;
+ int rindex = rgb_red[cinfo->out_color_space];
+ int gindex = rgb_green[cinfo->out_color_space];
+ int bindex = rgb_blue[cinfo->out_color_space];
+ int rgbstride = rgb_pixelsize[cinfo->out_color_space];
while (--num_rows >= 0) {
inptr = input_buf[0][input_row++];
+ maxinptr = &inptr[num_cols];
outptr = *output_buf++;
- for (col = 0; col < num_cols; col++) {
+ for (; inptr < maxinptr; inptr++, outptr += rgbstride) {
/* We can dispense with GETJSAMPLE() here */
- outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
- outptr += RGB_PIXELSIZE;
+ outptr[rindex] = outptr[gindex] = outptr[bindex] = *inptr;
}
}
}
@@ -356,13 +364,24 @@
break;
case JCS_RGB:
- cinfo->out_color_components = RGB_PIXELSIZE;
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
+ cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
if (cinfo->jpeg_color_space == JCS_YCbCr) {
- cconvert->pub.color_convert = ycc_rgb_convert;
- build_ycc_rgb_table(cinfo);
+ if (jsimd_can_ycc_rgb())
+ cconvert->pub.color_convert = jsimd_ycc_rgb_convert;
+ else {
+ cconvert->pub.color_convert = ycc_rgb_convert;
+ build_ycc_rgb_table(cinfo);
+ }
} else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
cconvert->pub.color_convert = gray_rgb_convert;
- } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
+ } else if (cinfo->jpeg_color_space == cinfo->out_color_space &&
+ rgb_pixelsize[cinfo->out_color_space] == 3) {
cconvert->pub.color_convert = null_convert;
} else
ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
diff --git a/jdct.h b/jdct.h
index 04192a2..7b49a97 100644
--- a/jdct.h
+++ b/jdct.h
@@ -23,18 +23,26 @@
* have a range of +-8K for 8-bit data, +-128K for 12-bit data. This
* convention improves accuracy in integer implementations and saves some
* work in floating-point ones.
- * Quantization of the output coefficients is done by jcdctmgr.c.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
*/
#if BITS_IN_JSAMPLE == 8
+#ifndef WITH_SIMD
typedef int DCTELEM; /* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM; /* unsigned counterpart of DCTELEM, used by quantization */
+typedef unsigned long long UDCTELEM2; /* wider unsigned type ("twice the bits") for quantization */
+#else
+typedef short DCTELEM; /* prefer 16 bit with SIMD for parallelism */
+typedef unsigned short UDCTELEM; /* unsigned counterpart of DCTELEM */
+typedef unsigned int UDCTELEM2; /* wider unsigned type ("twice the bits") for quantization */
+#endif
#else
typedef INT32 DCTELEM; /* must have 32 bits */
+typedef UINT32 UDCTELEM; /* unsigned counterpart of DCTELEM */
+typedef unsigned long long UDCTELEM2; /* wider unsigned type ("twice the bits") for quantization */
#endif
-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
-
/*
* An inverse DCT routine is given a pointer to the input JBLOCK and a pointer
diff --git a/jddctmgr.c b/jddctmgr.c
index bbf8d0e..52f5090 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -2,6 +2,7 @@
* jddctmgr.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -19,6 +20,7 @@
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
+#include "jsimddct.h"
/*
@@ -105,11 +107,17 @@
method = JDCT_ISLOW; /* jidctred uses islow-style table */
break;
case 2:
- method_ptr = jpeg_idct_2x2;
+ if (jsimd_can_idct_2x2())
+ method_ptr = jsimd_idct_2x2;
+ else
+ method_ptr = jpeg_idct_2x2;
method = JDCT_ISLOW; /* jidctred uses islow-style table */
break;
case 4:
- method_ptr = jpeg_idct_4x4;
+ if (jsimd_can_idct_4x4())
+ method_ptr = jsimd_idct_4x4;
+ else
+ method_ptr = jpeg_idct_4x4;
method = JDCT_ISLOW; /* jidctred uses islow-style table */
break;
#endif
@@ -117,19 +125,28 @@
switch (cinfo->dct_method) {
#ifdef DCT_ISLOW_SUPPORTED
case JDCT_ISLOW:
- method_ptr = jpeg_idct_islow;
+ if (jsimd_can_idct_islow())
+ method_ptr = jsimd_idct_islow;
+ else
+ method_ptr = jpeg_idct_islow;
method = JDCT_ISLOW;
break;
#endif
#ifdef DCT_IFAST_SUPPORTED
case JDCT_IFAST:
- method_ptr = jpeg_idct_ifast;
+ if (jsimd_can_idct_ifast())
+ method_ptr = jsimd_idct_ifast;
+ else
+ method_ptr = jpeg_idct_ifast;
method = JDCT_IFAST;
break;
#endif
#ifdef DCT_FLOAT_SUPPORTED
case JDCT_FLOAT:
- method_ptr = jpeg_idct_float;
+ if (jsimd_can_idct_float())
+ method_ptr = jsimd_idct_float;
+ else
+ method_ptr = jpeg_idct_float;
method = JDCT_FLOAT;
break;
#endif
diff --git a/jdhuff.c b/jdhuff.c
index b5ba39f..18a0c7e 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -14,6 +14,21 @@
* storage only upon successful completion of an MCU.
*/
+/* Modifications:
+ * Copyright (C)2007 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
@@ -234,7 +249,8 @@
* with that code.
*/
- MEMZERO(dtbl->look_nbits, SIZEOF(dtbl->look_nbits));
+ for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+ dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
p = 0;
for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
@@ -243,8 +259,7 @@
/* Generate left-justified code followed by all possible bit sequences */
lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
- dtbl->look_nbits[lookbits] = l;
- dtbl->look_sym[lookbits] = htbl->huffval[p];
+ dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
lookbits++;
}
}
@@ -438,9 +453,10 @@
* On some machines, a shift and add will be faster than a table lookup.
*/
+#define AVOID_TABLES
#ifdef AVOID_TABLES
-#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
+#define HUFF_EXTEND(x,s) ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((-1)<<(s)) + 1)))
#else
@@ -498,6 +514,236 @@
}
+LOCAL(boolean)
+decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ BITREAD_STATE_VARS;
+ int blkn;
+ savable_state state;
+ /* Suspendable path: each HUFF_DECODE / CHECK_BIT_BUFFER may return FALSE; the permanent state is only written back after the whole MCU has been decoded */
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ ASSIGN_STATE(state, entropy->saved);
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { /* Outer loop handles each block in the MCU */
+ JBLOCKROW block = MCU_data[blkn];
+ d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
+ d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+ register int s, k, r;
+
+ /* Decode a single block's worth of coefficients */
+
+ /* Section F.2.2.1: decode the DC coefficient difference */
+ HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
+ if (s) {
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ }
+
+ if (entropy->dc_needed[blkn]) {
+ /* Convert DC difference to actual value, update last_dc_val */
+ int ci = cinfo->MCU_membership[blkn];
+ s += state.last_dc_val[ci];
+ state.last_dc_val[ci] = s;
+ /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+ (*block)[0] = (JCOEF) s;
+ }
+
+ if (entropy->ac_needed[blkn]) {
+
+ /* Section F.2.2.2: decode the AC coefficients */
+ /* Since zeroes are skipped, output area must be cleared beforehand */
+ for (k = 1; k < DCTSIZE2; k++) {
+ HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
+
+ r = s >> 4; /* high nibble: run length of zero coefficients */
+ s &= 15; /* low nibble: bit count of the coefficient that follows */
+
+ if (s) {
+ k += r;
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ /* Output coefficient in natural (dezigzagged) order.
+ * Note: the extra entries in jpeg_natural_order[] will save us
+ * if k >= DCTSIZE2, which could happen if the data is corrupted.
+ */
+ (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+ } else {
+ if (r != 15) /* s == 0 with r != 15 ends the block */
+ break;
+ k += 15; /* s == 0 with r == 15: skip 16 zeroes (loop increment adds the 16th) */
+ }
+ }
+
+ } else {
+
+ /* Section F.2.2.2: decode the AC coefficients */
+ /* In this path we just discard the values */
+ for (k = 1; k < DCTSIZE2; k++) {
+ HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
+
+ r = s >> 4;
+ s &= 15;
+
+ if (s) {
+ k += r;
+ CHECK_BIT_BUFFER(br_state, s, return FALSE);
+ DROP_BITS(s);
+ } else {
+ if (r != 15)
+ break;
+ k += 15;
+ }
+ }
+ }
+ }
+
+ /* Completed MCU, so update state */
+ BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ ASSIGN_STATE(entropy->saved, state);
+ return TRUE;
+}
+
+
+/***************************************************************/
+
+#define ADD_BYTE { /* append one input byte to get_buffer; performs no bounds check on buffer */ \
+ int val0 = *(buffer++); /* the byte being appended */ \
+ int val1 = *(buffer); /* the byte after it; only examined when val0 is 0xFF */ \
+ \
+ bits_left += 8; \
+ get_buffer = (get_buffer << 8) | (val0); \
+ if (val0 == 0xFF) { \
+ buffer++; /* step over the byte that follows 0xFF (a stuffed zero when val1 == 0) */ \
+ if (val1 != 0) { /* 0xFF followed by non-zero: not entropy-coded data */ \
+ buffer -= 2; /* rewind so the same 0xFF is seen again by the next ADD_BYTE */ \
+ get_buffer &= ~0xFF; /* and feed eight zero bits in its place */ \
+ } \
+ } \
+}
+
+/***************************************************************/
+
+#if __WORDSIZE == 64 /* matches the test that selects the 64-bit bit_buf_type in jdhuff.h */
+
+#define ENSURE_SHORT /* refill when fewer than 16 bits remain: adds 48 bits, so at most 63 are held */ \
+ if (bits_left < 16) { \
+ ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE ADD_BYTE \
+ }
+
+#else /* 32-bit bit buffer */
+
+#define ENSURE_SHORT if (bits_left < 16) { ADD_BYTE ADD_BYTE } /* adds 16 bits, so at most 31 are held */
+
+#endif
+
+/***************************************************************/
+
+#define HUFF_DECODE_FAST(symbol, size, htbl) { /* decode one Huffman symbol; size receives the code length */ \
+ ENSURE_SHORT \
+ symbol = PEEK_BITS(HUFF_LOOKAHEAD); \
+ symbol = htbl->lookup[symbol]; /* entry = (code length << HUFF_LOOKAHEAD) | symbol */ \
+ size = symbol >> HUFF_LOOKAHEAD; \
+ bits_left -= size; /* consume the code bits up front (the common, short-code case) */ \
+ symbol = symbol & ((1 << HUFF_LOOKAHEAD) - 1); \
+ if (size == HUFF_LOOKAHEAD + 1) { /* code is longer than the lookahead: extend it bit by bit */ \
+ symbol = (get_buffer >> bits_left) & ((1 << (size)) - 1); \
+ while (symbol > htbl->maxcode[size]) { \
+ symbol <<= 1; \
+ symbol |= GET_BITS(1); \
+ size++; \
+ } \
+ symbol = (size > 16) ? 0 : htbl->pub->huffval[ (int) (symbol + htbl->valoffset[size]) & 0xFF ]; /* corrupt data can push size past 16: yield 0 instead of indexing out of range */ \
+ } \
+}
+
+/***************************************************************/
+
+LOCAL(boolean)
+decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ BITREAD_STATE_VARS;
+ JOCTET *buffer; /* local read cursor advanced by ADD_BYTE */
+ int blkn;
+ savable_state state;
+ /* Non-suspending variant of decode_mcu_slow: bytes are taken straight from the input buffer with no bounds check, and there is no return-FALSE path, so the caller must guarantee enough buffered input */
+
+ /* Load up working state */
+ BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ buffer = (JOCTET *) br_state.next_input_byte;
+ ASSIGN_STATE(state, entropy->saved);
+
+ for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { /* Outer loop handles each block in the MCU */
+ JBLOCKROW block = MCU_data[blkn];
+ d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
+ d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
+ register int s, k, r, l; /* l receives the code length from HUFF_DECODE_FAST and is not otherwise read here */
+
+ HUFF_DECODE_FAST(s, l, dctbl); /* decode the DC coefficient difference */
+ if (s) {
+ ENSURE_SHORT
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ }
+
+ if (entropy->dc_needed[blkn]) {
+ int ci = cinfo->MCU_membership[blkn];
+ s += state.last_dc_val[ci]; /* convert DC difference to actual value */
+ state.last_dc_val[ci] = s;
+ (*block)[0] = (JCOEF) s;
+ }
+
+ if (entropy->ac_needed[blkn]) {
+
+ for (k = 1; k < DCTSIZE2; k++) { /* decode and store the AC coefficients */
+ HUFF_DECODE_FAST(s, l, actbl);
+ r = s >> 4; /* high nibble: run length of zero coefficients */
+ s &= 15; /* low nibble: bit count of the coefficient that follows */
+
+ if (s) {
+ k += r;
+ ENSURE_SHORT
+ r = GET_BITS(s);
+ s = HUFF_EXTEND(r, s);
+ (*block)[jpeg_natural_order[k]] = (JCOEF) s; /* store in natural (dezigzagged) order */
+ } else {
+ if (r != 15) break; /* s == 0 with r != 15 ends the block */
+ k += 15; /* s == 0 with r == 15: skip 16 zeroes (loop increment adds the 16th) */
+ }
+ }
+
+ } else {
+
+ for (k = 1; k < DCTSIZE2; k++) { /* decode the AC coefficients but discard the values */
+ HUFF_DECODE_FAST(s, l, actbl);
+ r = s >> 4;
+ s &= 15;
+
+ if (s) {
+ k += r;
+ ENSURE_SHORT
+ DROP_BITS(s);
+ } else {
+ if (r != 15) break;
+ k += 15;
+ }
+ }
+ }
+ }
+
+ br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte); /* account for the bytes consumed through the local cursor */
+ br_state.next_input_byte = buffer;
+ BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ ASSIGN_STATE(entropy->saved, state);
+ return TRUE;
+}
+
+
/*
* Decode and return one MCU's worth of Huffman-compressed coefficients.
* The coefficients are reordered from zigzag order into natural array order,
@@ -513,13 +759,12 @@
* this module, since we'll just re-assign them on the next call.)
*/
+#define BUFSIZE (DCTSIZE2 * 8) /* conservative bound on input bytes one block can consume: 64 coefficients x up to 4 bytes (16-bit code + extra bits), doubled for 0xFF byte stuffing */
+
METHODDEF(boolean)
decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
- int blkn;
- BITREAD_STATE_VARS;
- savable_state state;
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
@@ -533,91 +778,13 @@
*/
if (! entropy->pub.insufficient_data) {
- /* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(state, entropy->saved);
-
- /* Outer loop handles each block in the MCU */
-
- for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- JBLOCKROW block = MCU_data[blkn];
- d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
- d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
- register int s, k, r;
-
- /* Decode a single block's worth of coefficients */
-
- /* Section F.2.2.1: decode the DC coefficient difference */
- HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
- if (s) {
- CHECK_BIT_BUFFER(br_state, s, return FALSE);
- r = GET_BITS(s);
- s = HUFF_EXTEND(r, s);
- }
-
- if (entropy->dc_needed[blkn]) {
- /* Convert DC difference to actual value, update last_dc_val */
- int ci = cinfo->MCU_membership[blkn];
- s += state.last_dc_val[ci];
- state.last_dc_val[ci] = s;
- /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
- (*block)[0] = (JCOEF) s;
- }
-
- if (entropy->ac_needed[blkn]) {
-
- /* Section F.2.2.2: decode the AC coefficients */
- /* Since zeroes are skipped, output area must be cleared beforehand */
- for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-
- r = s >> 4;
- s &= 15;
-
- if (s) {
- k += r;
- CHECK_BIT_BUFFER(br_state, s, return FALSE);
- r = GET_BITS(s);
- s = HUFF_EXTEND(r, s);
- /* Output coefficient in natural (dezigzagged) order.
- * Note: the extra entries in jpeg_natural_order[] will save us
- * if k >= DCTSIZE2, which could happen if the data is corrupted.
- */
- (*block)[jpeg_natural_order[k]] = (JCOEF) s;
- } else {
- if (r != 15)
- break;
- k += 15;
- }
- }
-
- } else {
-
- /* Section F.2.2.2: decode the AC coefficients */
- /* In this path we just discard the values */
- for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-
- r = s >> 4;
- s &= 15;
-
- if (s) {
- k += r;
- CHECK_BIT_BUFFER(br_state, s, return FALSE);
- DROP_BITS(s);
- } else {
- if (r != 15)
- break;
- k += 15;
- }
- }
-
- }
+ if (cinfo->src->bytes_in_buffer >= BUFSIZE * (size_t) cinfo->blocks_in_MCU && cinfo->unread_marker == 0) { /* the fast path reads the buffer unchecked: require room for every block of the MCU and no pending marker */
+ if (!decode_mcu_fast(cinfo, MCU_data)) return FALSE;
+ }
+ else {
+ if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
}
- /* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
}
/* Account for restart interval (no-op if not using restarts) */
diff --git a/jdhuff.h b/jdhuff.h
index ae19b6c..a7c8188 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -36,13 +36,17 @@
/* Link to public Huffman table (needed only in jpeg_huff_decode) */
JHUFF_TBL *pub;
- /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+ /* Lookahead table: indexed by the next HUFF_LOOKAHEAD bits of
* the input data stream. If the next Huffman code is no more
* than HUFF_LOOKAHEAD bits long, we can obtain its length and
- * the corresponding symbol directly from these tables.
+ * the corresponding symbol directly from this table.
+ *
+ * The lower 8 bits of each table entry contain the symbol. The
+ * bits above them contain the number of bits in the
+ * corresponding Huffman code, or HUFF_LOOKAHEAD + 1 if the code
+ * is too long.
*/
- int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
- UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+ int lookup[1<<HUFF_LOOKAHEAD];
} d_derived_tbl;
/* Expand a Huffman table definition into the derived format */
@@ -69,8 +73,17 @@
* necessary.
*/
+#if __WORDSIZE == 64
+
+typedef long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE 64 /* size of buffer in bits */
+
+#else
+
typedef INT32 bit_buf_type; /* type of bit-extraction buffer */
-#define BIT_BUF_SIZE 32 /* size of buffer in bits */
+#define BIT_BUF_SIZE 32 /* size of buffer in bits */
+
+#endif
/* If long is > 32 bits on your machine, and shifting/masking longs is
* reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
@@ -183,11 +196,10 @@
} \
} \
look = PEEK_BITS(HUFF_LOOKAHEAD); \
- if ((nb = htbl->look_nbits[look]) != 0) { \
+ if ((nb = (htbl->lookup[look] >> HUFF_LOOKAHEAD)) <= HUFF_LOOKAHEAD) { \
DROP_BITS(nb); \
- result = htbl->look_sym[look]; \
+ result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
} else { \
- nb = HUFF_LOOKAHEAD+1; \
slowlabel: \
if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
{ failaction; } \
diff --git a/jdmaster.c b/jdmaster.c
index 2802c5b..8314b67 100644
--- a/jdmaster.c
+++ b/jdmaster.c
@@ -2,6 +2,7 @@
* jdmaster.c
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -49,8 +50,14 @@
return FALSE;
/* jdmerge.c only supports YCC=>RGB color conversion */
if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
- cinfo->out_color_space != JCS_RGB ||
- cinfo->out_color_components != RGB_PIXELSIZE)
+ (cinfo->out_color_space != JCS_RGB &&
+ cinfo->out_color_space != JCS_EXT_RGB &&
+ cinfo->out_color_space != JCS_EXT_RGBX &&
+ cinfo->out_color_space != JCS_EXT_BGR &&
+ cinfo->out_color_space != JCS_EXT_BGRX &&
+ cinfo->out_color_space != JCS_EXT_XBGR &&
+ cinfo->out_color_space != JCS_EXT_XRGB) ||
+ cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space])
return FALSE;
/* and it only handles 2h1v or 2h2v sampling ratios */
if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -175,10 +182,14 @@
cinfo->out_color_components = 1;
break;
case JCS_RGB:
-#if RGB_PIXELSIZE != 3
- cinfo->out_color_components = RGB_PIXELSIZE;
+ case JCS_EXT_RGB:
+ case JCS_EXT_RGBX:
+ case JCS_EXT_BGR:
+ case JCS_EXT_BGRX:
+ case JCS_EXT_XBGR:
+ case JCS_EXT_XRGB:
+ cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
break;
-#endif /* else share code with YCbCr */
case JCS_YCbCr:
cinfo->out_color_components = 3;
break;
diff --git a/jdmerge.c b/jdmerge.c
index 3744446..edf061a 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -2,6 +2,8 @@
* jdmerge.c
*
* Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -35,6 +37,7 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jsimd.h"
#ifdef UPSAMPLE_MERGING_SUPPORTED
@@ -255,15 +258,15 @@
cblue = Cbbtab[cb];
/* Fetch 2 Y values and emit 2 pixels */
y = GETJSAMPLE(*inptr0++);
- outptr[RGB_RED] = range_limit[y + cred];
- outptr[RGB_GREEN] = range_limit[y + cgreen];
- outptr[RGB_BLUE] = range_limit[y + cblue];
- outptr += RGB_PIXELSIZE;
+ outptr[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
+ outptr += rgb_pixelsize[cinfo->out_color_space];
y = GETJSAMPLE(*inptr0++);
- outptr[RGB_RED] = range_limit[y + cred];
- outptr[RGB_GREEN] = range_limit[y + cgreen];
- outptr[RGB_BLUE] = range_limit[y + cblue];
- outptr += RGB_PIXELSIZE;
+ outptr[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
+ outptr += rgb_pixelsize[cinfo->out_color_space];
}
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
@@ -273,9 +276,9 @@
cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
y = GETJSAMPLE(*inptr0);
- outptr[RGB_RED] = range_limit[y + cred];
- outptr[RGB_GREEN] = range_limit[y + cgreen];
- outptr[RGB_BLUE] = range_limit[y + cblue];
+ outptr[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
}
}
@@ -319,24 +322,24 @@
cblue = Cbbtab[cb];
/* Fetch 4 Y values and emit 4 pixels */
y = GETJSAMPLE(*inptr00++);
- outptr0[RGB_RED] = range_limit[y + cred];
- outptr0[RGB_GREEN] = range_limit[y + cgreen];
- outptr0[RGB_BLUE] = range_limit[y + cblue];
- outptr0 += RGB_PIXELSIZE;
+ outptr0[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr0[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
+ outptr0 += rgb_pixelsize[cinfo->out_color_space];
y = GETJSAMPLE(*inptr00++);
- outptr0[RGB_RED] = range_limit[y + cred];
- outptr0[RGB_GREEN] = range_limit[y + cgreen];
- outptr0[RGB_BLUE] = range_limit[y + cblue];
- outptr0 += RGB_PIXELSIZE;
+ outptr0[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr0[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
+ outptr0 += rgb_pixelsize[cinfo->out_color_space];
y = GETJSAMPLE(*inptr01++);
- outptr1[RGB_RED] = range_limit[y + cred];
- outptr1[RGB_GREEN] = range_limit[y + cgreen];
- outptr1[RGB_BLUE] = range_limit[y + cblue];
- outptr1 += RGB_PIXELSIZE;
+ outptr1[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr1[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
+ outptr1 += rgb_pixelsize[cinfo->out_color_space];
y = GETJSAMPLE(*inptr01++);
- outptr1[RGB_RED] = range_limit[y + cred];
- outptr1[RGB_GREEN] = range_limit[y + cgreen];
- outptr1[RGB_BLUE] = range_limit[y + cblue];
- outptr1 += RGB_PIXELSIZE;
+ outptr1[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr1[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
+ outptr1 += rgb_pixelsize[cinfo->out_color_space];
}
/* If image width is odd, do the last output column separately */
@@ -347,13 +350,13 @@
cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
y = GETJSAMPLE(*inptr00);
- outptr0[RGB_RED] = range_limit[y + cred];
- outptr0[RGB_GREEN] = range_limit[y + cgreen];
- outptr0[RGB_BLUE] = range_limit[y + cblue];
+ outptr0[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr0[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
y = GETJSAMPLE(*inptr01);
- outptr1[RGB_RED] = range_limit[y + cred];
- outptr1[RGB_GREEN] = range_limit[y + cgreen];
- outptr1[RGB_BLUE] = range_limit[y + cblue];
+ outptr1[rgb_red[cinfo->out_color_space]] = range_limit[y + cred];
+ outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+ outptr1[rgb_blue[cinfo->out_color_space]] = range_limit[y + cblue];
}
}
@@ -382,14 +385,20 @@
if (cinfo->max_v_samp_factor == 2) {
upsample->pub.upsample = merged_2v_upsample;
- upsample->upmethod = h2v2_merged_upsample;
+ if (jsimd_can_h2v2_merged_upsample())
+ upsample->upmethod = jsimd_h2v2_merged_upsample;
+ else
+ upsample->upmethod = h2v2_merged_upsample;
/* Allocate a spare row buffer */
upsample->spare_row = (JSAMPROW)
(*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
(size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
} else {
upsample->pub.upsample = merged_1v_upsample;
- upsample->upmethod = h2v1_merged_upsample;
+ if (jsimd_can_h2v1_merged_upsample())
+ upsample->upmethod = jsimd_h2v1_merged_upsample;
+ else
+ upsample->upmethod = h2v1_merged_upsample;
/* No spare row needed */
upsample->spare_row = NULL;
}
diff --git a/jdsample.c b/jdsample.c
index 80ffefb..4e0b8b4 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -2,6 +2,7 @@
* jdsample.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -21,6 +22,7 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jsimd.h"
/* Pointer to routine to upsample a single component */
@@ -447,18 +449,32 @@
} else if (h_in_group * 2 == h_out_group &&
v_in_group == v_out_group) {
/* Special cases for 2h1v upsampling */
- if (do_fancy && compptr->downsampled_width > 2)
- upsample->methods[ci] = h2v1_fancy_upsample;
- else
- upsample->methods[ci] = h2v1_upsample;
+ if (do_fancy && compptr->downsampled_width > 2) {
+ if (jsimd_can_h2v1_fancy_upsample())
+ upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
+ else
+ upsample->methods[ci] = h2v1_fancy_upsample;
+ } else {
+ if (jsimd_can_h2v1_upsample())
+ upsample->methods[ci] = jsimd_h2v1_upsample;
+ else
+ upsample->methods[ci] = h2v1_upsample;
+ }
} else if (h_in_group * 2 == h_out_group &&
v_in_group * 2 == v_out_group) {
/* Special cases for 2h2v upsampling */
if (do_fancy && compptr->downsampled_width > 2) {
- upsample->methods[ci] = h2v2_fancy_upsample;
+ if (jsimd_can_h2v2_fancy_upsample())
+ upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
+ else
+ upsample->methods[ci] = h2v2_fancy_upsample;
upsample->pub.need_context_rows = TRUE;
- } else
- upsample->methods[ci] = h2v2_upsample;
+ } else {
+ if (jsimd_can_h2v2_upsample())
+ upsample->methods[ci] = jsimd_h2v2_upsample;
+ else
+ upsample->methods[ci] = h2v2_upsample;
+ }
} else if ((h_out_group % h_in_group) == 0 &&
(v_out_group % v_in_group) == 0) {
/* Generic integral-factors upsampling method */
diff --git a/jmemansi.c b/jmemansi.c
deleted file mode 100644
index 2d93e49..0000000
--- a/jmemansi.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * jmemansi.c
- *
- * Copyright (C) 1992-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a simple generic implementation of the system-
- * dependent portion of the JPEG memory manager. This implementation
- * assumes that you have the ANSI-standard library routine tmpfile().
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h" /* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET /* pre-ANSI systems may not define this; */
-#define SEEK_SET 0 /* if not, assume 0 is correct */
-#endif
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
- free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
- free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM /* so can override from makefile */
-#define DEFAULT_MAX_MEM 1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
- long max_bytes_needed, long already_allocated)
-{
- return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed. You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- if (fseek(info->temp_file, file_offset, SEEK_SET))
- ERREXIT(cinfo, JERR_TFILE_SEEK);
- if (JFREAD(info->temp_file, buffer_address, byte_count)
- != (size_t) byte_count)
- ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- if (fseek(info->temp_file, file_offset, SEEK_SET))
- ERREXIT(cinfo, JERR_TFILE_SEEK);
- if (JFWRITE(info->temp_file, buffer_address, byte_count)
- != (size_t) byte_count)
- ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
- fclose(info->temp_file);
- /* Since this implementation uses tmpfile() to create the file,
- * no explicit file deletion is needed.
- */
-}
-
-
-/*
- * Initial opening of a backing-store object.
- *
- * This version uses tmpfile(), which constructs a suitable file name
- * behind the scenes. We don't have to use info->temp_name[] at all;
- * indeed, we can't even find out the actual name of the temp file.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
-{
- if ((info->temp_file = tmpfile()) == NULL)
- ERREXITS(cinfo, JERR_TFILE_CREATE, "");
- info->read_backing_store = read_backing_store;
- info->write_backing_store = write_backing_store;
- info->close_backing_store = close_backing_store;
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
- return DEFAULT_MAX_MEM; /* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
- /* no work */
-}
diff --git a/jmemdos.c b/jmemdos.c
deleted file mode 100644
index 60b45c6..0000000
--- a/jmemdos.c
+++ /dev/null
@@ -1,638 +0,0 @@
-/*
- * jmemdos.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides an MS-DOS-compatible implementation of the system-
- * dependent portion of the JPEG memory manager. Temporary data can be
- * stored in extended or expanded memory as well as in regular DOS files.
- *
- * If you use this file, you must be sure that NEED_FAR_POINTERS is defined
- * if you compile in a small-data memory model; it should NOT be defined if
- * you use a large-data memory model. This file is not recommended if you
- * are using a flat-memory-space 386 environment such as DJGCC or Watcom C.
- * Also, this code will NOT work if struct fields are aligned on greater than
- * 2-byte boundaries.
- *
- * Based on code contributed by Ge' Weijers.
- */
-
-/*
- * If you have both extended and expanded memory, you may want to change the
- * order in which they are tried in jopen_backing_store. On a 286 machine
- * expanded memory is usually faster, since extended memory access involves
- * an expensive protected-mode-and-back switch. On 386 and better, extended
- * memory is usually faster. As distributed, the code tries extended memory
- * first (what? not everyone has a 386? :-).
- *
- * You can disable use of extended/expanded memory entirely by altering these
- * definitions or overriding them from the Makefile (eg, -DEMS_SUPPORTED=0).
- */
-
-#ifndef XMS_SUPPORTED
-#define XMS_SUPPORTED 1
-#endif
-#ifndef EMS_SUPPORTED
-#define EMS_SUPPORTED 1
-#endif
-
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h" /* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare these */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-extern char * getenv JPP((const char * name));
-#endif
-
-#ifdef NEED_FAR_POINTERS
-
-#ifdef __TURBOC__
-/* These definitions work for Borland C (Turbo C) */
-#include <alloc.h> /* need farmalloc(), farfree() */
-#define far_malloc(x) farmalloc(x)
-#define far_free(x) farfree(x)
-#else
-/* These definitions work for Microsoft C and compatible compilers */
-#include <malloc.h> /* need _fmalloc(), _ffree() */
-#define far_malloc(x) _fmalloc(x)
-#define far_free(x) _ffree(x)
-#endif
-
-#else /* not NEED_FAR_POINTERS */
-
-#define far_malloc(x) malloc(x)
-#define far_free(x) free(x)
-
-#endif /* NEED_FAR_POINTERS */
-
-#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */
-#define READ_BINARY "r"
-#else
-#define READ_BINARY "rb"
-#endif
-
-#ifndef USE_MSDOS_MEMMGR /* make sure user got configuration right */
- You forgot to define USE_MSDOS_MEMMGR in jconfig.h. /* deliberate syntax error */
-#endif
-
-#if MAX_ALLOC_CHUNK >= 65535L /* make sure jconfig.h got this right */
- MAX_ALLOC_CHUNK should be less than 64K. /* deliberate syntax error */
-#endif
-
-
-/*
- * Declarations for assembly-language support routines (see jmemdosa.asm).
- *
- * The functions are declared "far" as are all their pointer arguments;
- * this ensures the assembly source code will work regardless of the
- * compiler memory model. We assume "short" is 16 bits, "long" is 32.
- */
-
-typedef void far * XMSDRIVER; /* actually a pointer to code */
-typedef struct { /* registers for calling XMS driver */
- unsigned short ax, dx, bx;
- void far * ds_si;
- } XMScontext;
-typedef struct { /* registers for calling EMS driver */
- unsigned short ax, dx, bx;
- void far * ds_si;
- } EMScontext;
-
-extern short far jdos_open JPP((short far * handle, char far * filename));
-extern short far jdos_close JPP((short handle));
-extern short far jdos_seek JPP((short handle, long offset));
-extern short far jdos_read JPP((short handle, void far * buffer,
- unsigned short count));
-extern short far jdos_write JPP((short handle, void far * buffer,
- unsigned short count));
-extern void far jxms_getdriver JPP((XMSDRIVER far *));
-extern void far jxms_calldriver JPP((XMSDRIVER, XMScontext far *));
-extern short far jems_available JPP((void));
-extern void far jems_calldriver JPP((EMScontext far *));
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is highly system-dependent, and you may want to customize it.
- */
-
-static int next_file_num; /* to distinguish among several temp files */
-
-LOCAL(void)
-select_file_name (char * fname)
-{
- const char * env;
- char * ptr;
- FILE * tfile;
-
- /* Keep generating file names till we find one that's not in use */
- for (;;) {
- /* Get temp directory name from environment TMP or TEMP variable;
- * if none, use "."
- */
- if ((env = (const char *) getenv("TMP")) == NULL)
- if ((env = (const char *) getenv("TEMP")) == NULL)
- env = ".";
- if (*env == '\0') /* null string means "." */
- env = ".";
- ptr = fname; /* copy name to fname */
- while (*env != '\0')
- *ptr++ = *env++;
- if (ptr[-1] != '\\' && ptr[-1] != '/')
- *ptr++ = '\\'; /* append backslash if not in env variable */
- /* Append a suitable file name */
- next_file_num++; /* advance counter */
- sprintf(ptr, "JPG%03d.TMP", next_file_num);
- /* Probe to see if file name is already in use */
- if ((tfile = fopen(fname, READ_BINARY)) == NULL)
- break;
- fclose(tfile); /* oops, it's there; close tfile & try again */
- }
-}
-
-
-/*
- * Near-memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
- free(object);
-}
-
-
-/*
- * "Large" objects are allocated in far memory, if possible
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void FAR *) far_malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
- far_free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM /* so can override from makefile */
-#define DEFAULT_MAX_MEM 300000L /* for total usage about 450K */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
- long max_bytes_needed, long already_allocated)
-{
- return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed. You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-/*
- * For MS-DOS we support three types of backing storage:
- * 1. Conventional DOS files. We access these by direct DOS calls rather
- * than via the stdio package. This provides a bit better performance,
- * but the real reason is that the buffers to be read or written are FAR.
- * The stdio library for small-data memory models can't cope with that.
- * 2. Extended memory, accessed per the XMS V2.0 specification.
- * 3. Expanded memory, accessed per the LIM/EMS 4.0 specification.
- * You'll need copies of those specs to make sense of the related code.
- * The specs are available by Internet FTP from the SIMTEL archives
- * (oak.oakland.edu and its various mirror sites). See files
- * pub/msdos/microsoft/xms20.arc and pub/msdos/info/limems41.zip.
- */
-
-
-/*
- * Access methods for a DOS file.
- */
-
-
-METHODDEF(void)
-read_file_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- if (jdos_seek(info->handle.file_handle, file_offset))
- ERREXIT(cinfo, JERR_TFILE_SEEK);
- /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
- if (byte_count > 65535L) /* safety check */
- ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
- if (jdos_read(info->handle.file_handle, buffer_address,
- (unsigned short) byte_count))
- ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_file_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- if (jdos_seek(info->handle.file_handle, file_offset))
- ERREXIT(cinfo, JERR_TFILE_SEEK);
- /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
- if (byte_count > 65535L) /* safety check */
- ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
- if (jdos_write(info->handle.file_handle, buffer_address,
- (unsigned short) byte_count))
- ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_file_store (j_common_ptr cinfo, backing_store_ptr info)
-{
- jdos_close(info->handle.file_handle); /* close the file */
- remove(info->temp_name); /* delete the file */
-/* If your system doesn't have remove(), try unlink() instead.
- * remove() is the ANSI-standard name for this function, but
- * unlink() was more common in pre-ANSI systems.
- */
- TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-LOCAL(boolean)
-open_file_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
-{
- short handle;
-
- select_file_name(info->temp_name);
- if (jdos_open((short far *) & handle, (char far *) info->temp_name)) {
- /* might as well exit since jpeg_open_backing_store will fail anyway */
- ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
- return FALSE;
- }
- info->handle.file_handle = handle;
- info->read_backing_store = read_file_store;
- info->write_backing_store = write_file_store;
- info->close_backing_store = close_file_store;
- TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
- return TRUE; /* succeeded */
-}
-
-
-/*
- * Access methods for extended memory.
- */
-
-#if XMS_SUPPORTED
-
-static XMSDRIVER xms_driver; /* saved address of XMS driver */
-
-typedef union { /* either long offset or real-mode pointer */
- long offset;
- void far * ptr;
- } XMSPTR;
-
-typedef struct { /* XMS move specification structure */
- long length;
- XMSH src_handle;
- XMSPTR src;
- XMSH dst_handle;
- XMSPTR dst;
- } XMSspec;
-
-#define ODD(X) (((X) & 1L) != 0)
-
-
-METHODDEF(void)
-read_xms_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- XMScontext ctx;
- XMSspec spec;
- char endbuffer[2];
-
- /* The XMS driver can't cope with an odd length, so handle the last byte
- * specially if byte_count is odd. We don't expect this to be common.
- */
-
- spec.length = byte_count & (~ 1L);
- spec.src_handle = info->handle.xms_handle;
- spec.src.offset = file_offset;
- spec.dst_handle = 0;
- spec.dst.ptr = buffer_address;
-
- ctx.ds_si = (void far *) & spec;
- ctx.ax = 0x0b00; /* EMB move */
- jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
- if (ctx.ax != 1)
- ERREXIT(cinfo, JERR_XMS_READ);
-
- if (ODD(byte_count)) {
- read_xms_store(cinfo, info, (void FAR *) endbuffer,
- file_offset + byte_count - 1L, 2L);
- ((char FAR *) buffer_address)[byte_count - 1L] = endbuffer[0];
- }
-}
-
-
-METHODDEF(void)
-write_xms_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- XMScontext ctx;
- XMSspec spec;
- char endbuffer[2];
-
- /* The XMS driver can't cope with an odd length, so handle the last byte
- * specially if byte_count is odd. We don't expect this to be common.
- */
-
- spec.length = byte_count & (~ 1L);
- spec.src_handle = 0;
- spec.src.ptr = buffer_address;
- spec.dst_handle = info->handle.xms_handle;
- spec.dst.offset = file_offset;
-
- ctx.ds_si = (void far *) & spec;
- ctx.ax = 0x0b00; /* EMB move */
- jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
- if (ctx.ax != 1)
- ERREXIT(cinfo, JERR_XMS_WRITE);
-
- if (ODD(byte_count)) {
- read_xms_store(cinfo, info, (void FAR *) endbuffer,
- file_offset + byte_count - 1L, 2L);
- endbuffer[0] = ((char FAR *) buffer_address)[byte_count - 1L];
- write_xms_store(cinfo, info, (void FAR *) endbuffer,
- file_offset + byte_count - 1L, 2L);
- }
-}
-
-
-METHODDEF(void)
-close_xms_store (j_common_ptr cinfo, backing_store_ptr info)
-{
- XMScontext ctx;
-
- ctx.dx = info->handle.xms_handle;
- ctx.ax = 0x0a00;
- jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
- TRACEMS1(cinfo, 1, JTRC_XMS_CLOSE, info->handle.xms_handle);
- /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_xms_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
-{
- XMScontext ctx;
-
- /* Get address of XMS driver */
- jxms_getdriver((XMSDRIVER far *) & xms_driver);
- if (xms_driver == NULL)
- return FALSE; /* no driver to be had */
-
- /* Get version number, must be >= 2.00 */
- ctx.ax = 0x0000;
- jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
- if (ctx.ax < (unsigned short) 0x0200)
- return FALSE;
-
- /* Try to get space (expressed in kilobytes) */
- ctx.dx = (unsigned short) ((total_bytes_needed + 1023L) >> 10);
- ctx.ax = 0x0900;
- jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
- if (ctx.ax != 1)
- return FALSE;
-
- /* Succeeded, save the handle and away we go */
- info->handle.xms_handle = ctx.dx;
- info->read_backing_store = read_xms_store;
- info->write_backing_store = write_xms_store;
- info->close_backing_store = close_xms_store;
- TRACEMS1(cinfo, 1, JTRC_XMS_OPEN, ctx.dx);
- return TRUE; /* succeeded */
-}
-
-#endif /* XMS_SUPPORTED */
-
-
-/*
- * Access methods for expanded memory.
- */
-
-#if EMS_SUPPORTED
-
-/* The EMS move specification structure requires word and long fields aligned
- * at odd byte boundaries. Some compilers will align struct fields at even
- * byte boundaries. While it's usually possible to force byte alignment,
- * that causes an overall performance penalty and may pose problems in merging
- * JPEG into a larger application. Instead we accept some rather dirty code
- * here. Note this code would fail if the hardware did not allow odd-byte
- * word & long accesses, but all 80x86 CPUs do.
- */
-
-typedef void far * EMSPTR;
-
-typedef union { /* EMS move specification structure */
- long length; /* It's easy to access first 4 bytes */
- char bytes[18]; /* Misaligned fields in here! */
- } EMSspec;
-
-/* Macros for accessing misaligned fields */
-#define FIELD_AT(spec,offset,type) (*((type *) &(spec.bytes[offset])))
-#define SRC_TYPE(spec) FIELD_AT(spec,4,char)
-#define SRC_HANDLE(spec) FIELD_AT(spec,5,EMSH)
-#define SRC_OFFSET(spec) FIELD_AT(spec,7,unsigned short)
-#define SRC_PAGE(spec) FIELD_AT(spec,9,unsigned short)
-#define SRC_PTR(spec) FIELD_AT(spec,7,EMSPTR)
-#define DST_TYPE(spec) FIELD_AT(spec,11,char)
-#define DST_HANDLE(spec) FIELD_AT(spec,12,EMSH)
-#define DST_OFFSET(spec) FIELD_AT(spec,14,unsigned short)
-#define DST_PAGE(spec) FIELD_AT(spec,16,unsigned short)
-#define DST_PTR(spec) FIELD_AT(spec,14,EMSPTR)
-
-#define EMSPAGESIZE 16384L /* gospel, see the EMS specs */
-
-#define HIBYTE(W) (((W) >> 8) & 0xFF)
-#define LOBYTE(W) ((W) & 0xFF)
-
-
-METHODDEF(void)
-read_ems_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- EMScontext ctx;
- EMSspec spec;
-
- spec.length = byte_count;
- SRC_TYPE(spec) = 1;
- SRC_HANDLE(spec) = info->handle.ems_handle;
- SRC_PAGE(spec) = (unsigned short) (file_offset / EMSPAGESIZE);
- SRC_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
- DST_TYPE(spec) = 0;
- DST_HANDLE(spec) = 0;
- DST_PTR(spec) = buffer_address;
-
- ctx.ds_si = (void far *) & spec;
- ctx.ax = 0x5700; /* move memory region */
- jems_calldriver((EMScontext far *) & ctx);
- if (HIBYTE(ctx.ax) != 0)
- ERREXIT(cinfo, JERR_EMS_READ);
-}
-
-
-METHODDEF(void)
-write_ems_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- EMScontext ctx;
- EMSspec spec;
-
- spec.length = byte_count;
- SRC_TYPE(spec) = 0;
- SRC_HANDLE(spec) = 0;
- SRC_PTR(spec) = buffer_address;
- DST_TYPE(spec) = 1;
- DST_HANDLE(spec) = info->handle.ems_handle;
- DST_PAGE(spec) = (unsigned short) (file_offset / EMSPAGESIZE);
- DST_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
-
- ctx.ds_si = (void far *) & spec;
- ctx.ax = 0x5700; /* move memory region */
- jems_calldriver((EMScontext far *) & ctx);
- if (HIBYTE(ctx.ax) != 0)
- ERREXIT(cinfo, JERR_EMS_WRITE);
-}
-
-
-METHODDEF(void)
-close_ems_store (j_common_ptr cinfo, backing_store_ptr info)
-{
- EMScontext ctx;
-
- ctx.ax = 0x4500;
- ctx.dx = info->handle.ems_handle;
- jems_calldriver((EMScontext far *) & ctx);
- TRACEMS1(cinfo, 1, JTRC_EMS_CLOSE, info->handle.ems_handle);
- /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_ems_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
-{
- EMScontext ctx;
-
- /* Is EMS driver there? */
- if (! jems_available())
- return FALSE;
-
- /* Get status, make sure EMS is OK */
- ctx.ax = 0x4000;
- jems_calldriver((EMScontext far *) & ctx);
- if (HIBYTE(ctx.ax) != 0)
- return FALSE;
-
- /* Get version, must be >= 4.0 */
- ctx.ax = 0x4600;
- jems_calldriver((EMScontext far *) & ctx);
- if (HIBYTE(ctx.ax) != 0 || LOBYTE(ctx.ax) < 0x40)
- return FALSE;
-
- /* Try to allocate requested space */
- ctx.ax = 0x4300;
- ctx.bx = (unsigned short) ((total_bytes_needed + EMSPAGESIZE-1L) / EMSPAGESIZE);
- jems_calldriver((EMScontext far *) & ctx);
- if (HIBYTE(ctx.ax) != 0)
- return FALSE;
-
- /* Succeeded, save the handle and away we go */
- info->handle.ems_handle = ctx.dx;
- info->read_backing_store = read_ems_store;
- info->write_backing_store = write_ems_store;
- info->close_backing_store = close_ems_store;
- TRACEMS1(cinfo, 1, JTRC_EMS_OPEN, ctx.dx);
- return TRUE; /* succeeded */
-}
-
-#endif /* EMS_SUPPORTED */
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
-{
- /* Try extended memory, then expanded memory, then regular file. */
-#if XMS_SUPPORTED
- if (open_xms_store(cinfo, info, total_bytes_needed))
- return;
-#endif
-#if EMS_SUPPORTED
- if (open_ems_store(cinfo, info, total_bytes_needed))
- return;
-#endif
- if (open_file_store(cinfo, info, total_bytes_needed))
- return;
- ERREXITS(cinfo, JERR_TFILE_CREATE, "");
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
- next_file_num = 0; /* initialize temp file name generator */
- return DEFAULT_MAX_MEM; /* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
- /* Microsoft C, at least in v6.00A, will not successfully reclaim freed
- * blocks of size > 32Kbytes unless we give it a kick in the rear, like so:
- */
-#ifdef NEED_FHEAPMIN
- _fheapmin();
-#endif
-}
diff --git a/jmemdosa.asm b/jmemdosa.asm
deleted file mode 100644
index ecd4372..0000000
--- a/jmemdosa.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-; jmemdosa.asm
-;
-; Copyright (C) 1992, Thomas G. Lane.
-; This file is part of the Independent JPEG Group's software.
-; For conditions of distribution and use, see the accompanying README file.
-;
-; This file contains low-level interface routines to support the MS-DOS
-; backing store manager (jmemdos.c). Routines are provided to access disk
-; files through direct DOS calls, and to access XMS and EMS drivers.
-;
-; This file should assemble with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler). If you haven't got
-; a compatible assembler, better fall back to jmemansi.c or jmemname.c.
-;
-; To minimize dependence on the C compiler's register usage conventions,
-; we save and restore all 8086 registers, even though most compilers only
-; require SI,DI,DS to be preserved. Also, we use only 16-bit-wide return
-; values, which everybody returns in AX.
-;
-; Based on code contributed by Ge' Weijers.
-;
-
-JMEMDOSA_TXT segment byte public 'CODE'
-
- assume cs:JMEMDOSA_TXT
-
- public _jdos_open
- public _jdos_close
- public _jdos_seek
- public _jdos_read
- public _jdos_write
- public _jxms_getdriver
- public _jxms_calldriver
- public _jems_available
- public _jems_calldriver
-
-;
-; short far jdos_open (short far * handle, char far * filename)
-;
-; Create and open a temporary file
-;
-_jdos_open proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- mov cx,0 ; normal file attributes
- lds dx,dword ptr [bp+10] ; get filename pointer
- mov ah,3ch ; create file
- int 21h
- jc open_err ; if failed, return error code
- lds bx,dword ptr [bp+6] ; get handle pointer
- mov word ptr [bx],ax ; save the handle
- xor ax,ax ; return zero for OK
-open_err: pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jdos_open endp
-
-
-;
-; short far jdos_close (short handle)
-;
-; Close the file handle
-;
-_jdos_close proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- mov bx,word ptr [bp+6] ; file handle
- mov ah,3eh ; close file
- int 21h
- jc close_err ; if failed, return error code
- xor ax,ax ; return zero for OK
-close_err: pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jdos_close endp
-
-
-;
-; short far jdos_seek (short handle, long offset)
-;
-; Set file position
-;
-_jdos_seek proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- mov bx,word ptr [bp+6] ; file handle
- mov dx,word ptr [bp+8] ; LS offset
- mov cx,word ptr [bp+10] ; MS offset
- mov ax,4200h ; absolute seek
- int 21h
- jc seek_err ; if failed, return error code
- xor ax,ax ; return zero for OK
-seek_err: pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jdos_seek endp
-
-
-;
-; short far jdos_read (short handle, void far * buffer, unsigned short count)
-;
-; Read from file
-;
-_jdos_read proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- mov bx,word ptr [bp+6] ; file handle
- lds dx,dword ptr [bp+8] ; buffer address
- mov cx,word ptr [bp+12] ; number of bytes
- mov ah,3fh ; read file
- int 21h
- jc read_err ; if failed, return error code
- cmp ax,word ptr [bp+12] ; make sure all bytes were read
- je read_ok
- mov ax,1 ; else return 1 for not OK
- jmp short read_err
-read_ok: xor ax,ax ; return zero for OK
-read_err: pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jdos_read endp
-
-
-;
-; short far jdos_write (short handle, void far * buffer, unsigned short count)
-;
-; Write to file
-;
-_jdos_write proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- mov bx,word ptr [bp+6] ; file handle
- lds dx,dword ptr [bp+8] ; buffer address
- mov cx,word ptr [bp+12] ; number of bytes
- mov ah,40h ; write file
- int 21h
- jc write_err ; if failed, return error code
- cmp ax,word ptr [bp+12] ; make sure all bytes written
- je write_ok
- mov ax,1 ; else return 1 for not OK
- jmp short write_err
-write_ok: xor ax,ax ; return zero for OK
-write_err: pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jdos_write endp
-
-
-;
-; void far jxms_getdriver (XMSDRIVER far *)
-;
-; Get the address of the XMS driver, or NULL if not available
-;
-_jxms_getdriver proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- mov ax,4300h ; call multiplex interrupt with
- int 2fh ; a magic cookie, hex 4300
- cmp al,80h ; AL should contain hex 80
- je xmsavail
- xor dx,dx ; no XMS driver available
- xor ax,ax ; return a nil pointer
- jmp short xmsavail_done
-xmsavail: mov ax,4310h ; fetch driver address with
- int 2fh ; another magic cookie
- mov dx,es ; copy address to dx:ax
- mov ax,bx
-xmsavail_done: les bx,dword ptr [bp+6] ; get pointer to return value
- mov word ptr es:[bx],ax
- mov word ptr es:[bx+2],dx
- pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jxms_getdriver endp
-
-
-;
-; void far jxms_calldriver (XMSDRIVER, XMScontext far *)
-;
-; The XMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the XMS call is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jxms_calldriver proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- les bx,dword ptr [bp+10] ; get XMScontext pointer
- mov ax,word ptr es:[bx] ; load registers
- mov dx,word ptr es:[bx+2]
- mov si,word ptr es:[bx+6]
- mov ds,word ptr es:[bx+8]
- mov bx,word ptr es:[bx+4]
- call dword ptr [bp+6] ; call the driver
- mov cx,bx ; save returned BX for a sec
- les bx,dword ptr [bp+10] ; get XMScontext pointer
- mov word ptr es:[bx],ax ; put back ax,dx,bx
- mov word ptr es:[bx+2],dx
- mov word ptr es:[bx+4],cx
- pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jxms_calldriver endp
-
-
-;
-; short far jems_available (void)
-;
-; Have we got an EMS driver? (this comes straight from the EMS 4.0 specs)
-;
-_jems_available proc far
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- mov ax,3567h ; get interrupt vector 67h
- int 21h
- push cs
- pop ds
- mov di,000ah ; check offs 10 in returned seg
- lea si,ASCII_device_name ; against literal string
- mov cx,8
- cld
- repe cmpsb
- jne no_ems
- mov ax,1 ; match, it's there
- jmp short avail_done
-no_ems: xor ax,ax ; it's not there
-avail_done: pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- ret
-
-ASCII_device_name db "EMMXXXX0"
-
-_jems_available endp
-
-
-;
-; void far jems_calldriver (EMScontext far *)
-;
-; The EMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the EMS trap is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jems_calldriver proc far
- push bp ; linkage
- mov bp,sp
- push si ; save all registers for safety
- push di
- push bx
- push cx
- push dx
- push es
- push ds
- les bx,dword ptr [bp+6] ; get EMScontext pointer
- mov ax,word ptr es:[bx] ; load registers
- mov dx,word ptr es:[bx+2]
- mov si,word ptr es:[bx+6]
- mov ds,word ptr es:[bx+8]
- mov bx,word ptr es:[bx+4]
- int 67h ; call the EMS driver
- mov cx,bx ; save returned BX for a sec
- les bx,dword ptr [bp+6] ; get EMScontext pointer
- mov word ptr es:[bx],ax ; put back ax,dx,bx
- mov word ptr es:[bx+2],dx
- mov word ptr es:[bx+4],cx
- pop ds ; restore registers and exit
- pop es
- pop dx
- pop cx
- pop bx
- pop di
- pop si
- pop bp
- ret
-_jems_calldriver endp
-
-JMEMDOSA_TXT ends
-
- end
diff --git a/jmemmac.c b/jmemmac.c
deleted file mode 100644
index 106f9be..0000000
--- a/jmemmac.c
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * jmemmac.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * jmemmac.c provides an Apple Macintosh implementation of the system-
- * dependent portion of the JPEG memory manager.
- *
- * If you use jmemmac.c, then you must define USE_MAC_MEMMGR in the
- * JPEG_INTERNALS part of jconfig.h.
- *
- * jmemmac.c uses the Macintosh toolbox routines NewPtr and DisposePtr
- * instead of malloc and free. It accurately determines the amount of
- * memory available by using CompactMem. Notice that if left to its
- * own devices, this code can chew up all available space in the
- * application's zone, with the exception of the rather small "slop"
- * factor computed in jpeg_mem_available(). The application can ensure
- * that more space is left over by reducing max_memory_to_use.
- *
- * Large images are swapped to disk using temporary files and System 7.0+'s
- * temporary folder functionality.
- *
- * Note that jmemmac.c depends on two features of MacOS that were first
- * introduced in System 7: FindFolder and the FSSpec-based calls.
- * If your application uses jmemmac.c and is run under System 6 or earlier,
- * and the jpeg library decides it needs a temporary file, it will abort,
- * printing error messages about requiring System 7. (If no temporary files
- * are created, it will run fine.)
- *
- * If you want to use jmemmac.c in an application that might be used with
- * System 6 or earlier, then you should remove dependencies on FindFolder
- * and the FSSpec calls. You will need to replace FindFolder with some
- * other mechanism for finding a place to put temporary files, and you
- * should replace the FSSpec calls with their HFS equivalents:
- *
- * FSpDelete -> HDelete
- * FSpGetFInfo -> HGetFInfo
- * FSpCreate -> HCreate
- * FSpOpenDF -> HOpen *** Note: not HOpenDF ***
- * FSMakeFSSpec -> (fill in spec by hand.)
- *
- * (Use HOpen instead of HOpenDF. HOpen is just a glue-interface to PBHOpen,
- * which is on all HFS macs. HOpenDF is a System 7 addition which avoids the
- * ages-old problem of names starting with a period.)
- *
- * Contributed by Sam Bushell (jsam@iagu.on.net) and
- * Dan Gildor (gyld@in-touch.com).
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h" /* import the system-dependent declarations */
-
-#ifndef USE_MAC_MEMMGR /* make sure user got configuration right */
- You forgot to define USE_MAC_MEMMGR in jconfig.h. /* deliberate syntax error */
-#endif
-
-#include <Memory.h> /* we use the MacOS memory manager */
-#include <Files.h> /* we use the MacOS File stuff */
-#include <Folders.h> /* we use the MacOS HFS stuff */
-#include <Script.h> /* for smSystemScript */
-#include <Gestalt.h> /* we use Gestalt to test for specific functionality */
-
-#ifndef TEMP_FILE_NAME /* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME "JPG%03d.TMP"
-#endif
-
-static int next_file_num; /* to distinguish among several temp files */
-
-
-/*
- * Memory allocation and freeing are controlled by the MacOS library
- * routines NewPtr() and DisposePtr(), which allocate fixed-address
- * storage. Unfortunately, the IJG library isn't smart enough to cope
- * with relocatable storage.
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void *) NewPtr(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
- DisposePtr((Ptr) object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: we include FAR keywords in the routine declarations simply for
- * consistency with the rest of the IJG code; FAR should expand to empty
- * on rational architectures like the Mac.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void FAR *) NewPtr(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
- DisposePtr((Ptr) object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- */
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
- long max_bytes_needed, long already_allocated)
-{
- long limit = cinfo->mem->max_memory_to_use - already_allocated;
- long slop, mem;
-
- /* Don't ask for more than what application has told us we may use */
- if (max_bytes_needed > limit && limit > 0)
- max_bytes_needed = limit;
- /* Find whether there's a big enough free block in the heap.
- * CompactMem tries to create a contiguous block of the requested size,
- * and then returns the size of the largest free block (which could be
- * much more or much less than we asked for).
- * We add some slop to ensure we don't use up all available memory.
- */
- slop = max_bytes_needed / 16 + 32768L;
- mem = CompactMem(max_bytes_needed + slop) - slop;
- if (mem < 0)
- mem = 0; /* sigh, couldn't even get the slop */
- /* Don't take more than the application says we can have */
- if (mem > limit && limit > 0)
- mem = limit;
- return mem;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed. You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- long bytes = byte_count;
- long retVal;
-
- if ( SetFPos ( info->temp_file, fsFromStart, file_offset ) != noErr )
- ERREXIT(cinfo, JERR_TFILE_SEEK);
-
- retVal = FSRead ( info->temp_file, &bytes,
- (unsigned char *) buffer_address );
- if ( retVal != noErr || bytes != byte_count )
- ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- long bytes = byte_count;
- long retVal;
-
- if ( SetFPos ( info->temp_file, fsFromStart, file_offset ) != noErr )
- ERREXIT(cinfo, JERR_TFILE_SEEK);
-
- retVal = FSWrite ( info->temp_file, &bytes,
- (unsigned char *) buffer_address );
- if ( retVal != noErr || bytes != byte_count )
- ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
- FSClose ( info->temp_file );
- FSpDelete ( &(info->tempSpec) );
-}
-
-
-/*
- * Initial opening of a backing-store object.
- *
- * This version uses FindFolder to find the Temporary Items folder,
- * and puts the temporary file in there.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
-{
- short tmpRef, vRefNum;
- long dirID;
- FInfo finderInfo;
- FSSpec theSpec;
- Str255 fName;
- OSErr osErr;
- long gestaltResponse = 0;
-
- /* Check that FSSpec calls are available. */
- osErr = Gestalt( gestaltFSAttr, &gestaltResponse );
- if ( ( osErr != noErr )
- || !( gestaltResponse & (1<<gestaltHasFSSpecCalls) ) )
- ERREXITS(cinfo, JERR_TFILE_CREATE, "- System 7.0 or later required");
- /* TO DO: add a proper error message to jerror.h. */
-
- /* Check that FindFolder is available. */
- osErr = Gestalt( gestaltFindFolderAttr, &gestaltResponse );
- if ( ( osErr != noErr )
- || !( gestaltResponse & (1<<gestaltFindFolderPresent) ) )
- ERREXITS(cinfo, JERR_TFILE_CREATE, "- System 7.0 or later required.");
- /* TO DO: add a proper error message to jerror.h. */
-
- osErr = FindFolder ( kOnSystemDisk, kTemporaryFolderType, kCreateFolder,
- &vRefNum, &dirID );
- if ( osErr != noErr )
- ERREXITS(cinfo, JERR_TFILE_CREATE, "- temporary items folder unavailable");
- /* TO DO: Try putting the temp files somewhere else. */
-
- /* Keep generating file names till we find one that's not in use */
- for (;;) {
- next_file_num++; /* advance counter */
-
- sprintf(info->temp_name, TEMP_FILE_NAME, next_file_num);
- strcpy ( (Ptr)fName+1, info->temp_name );
- *fName = strlen (info->temp_name);
- osErr = FSMakeFSSpec ( vRefNum, dirID, fName, &theSpec );
-
- if ( (osErr = FSpGetFInfo ( &theSpec, &finderInfo ) ) != noErr )
- break;
- }
-
- osErr = FSpCreate ( &theSpec, '????', '????', smSystemScript );
- if ( osErr != noErr )
- ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-
- osErr = FSpOpenDF ( &theSpec, fsRdWrPerm, &(info->temp_file) );
- if ( osErr != noErr )
- ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-
- info->tempSpec = theSpec;
-
- info->read_backing_store = read_backing_store;
- info->write_backing_store = write_backing_store;
- info->close_backing_store = close_backing_store;
- TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
- next_file_num = 0;
-
- /* max_memory_to_use will be initialized to FreeMem()'s result;
- * the calling application might later reduce it, for example
- * to leave room to invoke multiple JPEG objects.
- * Note that FreeMem returns the total number of free bytes;
- * it may not be possible to allocate a single block of this size.
- */
- return FreeMem();
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
- /* no work */
-}
diff --git a/jmemmgr.c b/jmemmgr.c
index d801b32..058a115 100644
--- a/jmemmgr.c
+++ b/jmemmgr.c
@@ -57,22 +57,25 @@
* requirement, and we had better do so too.
* There isn't any really portable way to determine the worst-case alignment
* requirement. This module assumes that the alignment requirement is
- * multiples of sizeof(ALIGN_TYPE).
- * By default, we define ALIGN_TYPE as double. This is necessary on some
+ * multiples of ALIGN_SIZE.
+ * By default, we define ALIGN_SIZE as sizeof(double). This is necessary on some
* workstations (where doubles really do need 8-byte alignment) and will work
* fine on nearly everything. If your machine has lesser alignment needs,
- * you can save a few bytes by making ALIGN_TYPE smaller.
+ * you can save a few bytes by making ALIGN_SIZE smaller.
* The only place I know of where this will NOT work is certain Macintosh
* 680x0 compilers that define double as a 10-byte IEEE extended float.
* Doing 10-byte alignment is counterproductive because longwords won't be
- * aligned well. Put "#define ALIGN_TYPE long" in jconfig.h if you have
+ * aligned well. Put "#define ALIGN_SIZE 4" in jconfig.h if you have
* such a compiler.
*/
-#ifndef ALIGN_TYPE /* so can override from jconfig.h */
-#define ALIGN_TYPE double
+#ifndef ALIGN_SIZE /* so can override from jconfig.h */
+#ifndef WITH_SIMD
+#define ALIGN_SIZE SIZEOF(double)
+#else
+#define ALIGN_SIZE 16 /* Most SIMD implementations require this */
#endif
-
+#endif
/*
* We allocate objects from "pools", where each pool is gotten with a single
@@ -81,34 +84,24 @@
* header with a link to the next pool of the same class.
* Small and large pool headers are identical except that the latter's
* link pointer must be FAR on 80x86 machines.
- * Notice that the "real" header fields are union'ed with a dummy ALIGN_TYPE
- * field. This forces the compiler to make SIZEOF(small_pool_hdr) a multiple
- * of the alignment requirement of ALIGN_TYPE.
*/
-typedef union small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct * small_pool_ptr;
-typedef union small_pool_struct {
- struct {
- small_pool_ptr next; /* next in list of pools */
- size_t bytes_used; /* how many bytes already used within pool */
- size_t bytes_left; /* bytes still available in this pool */
- } hdr;
- ALIGN_TYPE dummy; /* included in union to ensure alignment */
+typedef struct small_pool_struct {
+ small_pool_ptr next; /* next in list of pools */
+ size_t bytes_used; /* how many bytes already used within pool */
+ size_t bytes_left; /* bytes still available in this pool */
} small_pool_hdr;
-typedef union large_pool_struct FAR * large_pool_ptr;
+typedef struct large_pool_struct FAR * large_pool_ptr;
-typedef union large_pool_struct {
- struct {
- large_pool_ptr next; /* next in list of pools */
- size_t bytes_used; /* how many bytes already used within pool */
- size_t bytes_left; /* bytes still available in this pool */
- } hdr;
- ALIGN_TYPE dummy; /* included in union to ensure alignment */
+typedef struct large_pool_struct {
+ large_pool_ptr next; /* next in list of pools */
+ size_t bytes_used; /* how many bytes already used within pool */
+ size_t bytes_left; /* bytes still available in this pool */
} large_pool_hdr;
-
/*
* Here is the full definition of a memory manager object.
*/
@@ -197,16 +190,16 @@
pool_id, mem->total_space_allocated);
for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
- lhdr_ptr = lhdr_ptr->hdr.next) {
+ lhdr_ptr = lhdr_ptr->next) {
fprintf(stderr, " Large chunk used %ld\n",
- (long) lhdr_ptr->hdr.bytes_used);
+ (long) lhdr_ptr->bytes_used);
}
for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
- shdr_ptr = shdr_ptr->hdr.next) {
+ shdr_ptr = shdr_ptr->next) {
fprintf(stderr, " Small chunk used %ld free %ld\n",
- (long) shdr_ptr->hdr.bytes_used,
- (long) shdr_ptr->hdr.bytes_left);
+ (long) shdr_ptr->bytes_used,
+ (long) shdr_ptr->bytes_left);
}
}
@@ -236,6 +229,10 @@
* and we also distinguish the first pool of a class from later ones.
* NOTE: the values given work fairly well on both 16- and 32-bit-int
* machines, but may be too small if longs are 64 bits or more.
+ *
+ * Since we do not know what alignment malloc() gives us, we have to
+ * allocate ALIGN_SIZE-1 extra space per pool to have room for alignment
+ * adjustment.
*/
static const size_t first_pool_slop[JPOOL_NUMPOOLS] =
@@ -260,33 +257,36 @@
my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
small_pool_ptr hdr_ptr, prev_hdr_ptr;
char * data_ptr;
- size_t odd_bytes, min_request, slop;
+ size_t min_request, slop;
+
+ /*
+ * Round up the requested size to a multiple of ALIGN_SIZE in order
+ * to ensure alignment for the next object allocated in the same pool
+ * and so that algorithms can safely access memory past the end of the
+ * object, up to the next alignment boundary.
+ */
+ sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);
/* Check for unsatisfiable request (do now to ensure no overflow below) */
- if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(small_pool_hdr)))
+ if ((SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
out_of_memory(cinfo, 1); /* request exceeds malloc's ability */
- /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
- odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
- if (odd_bytes > 0)
- sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
/* See if space is available in any existing pool */
if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
prev_hdr_ptr = NULL;
hdr_ptr = mem->small_list[pool_id];
while (hdr_ptr != NULL) {
- if (hdr_ptr->hdr.bytes_left >= sizeofobject)
+ if (hdr_ptr->bytes_left >= sizeofobject)
break; /* found pool with enough space */
prev_hdr_ptr = hdr_ptr;
- hdr_ptr = hdr_ptr->hdr.next;
+ hdr_ptr = hdr_ptr->next;
}
/* Time to make a new pool? */
if (hdr_ptr == NULL) {
/* min_request is what we need now, slop is what will be leftover */
- min_request = sizeofobject + SIZEOF(small_pool_hdr);
+ min_request = SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1;
if (prev_hdr_ptr == NULL) /* first pool in class? */
slop = first_pool_slop[pool_id];
else
@@ -305,20 +305,23 @@
}
mem->total_space_allocated += min_request + slop;
/* Success, initialize the new pool header and add to end of list */
- hdr_ptr->hdr.next = NULL;
- hdr_ptr->hdr.bytes_used = 0;
- hdr_ptr->hdr.bytes_left = sizeofobject + slop;
+ hdr_ptr->next = NULL;
+ hdr_ptr->bytes_used = 0;
+ hdr_ptr->bytes_left = sizeofobject + slop;
if (prev_hdr_ptr == NULL) /* first pool in class? */
mem->small_list[pool_id] = hdr_ptr;
else
- prev_hdr_ptr->hdr.next = hdr_ptr;
+ prev_hdr_ptr->next = hdr_ptr;
}
/* OK, allocate the object from the current pool */
- data_ptr = (char *) (hdr_ptr + 1); /* point to first data byte in pool */
- data_ptr += hdr_ptr->hdr.bytes_used; /* point to place for object */
- hdr_ptr->hdr.bytes_used += sizeofobject;
- hdr_ptr->hdr.bytes_left -= sizeofobject;
+ data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+ data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+ if ((unsigned long)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
+ data_ptr += ALIGN_SIZE - (unsigned long)data_ptr % ALIGN_SIZE;
+ data_ptr += hdr_ptr->bytes_used; /* point to place for object */
+ hdr_ptr->bytes_used += sizeofobject;
+ hdr_ptr->bytes_left -= sizeofobject;
return (void *) data_ptr;
}
@@ -344,37 +347,45 @@
{
my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
large_pool_ptr hdr_ptr;
- size_t odd_bytes;
+ char FAR * data_ptr;
+
+ /*
+ * Round up the requested size to a multiple of ALIGN_SIZE so that
+ * algorithms can safely access memory past the end of the object, up to
+ * the next alignment boundary.
+ */
+ sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);
/* Check for unsatisfiable request (do now to ensure no overflow below) */
- if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)))
+ if ((SIZEOF(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
out_of_memory(cinfo, 3); /* request exceeds malloc's ability */
- /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
- odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
- if (odd_bytes > 0)
- sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
/* Always make a new pool */
if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
- SIZEOF(large_pool_hdr));
+ SIZEOF(large_pool_hdr) +
+ ALIGN_SIZE - 1);
if (hdr_ptr == NULL)
out_of_memory(cinfo, 4); /* jpeg_get_large failed */
- mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr);
+ mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr) + ALIGN_SIZE - 1;
/* Success, initialize the new pool header and add to list */
- hdr_ptr->hdr.next = mem->large_list[pool_id];
+ hdr_ptr->next = mem->large_list[pool_id];
/* We maintain space counts in each pool header for statistical purposes,
* even though they are not needed for allocation.
*/
- hdr_ptr->hdr.bytes_used = sizeofobject;
- hdr_ptr->hdr.bytes_left = 0;
+ hdr_ptr->bytes_used = sizeofobject;
+ hdr_ptr->bytes_left = 0;
mem->large_list[pool_id] = hdr_ptr;
- return (void FAR *) (hdr_ptr + 1); /* point to first data byte in pool */
+ data_ptr = (char FAR *) hdr_ptr; /* start at the large pool header... */
+ data_ptr += SIZEOF(large_pool_hdr); /* ...skip the header (large, not small)... */
+ if ((unsigned long)data_ptr % ALIGN_SIZE) /* ...and round up to ALIGN_SIZE */
+ data_ptr += ALIGN_SIZE - (unsigned long)data_ptr % ALIGN_SIZE;
+
+ return (void FAR *) data_ptr;
}
@@ -389,6 +400,10 @@
* this chunking of rows. The rowsperchunk value is left in the mem manager
* object so that it can be saved away if this sarray is the workspace for
* a virtual array.
+ *
+ * Since we are often upsampling with a factor of 2, we align the size (not
+ * the start) to 2 * ALIGN_SIZE so that the upsampling routines don't have
+ * to be as careful about size.
*/
METHODDEF(JSAMPARRAY)
@@ -402,6 +417,11 @@
JDIMENSION rowsperchunk, currow, i;
long ltemp;
+ /* Make sure each row is properly aligned */
+ if ((ALIGN_SIZE % SIZEOF(JSAMPLE)) != 0)
+ out_of_memory(cinfo, 5); /* safety check */
+ samplesperrow = jround_up(samplesperrow, (2 * ALIGN_SIZE) / SIZEOF(JSAMPLE));
+
/* Calculate max # of rows allowed in one allocation chunk */
ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
((long) samplesperrow * SIZEOF(JSAMPLE));
@@ -450,6 +470,10 @@
JDIMENSION rowsperchunk, currow, i;
long ltemp;
+ /* Make sure each row is properly aligned */
+ if ((SIZEOF(JBLOCK) % ALIGN_SIZE) != 0)
+ out_of_memory(cinfo, 6); /* safety check */
+
/* Calculate max # of rows allowed in one allocation chunk */
ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
((long) blocksperrow * SIZEOF(JBLOCK));
@@ -968,9 +992,9 @@
mem->large_list[pool_id] = NULL;
while (lhdr_ptr != NULL) {
- large_pool_ptr next_lhdr_ptr = lhdr_ptr->hdr.next;
- space_freed = lhdr_ptr->hdr.bytes_used +
- lhdr_ptr->hdr.bytes_left +
+ large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
+ space_freed = lhdr_ptr->bytes_used +
+ lhdr_ptr->bytes_left +
SIZEOF(large_pool_hdr);
jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed);
mem->total_space_allocated -= space_freed;
@@ -982,9 +1006,9 @@
mem->small_list[pool_id] = NULL;
while (shdr_ptr != NULL) {
- small_pool_ptr next_shdr_ptr = shdr_ptr->hdr.next;
- space_freed = shdr_ptr->hdr.bytes_used +
- shdr_ptr->hdr.bytes_left +
+ small_pool_ptr next_shdr_ptr = shdr_ptr->next;
+ space_freed = shdr_ptr->bytes_used +
+ shdr_ptr->bytes_left +
SIZEOF(small_pool_hdr);
jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
mem->total_space_allocated -= space_freed;
@@ -1041,16 +1065,16 @@
* in common if and only if X is a power of 2, ie has only one one-bit.
* Some compilers may give an "unreachable code" warning here; ignore it.
*/
- if ((SIZEOF(ALIGN_TYPE) & (SIZEOF(ALIGN_TYPE)-1)) != 0)
+ if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
/* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
- * a multiple of SIZEOF(ALIGN_TYPE).
+ * a multiple of ALIGN_SIZE.
* Again, an "unreachable code" warning may be ignored here.
* But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
*/
test_mac = (size_t) MAX_ALLOC_CHUNK;
if ((long) test_mac != MAX_ALLOC_CHUNK ||
- (MAX_ALLOC_CHUNK % SIZEOF(ALIGN_TYPE)) != 0)
+ (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
diff --git a/jmemname.c b/jmemname.c
deleted file mode 100644
index ed96dee..0000000
--- a/jmemname.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * jmemname.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a generic implementation of the system-dependent
- * portion of the JPEG memory manager. This implementation assumes that
- * you must explicitly construct a name for each temp file.
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h" /* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET /* pre-ANSI systems may not define this; */
-#define SEEK_SET 0 /* if not, assume 0 is correct */
-#endif
-
-#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */
-#define READ_BINARY "r"
-#define RW_BINARY "w+"
-#else
-#ifdef VMS /* VMS is very nonstandard */
-#define READ_BINARY "rb", "ctx=stm"
-#define RW_BINARY "w+b", "ctx=stm"
-#else /* standard ANSI-compliant case */
-#define READ_BINARY "rb"
-#define RW_BINARY "w+b"
-#endif
-#endif
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is system-dependent!
- *
- * The code as given is suitable for most Unix systems, and it is easily
- * modified for most non-Unix systems. Some notes:
- * 1. The temp file is created in the directory named by TEMP_DIRECTORY.
- * The default value is /usr/tmp, which is the conventional place for
- * creating large temp files on Unix. On other systems you'll probably
- * want to change the file location. You can do this by editing the
- * #define, or (preferred) by defining TEMP_DIRECTORY in jconfig.h.
- *
- * 2. If you need to change the file name as well as its location,
- * you can override the TEMP_FILE_NAME macro. (Note that this is
- * actually a printf format string; it must contain %s and %d.)
- * Few people should need to do this.
- *
- * 3. mktemp() is used to ensure that multiple processes running
- * simultaneously won't select the same file names. If your system
- * doesn't have mktemp(), define NO_MKTEMP to do it the hard way.
- * (If you don't have <errno.h>, also define NO_ERRNO_H.)
- *
- * 4. You probably want to define NEED_SIGNAL_CATCHER so that cjpeg.c/djpeg.c
- * will cause the temp files to be removed if you stop the program early.
- */
-
-#ifndef TEMP_DIRECTORY /* can override from jconfig.h or Makefile */
-#define TEMP_DIRECTORY "/usr/tmp/" /* recommended setting for Unix */
-#endif
-
-static int next_file_num; /* to distinguish among several temp files */
-
-#ifdef NO_MKTEMP
-
-#ifndef TEMP_FILE_NAME /* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME "%sJPG%03d.TMP"
-#endif
-
-#ifndef NO_ERRNO_H
-#include <errno.h> /* to define ENOENT */
-#endif
-
-/* ANSI C specifies that errno is a macro, but on older systems it's more
- * likely to be a plain int variable. And not all versions of errno.h
- * bother to declare it, so we have to in order to be most portable. Thus:
- */
-#ifndef errno
-extern int errno;
-#endif
-
-
-LOCAL(void)
-select_file_name (char * fname)
-{
- FILE * tfile;
-
- /* Keep generating file names till we find one that's not in use */
- for (;;) {
- next_file_num++; /* advance counter */
- sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
- if ((tfile = fopen(fname, READ_BINARY)) == NULL) {
- /* fopen could have failed for a reason other than the file not
- * being there; for example, file there but unreadable.
- * If <errno.h> isn't available, then we cannot test the cause.
- */
-#ifdef ENOENT
- if (errno != ENOENT)
- continue;
-#endif
- break;
- }
- fclose(tfile); /* oops, it's there; close tfile & try again */
- }
-}
-
-#else /* ! NO_MKTEMP */
-
-/* Note that mktemp() requires the initial filename to end in six X's */
-#ifndef TEMP_FILE_NAME /* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME "%sJPG%dXXXXXX"
-#endif
-
-LOCAL(void)
-select_file_name (char * fname)
-{
- next_file_num++; /* advance counter */
- sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
- mktemp(fname); /* make sure file name is unique */
- /* mktemp replaces the trailing XXXXXX with a unique string of characters */
-}
-
-#endif /* NO_MKTEMP */
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
- free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
- return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
- free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM /* so can override from makefile */
-#define DEFAULT_MAX_MEM 1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
- long max_bytes_needed, long already_allocated)
-{
- return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed. You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- if (fseek(info->temp_file, file_offset, SEEK_SET))
- ERREXIT(cinfo, JERR_TFILE_SEEK);
- if (JFREAD(info->temp_file, buffer_address, byte_count)
- != (size_t) byte_count)
- ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- void FAR * buffer_address,
- long file_offset, long byte_count)
-{
- if (fseek(info->temp_file, file_offset, SEEK_SET))
- ERREXIT(cinfo, JERR_TFILE_SEEK);
- if (JFWRITE(info->temp_file, buffer_address, byte_count)
- != (size_t) byte_count)
- ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
- fclose(info->temp_file); /* close the file */
- unlink(info->temp_name); /* delete the file */
-/* If your system doesn't have unlink(), use remove() instead.
- * remove() is the ANSI-standard name for this function, but if
- * your system was ANSI you'd be using jmemansi.c, right?
- */
- TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
-{
- select_file_name(info->temp_name);
- if ((info->temp_file = fopen(info->temp_name, RW_BINARY)) == NULL)
- ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
- info->read_backing_store = read_backing_store;
- info->write_backing_store = write_backing_store;
- info->close_backing_store = close_backing_store;
- TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
- next_file_num = 0; /* initialize temp file name generator */
- return DEFAULT_MAX_MEM; /* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
- /* no work */
-}
diff --git a/jmorecfg.h b/jmorecfg.h
index 54a7d1c..0e7fb72 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -2,6 +2,7 @@
* jmorecfg.h
*
* Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -62,11 +63,11 @@
#else /* not HAVE_UNSIGNED_CHAR */
typedef char JSAMPLE;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
#define GETJSAMPLE(value) ((int) (value))
#else
#define GETJSAMPLE(value) ((int) (value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
#endif /* HAVE_UNSIGNED_CHAR */
@@ -113,11 +114,11 @@
#else /* not HAVE_UNSIGNED_CHAR */
typedef char JOCTET;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
#define GETJOCTET(value) (value)
#else
#define GETJOCTET(value) ((value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
#endif /* HAVE_UNSIGNED_CHAR */
@@ -134,11 +135,11 @@
#ifdef HAVE_UNSIGNED_CHAR
typedef unsigned char UINT8;
#else /* not HAVE_UNSIGNED_CHAR */
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
typedef char UINT8;
-#else /* not CHAR_IS_UNSIGNED */
+#else /* not __CHAR_UNSIGNED__ */
typedef short UINT8;
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */
#endif /* HAVE_UNSIGNED_CHAR */
/* UINT16 must hold at least the values 0..65535. */
@@ -316,31 +317,37 @@
#define RGB_BLUE 2 /* Offset of Blue */
#define RGB_PIXELSIZE 3 /* JSAMPLEs per RGB scanline element */
+#define JPEG_NUMCS 12 /* number of entries in each table below */
+/* Lookup tables indexed by color space (jquant1.c uses cinfo->out_color_space); -1 marks spaces with no RGB layout here. */
+static const int rgb_red[JPEG_NUMCS] = {
+ -1, -1, RGB_RED, -1, -1, -1, 0, 0, 2, 2, 3, 1
+};
+
+static const int rgb_green[JPEG_NUMCS] = {
+ -1, -1, RGB_GREEN, -1, -1, -1, 1, 1, 1, 1, 2, 2
+};
+
+static const int rgb_blue[JPEG_NUMCS] = {
+ -1, -1, RGB_BLUE, -1, -1, -1, 2, 2, 0, 0, 1, 3
+};
+
+static const int rgb_pixelsize[JPEG_NUMCS] = {
+ -1, -1, RGB_PIXELSIZE, -1, -1, -1, 3, 4, 3, 4, 4, 4
+};
/* Definitions for speed-related optimizations. */
-
-/* If your compiler supports inline functions, define INLINE
- * as the inline keyword; otherwise define it as empty.
- */
-
-#ifndef INLINE
-#ifdef __GNUC__ /* for instance, GNU C knows about inline */
-#define INLINE __inline__
-#endif
-#ifndef INLINE
-#define INLINE /* default is to define it as empty */
-#endif
-#endif
-
-
/* On some machines (notably 68000 series) "int" is 32 bits, but multiplying
* two 16-bit shorts is faster than multiplying two ints. Define MULTIPLIER
* as short on such a machine. MULTIPLIER must be at least 16 bits wide.
*/
#ifndef MULTIPLIER
+#ifndef WITH_SIMD
#define MULTIPLIER int /* type for fastest integer multiply */
+#else
+#define MULTIPLIER short /* prefer 16-bit with SIMD for parallelism */
+#endif
#endif
diff --git a/jpeg.dsp b/jpeg.dsp
new file mode 100644
index 0000000..5f97af7
--- /dev/null
+++ b/jpeg.dsp
@@ -0,0 +1,332 @@
+# Microsoft Developer Studio Project File - Name="jpeg" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Static Library" 0x0104
+
+CFG=jpeg - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "jpeg.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "jpeg.mak" CFG="jpeg - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "jpeg - Win32 Release" (based on "Win32 (x86) Static Library")
+!MESSAGE "jpeg - Win32 Debug" (based on "Win32 (x86) Static Library")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "jpeg - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "..\Release"
+# PROP Intermediate_Dir "..\Release\jpeg"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c
+# ADD CPP /nologo /MT /W3 /GX /O2 /D "NDEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /YX /FD /c /I "win"
+# ADD BASE RSC /l 0x419 /d "NDEBUG"
+# ADD RSC /l 0x809 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo
+
+!ELSEIF "$(CFG)" == "jpeg - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "..\Debug"
+# PROP BASE Intermediate_Dir "..\Debug\jpeg"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c
+# ADD CPP /nologo /MTd /W3 /Gm /GX /Zi /Od /D "_DEBUG" /D "_LIB" /D "WIN32" /D "_MBCS" /YX /FD /GZ /c /I "win"
+# ADD BASE RSC /l 0x419 /d "_DEBUG"
+# ADD RSC /l 0x809 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo
+
+!ENDIF
+
+# Begin Target
+
+# Name "jpeg - Win32 Release"
+# Name "jpeg - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\jcapimin.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcapistd.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jccoefct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jccolor.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcdctmgr.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jchuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcinit.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcmainct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcmarker.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcmaster.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcomapi.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcparam.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcphuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcprepct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcsample.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jctrans.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdapimin.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdapistd.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdatadst.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdatasrc.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdcoefct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdcolor.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jddctmgr.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdhuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdinput.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmainct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmarker.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmaster.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmerge.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdphuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdpostct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdsample.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdtrans.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfdctflt.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfdctfst.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfdctint.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctflt.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctfst.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctint.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctred.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmemmgr.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmemnobs.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jquant1.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jquant2.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jsimd_none.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jutils.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\jchuff.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdct.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdhuff.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmemsys.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmorecfg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpegint.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpeglib.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jsimd.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jsimddct.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jversion.h
+# End Source File
+# End Group
+# End Target
+# End Project
diff --git a/jpeglib.h b/jpeglib.h
index d1be8dd..31f0dd6 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -2,6 +2,7 @@
* jpeglib.h
*
* Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -203,13 +204,22 @@
/* Known color spaces. */
+#define JCS_EXTENSIONS 1
+
typedef enum {
JCS_UNKNOWN, /* error/unspecified */
JCS_GRAYSCALE, /* monochrome */
- JCS_RGB, /* red/green/blue */
+ JCS_RGB, /* red/green/blue as specified by the RGB_RED, RGB_GREEN,
+ RGB_BLUE, and RGB_PIXELSIZE macros */
JCS_YCbCr, /* Y/Cb/Cr (also known as YUV) */
JCS_CMYK, /* C/M/Y/K */
- JCS_YCCK /* Y/Cb/Cr/K */
+ JCS_YCCK, /* Y/Cb/Cr/K */
+ JCS_EXT_RGB, /* red/green/blue */
+ JCS_EXT_RGBX, /* red/green/blue/x */
+ JCS_EXT_BGR, /* blue/green/red */
+ JCS_EXT_BGRX, /* blue/green/red/x */
+ JCS_EXT_XBGR, /* x/blue/green/red */
+ JCS_EXT_XRGB /* x/red/green/blue (last entry: no trailing comma, for C89/C++98) */
} J_COLOR_SPACE;
/* DCT/IDCT algorithm options. */
diff --git a/jpegut.c b/jpegut.c
new file mode 100644
index 0000000..3b02fba
--- /dev/null
+++ b/jpegut.c
@@ -0,0 +1,384 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./rrtimer.h"
+#include "./turbojpeg.h"
+
+#define _catch(f) {if((f)==-1) {printf("TJPEG: %s\n", tjGetErrorStr()); goto finally;}}
+
+const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
+const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
+
+int pixels[9][3]=
+{
+ {0, 255, 0},
+ {255, 0, 255},
+ {255, 255, 0},
+ {0, 0, 255},
+ {0, 255, 255},
+ {255, 0, 0},
+ {255, 255, 255},
+ {0, 0, 0},
+ {255, 0, 0}
+};
+
+/* Fill a w x h image (ps bytes/pixel) with the reference test pattern: rows
+   0-15 are a red/white 8x8 checkerboard, later rows a black/yellow one.
+   flags: TJ_BGR swaps R and B, TJ_ALPHAFIRST shifts the channels by one
+   byte, TJ_BOTTOMUP stores logical row 0 in the last buffer row. */
+void initbuf(unsigned char *buf, int w, int h, int ps, int flags)
+{
+  int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2;
+  int row, line, col, even; unsigned char *pix;
+  if(flags&TJ_ALPHAFIRST) {roffset++; goffset++; boffset++;}
+  memset(buf, 0, w*h*ps);
+  /* One pass over the h logical rows, so that images shorter than 16 rows
+     never index outside buf. */
+  for(row=0; row<h; row++)
+  {
+    line=(flags&TJ_BOTTOMUP)? h-row-1:row;
+    for(col=0; col<w; col++)
+    {
+      pix=&buf[(w*line+col)*ps];
+      even=(((row/8)+(col/8))%2==0);
+      if(row<16)
+      {
+        /* top band: red always on, green/blue only on even squares */
+        pix[roffset]=255;
+        if(even) pix[goffset]=pix[boffset]=255;
+      }
+      else if(!even)
+      {
+        pix[roffset]=pix[goffset]=255;  /* yellow on odd squares */
+      }
+    }
+  }
+}
+
+/* Debug helper: print every pixel as R/G/B, honoring TJ_BGR and TJ_ALPHAFIRST. */
+int dumpbuf(unsigned char *buf, int w, int h, int ps, int flags)
+{
+  int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i, j;
+  if(flags&TJ_ALPHAFIRST) {roffset++; goffset++; boffset++;}
+  for(i=0; i<h; i++)
+  {
+    for(j=0; j<w; j++)
+      printf("%.3d/%.3d/%.3d ", buf[(w*i+j)*ps+roffset],
+        buf[(w*i+j)*ps+goffset], buf[(w*i+j)*ps+boffset]);
+    printf("\n");
+  }
+  return 0;  /* nothing can fail; the int return type is kept for callers */
+}
+
+/* Verify that a decompressed w x h image (ps bytes/pixel) matches the pattern
+   written by initbuf(), within a small tolerance for JPEG rounding.  With
+   subsamp==TJ_GRAYSCALE the gray equivalents of the pattern colors are
+   expected instead.  Returns 1 if every pixel matches, 0 otherwise. */
+int checkbuf(unsigned char *buf, int w, int h, int ps, int subsamp, int flags)
+{
+  int roffset=(flags&TJ_BGR)?2:0, goffset=1, boffset=(flags&TJ_BGR)?0:2, i,
+    _i, j;
+  if(flags&TJ_ALPHAFIRST) {roffset++; goffset++; boffset++;}
+  if(subsamp==TJ_GRAYSCALE)
+  {
+    for(_i=0; _i<16 && _i<h; _i++)  /* top band, clamped so h<16 stays in bounds */
+    {
+      if(flags&TJ_BOTTOMUP) i=h-_i-1; else i=_i;
+      for(j=0; j<w; j++)
+      {
+        unsigned char r=buf[(w*i+j)*ps+roffset],
+          g=buf[(w*i+j)*ps+goffset],
+          b=buf[(w*i+j)*ps+boffset];
+        if(((_i/8)+(j/8))%2==0)
+        {
+          if(r<253 || g<253 || b<253) return 0;  /* white square */
+        }
+        else
+        {
+          if(r<74 || r>78 || g<74 || g>78 || b<74 || b>78) return 0;  /* luma of red, ~76 */
+        }
+      }
+    }
+    for(_i=16; _i<h; _i++)
+    {
+      if(flags&TJ_BOTTOMUP) i=h-_i-1; else i=_i;
+      for(j=0; j<w; j++)
+      {
+        unsigned char r=buf[(w*i+j)*ps+roffset],
+          g=buf[(w*i+j)*ps+goffset],
+          b=buf[(w*i+j)*ps+boffset];
+        if(((_i/8)+(j/8))%2==0)
+        {
+          if(r>2 || g>2 || b>2) return 0;  /* black square */
+        }
+        else
+        {
+          if(r<224 || r>228 || g<224 || g>228 || b<224 || b>228) return 0;  /* luma of yellow, ~226 */
+        }
+      }
+    }
+  }
+  else
+  {
+    for(_i=0; _i<16 && _i<h; _i++)  /* top band, clamped so h<16 stays in bounds */
+    {
+      if(flags&TJ_BOTTOMUP) i=h-_i-1; else i=_i;
+      for(j=0; j<w; j++)
+      {
+        if(buf[(w*i+j)*ps+roffset]<253) return 0;  /* red is on in both squares */
+        if(((_i/8)+(j/8))%2==0)
+        {
+          if(buf[(w*i+j)*ps+goffset]<253 || buf[(w*i+j)*ps+boffset]<253) return 0;
+        }
+        else
+        {
+          if(buf[(w*i+j)*ps+goffset]>2 || buf[(w*i+j)*ps+boffset]>2) return 0;
+        }
+      }
+    }
+    for(_i=16; _i<h; _i++)
+    {
+      if(flags&TJ_BOTTOMUP) i=h-_i-1; else i=_i;
+      for(j=0; j<w; j++)
+      {
+        if(buf[(w*i+j)*ps+boffset]>2) return 0;  /* blue is off in both squares */
+        if(((_i/8)+(j/8))%2==0)
+        {
+          if(buf[(w*i+j)*ps+roffset]>2 || buf[(w*i+j)*ps+goffset]>2) return 0;
+        }
+        else
+        {
+          if(buf[(w*i+j)*ps+roffset]<253 || buf[(w*i+j)*ps+goffset]<253) return 0;
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+/* Write jpgbufsize bytes from jpegbuf to the file named filename, replacing
+   any existing file.  Failures (including a failed flush on close) are
+   reported on stdout; nothing is returned to the caller. */
+void writejpeg(unsigned char *jpegbuf, unsigned long jpgbufsize, char *filename)
+{
+  FILE *outfile=fopen(filename, "wb");
+  int ok;
+  if(outfile==NULL)
+  {
+    printf("ERROR: Could not open %s for writing.\n", filename);
+    return;
+  }
+  ok=(fwrite(jpegbuf, jpgbufsize, 1, outfile)==1);
+  /* fclose() flushes buffered data, so its failure is a write error as well */
+  if(fclose(outfile)!=0) ok=0;
+  if(!ok) printf("ERROR: Could not write to %s.\n", filename);
+}
+
+/* Compress the initbuf() test pattern (w x h, ps bytes/pixel, layout given by
+   flags) into jpegbuf with the requested subsampling and quality, store the
+   compressed length in *size, and save the stream as
+   <basefilename>_enc_<format>_<BU|TD>_<subsamp>Q<qual>.jpg.  Errors are
+   printed and abandon the test. */
+void gentestjpeg(tjhandle hnd, unsigned char *jpegbuf, unsigned long *size,
+  int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
+{
+  char tempstr[1024]; unsigned char *bmpbuf=NULL;
+  const char *pixformat; double t;
+  int bgr=flags&TJ_BGR, afirst=flags&TJ_ALPHAFIRST, bu=flags&TJ_BOTTOMUP;
+
+  /* name of the pixel layout, used in the log line and the file name */
+  if(ps==3) pixformat=bgr? "BGR":"RGB";
+  else if(bgr) pixformat=afirst? "ABGR":"BGRA";
+  else pixformat=afirst? "ARGB":"RGBA";
+  printf("%s %s -> %s Q%d ... ", pixformat, bu? "Bottom-Up":"Top-Down ",
+    _subnamel[subsamp], qual);
+
+  bmpbuf=(unsigned char *)malloc(w*h*ps+1);
+  if(bmpbuf==NULL)
+  {
+    printf("ERROR: Could not allocate buffer\n"); goto finally;
+  }
+  initbuf(bmpbuf, w, h, ps, flags);
+  memset(jpegbuf, 0, TJBUFSIZE(w, h));
+
+  t=rrtime();  /* time only the tjCompress() call */
+  _catch(tjCompress(hnd, bmpbuf, w, 0, h, ps, jpegbuf, size, subsamp, qual, flags));
+  t=rrtime()-t;
+  sprintf(tempstr, "%s_enc_%s_%s_%sQ%d.jpg", basefilename, pixformat,
+    bu? "BU":"TD", _subnames[subsamp], qual);
+  writejpeg(jpegbuf, *size, tempstr);
+  printf("Done. %f ms\n Result in %s\n", t*1000., tempstr);
+
+  finally:
+  if(bmpbuf) free(bmpbuf);
+}
+
+/* Decompress the JPEG stream in jpegbuf (jpegsize bytes) into a freshly
+   allocated w x h buffer (ps bytes/pixel, layout given by flags) and compare
+   it with the initbuf() pattern via checkbuf().  Prints "Passed." or
+   "FAILED!" (with a pixel dump) and the decompression time. */
+void gentestbmp(tjhandle hnd, unsigned char *jpegbuf, unsigned long jpegsize,
+  int w, int h, int ps, char *basefilename, int subsamp, int qual, int flags)
+{
+  unsigned char *bmpbuf=NULL;
+  const char *pixformat; int _w=0, _h=0; double t;
+  int bgr=flags&TJ_BGR, afirst=flags&TJ_ALPHAFIRST;
+
+  if(ps==3) pixformat=bgr? "BGR":"RGB";
+  else if(bgr) pixformat=afirst? "ABGR":"BGRA";
+  else pixformat=afirst? "ARGB":"RGBA";
+  printf("JPEG -> %s %s ... ", pixformat,
+    (flags&TJ_BOTTOMUP)? "Bottom-Up":"Top-Down ");
+
+  /* the stream must describe an image of exactly the expected size */
+  _catch(tjDecompressHeader(hnd, jpegbuf, jpegsize, &_w, &_h));
+  if(_w!=w || _h!=h)
+  {
+    printf("Incorrect JPEG header\n"); goto finally;
+  }
+
+  bmpbuf=(unsigned char *)malloc(w*h*ps+1);
+  if(bmpbuf==NULL)
+  {
+    printf("ERROR: Could not allocate buffer\n"); goto finally;
+  }
+  memset(bmpbuf, 0, w*ps*h);
+
+  t=rrtime();
+  _catch(tjDecompress(hnd, jpegbuf, jpegsize, bmpbuf, w, w*ps, h, ps, flags));
+  t=rrtime()-t;
+
+  if(!checkbuf(bmpbuf, w, h, ps, subsamp, flags)) {printf("FAILED!"); dumpbuf(bmpbuf, w, h, ps, flags);}
+  else printf("Passed.");
+  printf(" %f ms\n\n", t*1000.);
+
+  finally:
+  if(bmpbuf) free(bmpbuf);
+}
+
+/* Compress the test pattern and decompress it again for every pixel layout
+   that is valid for ps bytes/pixel (alpha-first layouts only when ps==4). */
+void dotest(int w, int h, int ps, int subsamp, char *basefilename)
+{
+  tjhandle hnd=NULL, dhnd=NULL; unsigned char *jpegbuf=NULL;
+  unsigned long size=0;  /* stays defined if a compression step bails out early */
+
+  if((jpegbuf=(unsigned char *)malloc(TJBUFSIZE(w, h))) == NULL)
+    {puts("ERROR: Could not allocate buffer."); goto finally;}
+
+  if((hnd=tjInitCompress())==NULL)
+    {printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr()); goto finally;}
+  if((dhnd=tjInitDecompress())==NULL)
+    {printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr()); goto finally;}
+  /* each pair below encodes with one layout and decodes with the same one */
+  gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, 0);
+  gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, 0);
+
+  gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
+  gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR);
+
+  gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
+  gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BOTTOMUP);
+
+  gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
+  gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_BGR|TJ_BOTTOMUP);
+  /* 4-byte pixels are additionally tested with the extra byte first */
+  if(ps==4)
+  {
+    gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
+    gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST);
+
+    gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
+    gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR);
+
+    gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
+    gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BOTTOMUP);
+
+    gentestjpeg(hnd, jpegbuf, &size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
+    gentestbmp(dhnd, jpegbuf, size, w, h, ps, basefilename, subsamp, 100, TJ_ALPHAFIRST|TJ_BGR|TJ_BOTTOMUP);
+  }
+
+  finally:
+  if(hnd) tjDestroy(hnd);
+  if(dhnd) tjDestroy(dhnd);
+
+  if(jpegbuf) free(jpegbuf);
+}
+
+#define MAXLENGTH 2048
+
+/* Buffer size regression test: compress many small and thin 4-byte-per-pixel
+   images (TJ_444, quality 100, TJ_BGR) into buffers of exactly TJBUFSIZE()
+   bytes, each size once as i x j and once as j x i.  Only the return value
+   of tjCompress() is checked. */
+void dotest1(void)
+{
+  int i, j, i2; unsigned char *bmpbuf=NULL, *jpgbuf=NULL;
+  tjhandle hnd=NULL; unsigned long size;
+  if((hnd=tjInitCompress())==NULL)
+    {printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr()); goto finally;}
+  printf("Buffer size regression test\n");
+  for(j=1; j<48; j++)
+  {
+    for(i=1; i<(j==1?MAXLENGTH:48); i++)
+    {
+      if(i%100==0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", i, j);
+      if((bmpbuf=(unsigned char *)malloc(i*j*4))==NULL
+        || (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(i, j)))==NULL)
+      {
+        printf("Memory allocation failure\n"); goto finally;
+      }
+      memset(bmpbuf, 0, i*j*4);
+      for(i2=0; i2<i*j; i2++)
+      {
+        bmpbuf[i2*4]=pixels[i2%9][2];
+        bmpbuf[i2*4+1]=pixels[i2%9][1];
+        bmpbuf[i2*4+2]=pixels[i2%9][0];  /* stride is 4 bytes/pixel for all channels */
+      }
+      _catch(tjCompress(hnd, bmpbuf, i, i*4, j, 4,
+        jpgbuf, &size, TJ_444, 100, TJ_BGR));
+      free(bmpbuf); bmpbuf=NULL; free(jpgbuf); jpgbuf=NULL;
+      /* same dimensions swapped, filled with alternating 0xFF/0x00 bytes */
+      if((bmpbuf=(unsigned char *)malloc(j*i*4))==NULL
+        || (jpgbuf=(unsigned char *)malloc(TJBUFSIZE(j, i)))==NULL)
+      {
+        printf("Memory allocation failure\n"); goto finally;
+      }
+      for(i2=0; i2<j*i*4; i2++) bmpbuf[i2]=(i2%2==0)? 0xFF:0;
+      _catch(tjCompress(hnd, bmpbuf, j, j*4, i, 4,
+        jpgbuf, &size, TJ_444, 100, TJ_BGR));
+      free(bmpbuf); bmpbuf=NULL; free(jpgbuf); jpgbuf=NULL;
+    }
+  }
+  printf("Done. \n");
+
+  finally:
+  if(bmpbuf) free(bmpbuf); if(jpgbuf) free(jpgbuf);
+  if(hnd) tjDestroy(hnd);
+}
+
+/* Run the round-trip tests for 3- and 4-byte pixels, in 4:4:4 and grayscale,
+   followed by the buffer size regression test.  Always exits with 0. */
+int main(int argc, char *argv[])
+{
+  int ps;
+  for(ps=3; ps<=4; ps++) dotest(35, 41, ps, TJ_444, "test");
+  for(ps=3; ps<=4; ps++) dotest(35, 41, ps, TJ_GRAYSCALE, "test");
+  dotest1();
+  return 0;
+}
diff --git a/jpgtest.cxx b/jpgtest.cxx
new file mode 100644
index 0000000..da0ef93
--- /dev/null
+++ b/jpgtest.cxx
@@ -0,0 +1,392 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005, 2006 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "./bmp.h"
+#include "./rrutil.h"
+#include "./rrtimer.h"
+#include "./turbojpeg.h"
+
+#define _catch(f) {if((f)==-1) {printf("Error in %s:\n%s\n", #f, tjGetErrorStr()); goto bailout;}}
+
+int forcemmx=0, forcesse=0, forcesse2=0, forcesse3=0, fastupsample=0;
+const int _ps[BMPPIXELFORMATS]={3, 4, 3, 4, 4, 4};
+const int _flags[BMPPIXELFORMATS]={0, 0, TJ_BGR, TJ_BGR,
+ TJ_BGR|TJ_ALPHAFIRST, TJ_ALPHAFIRST};
+const int _rindex[BMPPIXELFORMATS]={0, 0, 2, 2, 3, 1};
+const int _gindex[BMPPIXELFORMATS]={1, 1, 1, 1, 2, 2};
+const int _bindex[BMPPIXELFORMATS]={2, 2, 0, 0, 1, 3};
+const char *_pfname[]={"RGB", "RGBA", "BGR", "BGRA", "ABGR", "ARGB"};
+const char *_subnamel[NUMSUBOPT]={"4:4:4", "4:2:2", "4:2:0", "GRAY"};
+const char *_subnames[NUMSUBOPT]={"444", "422", "420", "GRAY"};
+
+// Print val on stdout with roughly figs significant figures, deriving the
+// number of decimals from log10(val); val is therefore expected to be > 0.
+void printsigfig(double val, int figs)
+{
+  double _l=log10(val); int l;
+  if(_l<0.)
+  {
+    l=(int)fabs(_l);  // zeros between the decimal point and the first digit
+    printf("%*.*f", figs+l+2, figs+l, val);
+  }
+  else
+  {
+    l=(int)_l+1;  // number of digits before the decimal point
+    if(figs<=l) printf("%.0f", val);
+    else printf("%*.*f", figs+1, figs-l, val);
+  }
+}
+
+void dotest(unsigned char *srcbuf, int w, int h, BMPPIXELFORMAT pf, int bu,
+ int jpegsub, int qual, char *filename, int dotile, int useppm, int quiet)
+{ // Benchmark tjCompress()/tjDecompress() on srcbuf for one pixel format, subsampling and quality
+ char tempstr[1024];
+ FILE *outfile; tjhandle hnd;
+ unsigned char **jpegbuf=NULL, *rgbbuf=NULL;
+ rrtimer timer; double elapsed;
+ int jpgbufsize=0, i, j, tilesizex, tilesizey, numtilesx, numtilesy, ITER;
+ unsigned long *comptilesize=NULL;
+ int flags=(forcemmx?TJ_FORCEMMX:0)|(forcesse?TJ_FORCESSE:0)
+ |(forcesse2?TJ_FORCESSE2:0)|(forcesse3?TJ_FORCESSE3:0)
+ |(fastupsample?TJ_FASTUPSAMPLE:0);
+ int ps=_ps[pf];
+ int pitch=w*ps;
+
+ flags |= _flags[pf];
+ if(bu) flags |= TJ_BOTTOMUP;
+
+ if((rgbbuf=(unsigned char *)malloc(pitch*h)) == NULL)
+ {
+ puts("ERROR: Could not allocate image buffer.");
+ exit(1);
+ }
+
+ if(!quiet) printf("\n>>>>> %s (%s) <--> JPEG %s Q%d <<<<<\n", _pfname[pf],
+ bu?"Bottom-up":"Top-down", _subnamel[jpegsub], qual);
+ if(dotile) {tilesizex=tilesizey=4;} else {tilesizex=w; tilesizey=h;}
+ // Each pass doubles the tile size (clamped to w x h) until one tile covers the whole image
+ do
+ {
+ tilesizex*=2; if(tilesizex>w) tilesizex=w;
+ tilesizey*=2; if(tilesizey>h) tilesizey=h;
+ numtilesx=(w+tilesizex-1)/tilesizex;
+ numtilesy=(h+tilesizey-1)/tilesizey;
+ if((comptilesize=(unsigned long *)malloc(sizeof(unsigned long)*numtilesx*numtilesy)) == NULL
+ || (jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)*numtilesx*numtilesy)) == NULL)
+ {
+ puts("ERROR: Could not allocate image buffers.");
+ goto bailout;
+ }
+ memset(jpegbuf, 0, sizeof(unsigned char *)*numtilesx*numtilesy);
+ for(i=0; i<numtilesx*numtilesy; i++)
+ {
+ if((jpegbuf[i]=(unsigned char *)malloc(TJBUFSIZE(tilesizex, tilesizey))) == NULL)
+ {
+ puts("ERROR: Could not allocate image buffers.");
+ goto bailout;
+ }
+ }
+
+ // Compression test
+ if(quiet) printf("%s\t%s\t%s\t%d\t", _pfname[pf], bu?"BU":"TD",
+ _subnamel[jpegsub], qual);
+ for(i=0; i<h; i++) memcpy(&rgbbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
+ if((hnd=tjInitCompress())==NULL)
+ {
+ printf("Error in tjInitCompress():\n%s\n", tjGetErrorStr());
+ goto bailout;
+ }
+ _catch(tjCompress(hnd, rgbbuf, tilesizex, pitch, tilesizey, ps,
+ jpegbuf[0], &comptilesize[0], jpegsub, qual, flags)); // untimed call before the timed loop
+ ITER=0;
+ timer.start();
+ do
+ {
+ jpgbufsize=0; int tilen=0;
+ for(i=0; i<h; i+=tilesizey)
+ {
+ for(j=0; j<w; j+=tilesizex)
+ {
+ int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
+ _catch(tjCompress(hnd, &rgbbuf[pitch*i+j*ps], tempw, pitch,
+ temph, ps, jpegbuf[tilen], &comptilesize[tilen], jpegsub, qual,
+ flags));
+ jpgbufsize+=comptilesize[tilen];
+ tilen++;
+ }
+ }
+ ITER++;
+ } while((elapsed=timer.elapsed())<5.); // repeat until elapsed() reaches 5 (presumably seconds; rrtimer.h not shown)
+ _catch(tjDestroy(hnd));
+ if(quiet)
+ {
+ if(tilesizex==w && tilesizey==h) printf("Full \t");
+ else printf("%-4d %-4d\t", tilesizex, tilesizey);
+ printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
+ printf("\t");
+ printsigfig((double)(w*h*ps)/(double)jpgbufsize, 4);
+ printf("\t");
+ }
+ else
+ {
+ if(tilesizex==w && tilesizey==h) printf("\nFull image\n");
+ else printf("\nTile size: %d x %d\n", tilesizex, tilesizey);
+ printf("C--> Frame rate: %f fps\n", (double)ITER/elapsed);
+ printf(" Output image size: %d bytes\n", jpgbufsize);
+ printf(" Compression ratio: %f:1\n",
+ (double)(w*h*ps)/(double)jpgbufsize);
+ printf(" Source throughput: %f Megapixels/sec\n",
+ (double)(w*h)/1000000.*(double)ITER/elapsed);
+ printf(" Output bit stream: %f Megabits/sec\n",
+ (double)jpgbufsize*8./1000000.*(double)ITER/elapsed);
+ }
+ if(tilesizex==w && tilesizey==h)
+ {
+ sprintf(tempstr, "%s_%sQ%d.jpg", filename, _subnames[jpegsub], qual);
+ if((outfile=fopen(tempstr, "wb"))==NULL)
+ {
+ puts("ERROR: Could not open reference image");
+ exit(1);
+ }
+ if(fwrite(jpegbuf[0], jpgbufsize, 1, outfile)!=1)
+ {
+ puts("ERROR: Could not write reference image");
+ exit(1);
+ }
+ fclose(outfile);
+ if(!quiet) printf("Reference image written to %s\n", tempstr);
+ }
+
+ // Decompression test
+ memset(rgbbuf, 127, pitch*h); // Grey image means decompressor did nothing
+ if((hnd=tjInitDecompress())==NULL)
+ {
+ printf("Error in tjInitDecompress():\n%s\n", tjGetErrorStr());
+ goto bailout;
+ }
+ _catch(tjDecompress(hnd, jpegbuf[0], jpgbufsize, rgbbuf, tilesizex, pitch,
+ tilesizey, ps, flags)); // NOTE(review): size passed is jpgbufsize (sum of all tiles), not comptilesize[0] -- confirm intended
+ ITER=0;
+ timer.start();
+ do
+ {
+ int tilen=0;
+ for(i=0; i<h; i+=tilesizey)
+ {
+ for(j=0; j<w; j+=tilesizex)
+ {
+ int tempw=min(tilesizex, w-j), temph=min(tilesizey, h-i);
+ _catch(tjDecompress(hnd, jpegbuf[tilen], comptilesize[tilen],
+ &rgbbuf[pitch*i+ps*j], tempw, pitch, temph, ps, flags));
+ tilen++;
+ }
+ }
+ ITER++;
+ } while((elapsed=timer.elapsed())<5.);
+ _catch(tjDestroy(hnd));
+ if(quiet)
+ {
+ printsigfig((double)(w*h)/1000000.*(double)ITER/elapsed, 4);
+ printf("\n");
+ }
+ else
+ {
+ printf("D--> Frame rate: %f fps\n", (double)ITER/elapsed);
+ printf(" Dest. throughput: %f Megapixels/sec\n",
+ (double)(w*h)/1000000.*(double)ITER/elapsed);
+ }
+ if(tilesizex==w && tilesizey==h)
+ sprintf(tempstr, "%s_%sQ%d_full.%s", filename, _subnames[jpegsub], qual,
+ useppm?"ppm":"bmp");
+ else sprintf(tempstr, "%s_%sQ%d_%dx%d.%s", filename, _subnames[jpegsub],
+ qual, tilesizex, tilesizey, useppm?"ppm":"bmp");
+ if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
+ {
+ printf("ERROR saving bitmap: %s\n", bmpgeterr());
+ goto bailout;
+ }
+ sprintf(strrchr(tempstr, '.'), "-err.%s", useppm?"ppm":"bmp"); // rewrites "<name>.<ext>" to "<name>-err.<ext>" in place
+ if(!quiet)
+ printf("Computing compression error and saving to %s.\n", tempstr);
+ if(jpegsub==TJ_GRAYSCALE)
+ {
+ for(j=0; j<h; j++)
+ {
+ for(i=0; i<w*ps; i+=ps)
+ {
+ int y=(int)((double)srcbuf[w*ps*j+i+_rindex[pf]]*0.299
+ + (double)srcbuf[w*ps*j+i+_gindex[pf]]*0.587
+ + (double)srcbuf[w*ps*j+i+_bindex[pf]]*0.114 + 0.5);
+ if(y>255) y=255; if(y<0) y=0;
+ rgbbuf[pitch*j+i+_rindex[pf]]=abs(rgbbuf[pitch*j+i+_rindex[pf]]-y);
+ rgbbuf[pitch*j+i+_gindex[pf]]=abs(rgbbuf[pitch*j+i+_gindex[pf]]-y);
+ rgbbuf[pitch*j+i+_bindex[pf]]=abs(rgbbuf[pitch*j+i+_bindex[pf]]-y);
+ }
+ }
+ }
+ else
+ {
+ for(j=0; j<h; j++) for(i=0; i<w*ps; i++)
+ rgbbuf[pitch*j+i]=abs(rgbbuf[pitch*j+i]-srcbuf[w*ps*j+i]);
+ }
+ if(savebmp(tempstr, rgbbuf, w, h, pf, pitch, bu)==-1)
+ {
+ printf("ERROR saving bitmap: %s\n", bmpgeterr());
+ goto bailout;
+ }
+
+ // Cleanup
+ if(jpegbuf)
+ {
+ for(i=0; i<numtilesx*numtilesy; i++)
+ {if(jpegbuf[i]) free(jpegbuf[i]); jpegbuf[i]=NULL;}
+ free(jpegbuf); jpegbuf=NULL;
+ }
+ if(comptilesize) {free(comptilesize); comptilesize=NULL;}
+ } while(tilesizex<w || tilesizey<h);
+
+ if(rgbbuf) {free(rgbbuf); rgbbuf=NULL;}
+ return;
+
+ bailout: // NOTE(review): hnd is not destroyed on this path -- looks like a handle leak on errors; confirm
+ if(jpegbuf)
+ {
+ for(i=0; i<numtilesx*numtilesy; i++)
+ {if(jpegbuf[i]) free(jpegbuf[i]); jpegbuf[i]=NULL;}
+ free(jpegbuf); jpegbuf=NULL;
+ }
+ if(comptilesize) {free(comptilesize); comptilesize=NULL;}
+ if(rgbbuf) {free(rgbbuf); rgbbuf=NULL;}
+ return;
+}
+
+
+// Benchmark driver: parse quality (or range) and options, load the image, run dotest() per subsampling and quality.
+int main(int argc, char *argv[])
+{
+  unsigned char *bmpbuf=NULL; int w, h, i, useppm=0;
+  int qual, dotile=0, quiet=0, hiqual=-1; char *temp;
+  BMPPIXELFORMAT pf=BMP_BGR;
+  int bu=0;
+
+  printf("\n");
+
+  if(argc<3)
+  {
+    printf("USAGE: %s <Inputfile (BMP|PPM)> <%% Quality>\n\n", argv[0]);
+    printf(" [-tile]\n");
+    printf(" Test performance of the codec when the image is encoded\n");
+    printf(" as separate tiles of varying sizes.\n\n");
+    printf(" [-forcemmx] [-forcesse] [-forcesse2] [-forcesse3]\n");
+    printf(" Force MMX, SSE, SSE2, or SSE3 code paths in Intel codec\n\n");
+    printf(" [-rgb | -bgr | -rgba | -bgra | -abgr | -argb]\n");
+    printf(" Test the specified color conversion path in the codec (default: BGR)\n\n");
+    printf(" [-bottomup]\n");
+    printf(" Test bottom-up rather than top-down bitmap row order\n\n");
+    printf(" [-fastupsample]\n");
+    printf(" Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
+    printf(" YUV decoding in libjpeg decompressor\n\n");
+    printf(" [-quiet]\n");
+    printf(" Output in tabular rather than verbose format\n\n");
+    printf(" NOTE: If the quality is specified as a range, i.e. 90-100, a separate\n");
+    printf(" test will be performed for all quality values in the range.\n");
+    exit(1);
+  }
+  if((qual=atoi(argv[2]))<1 || qual>100)
+  {
+    puts("ERROR: Quality must be between 1 and 100.");
+    exit(1);
+  }
+  if((temp=strchr(argv[2], '-'))!=NULL && strlen(temp)>1
+    && sscanf(&temp[1], "%d", &hiqual)==1 && hiqual>qual && hiqual>=1
+    && hiqual<=100) {}  // valid "lo-hi" range: keep the parsed upper bound
+  else hiqual=qual;
+
+  for(i=3; i<argc; i++)  // optional switches follow the two mandatory arguments
+  {
+    if(!stricmp(argv[i], "-tile")) dotile=1;
+    if(!stricmp(argv[i], "-forcesse3"))
+    {
+      printf("Using SSE3 code in Intel compressor\n");
+      forcesse3=1;
+    }
+    if(!stricmp(argv[i], "-forcesse2"))
+    {
+      printf("Using SSE2 code in Intel compressor\n");
+      forcesse2=1;
+    }
+    if(!stricmp(argv[i], "-forcesse"))
+    {
+      printf("Using SSE code in Intel compressor\n");
+      forcesse=1;
+    }
+    if(!stricmp(argv[i], "-forcemmx"))
+    {
+      printf("Using MMX code in Intel compressor\n");
+      forcemmx=1;
+    }
+    if(!stricmp(argv[i], "-fastupsample"))
+    {
+      printf("Using fast upsampling code\n");
+      fastupsample=1;
+    }
+    if(!stricmp(argv[i], "-rgb")) pf=BMP_RGB;
+    if(!stricmp(argv[i], "-rgba")) pf=BMP_RGBA;
+    if(!stricmp(argv[i], "-bgr")) pf=BMP_BGR;
+    if(!stricmp(argv[i], "-bgra")) pf=BMP_BGRA;
+    if(!stricmp(argv[i], "-abgr")) pf=BMP_ABGR;
+    if(!stricmp(argv[i], "-argb")) pf=BMP_ARGB;
+    if(!stricmp(argv[i], "-bottomup")) bu=1;
+    if(!stricmp(argv[i], "-quiet")) quiet=1;
+  }
+
+  if(loadbmp(argv[1], &bmpbuf, &w, &h, pf, 1, bu)==-1)
+  {
+    printf("ERROR loading bitmap: %s\n", bmpgeterr()); exit(1);
+  }
+
+  temp=strrchr(argv[1], '.');
+  if(temp!=NULL)
+  {
+    if(!stricmp(temp, ".ppm")) useppm=1;
+    *temp='\0';  // strip the extension: argv[1] becomes the output base name
+  }
+
+  if(quiet)
+  {
+    printf("All performance values in Mpixels/sec\n\n");
+    printf("Bitmap\tBitmap\tJPEG\tJPEG\tTile Size\tCompr\tCompr\tDecomp\n");
+    printf("Format\tOrder\tFormat\tQual\t X Y \tPerf \tRatio\tPerf\n\n");
+  }
+
+  for(i=hiqual; i>=qual; i--)
+    dotest(bmpbuf, w, h, pf, bu, TJ_GRAYSCALE, i, argv[1], dotile, useppm, quiet);
+  if(quiet) printf("\n");
+  for(i=hiqual; i>=qual; i--)
+    dotest(bmpbuf, w, h, pf, bu, TJ_420, i, argv[1], dotile, useppm, quiet);
+  if(quiet) printf("\n");
+  for(i=hiqual; i>=qual; i--)
+    dotest(bmpbuf, w, h, pf, bu, TJ_422, i, argv[1], dotile, useppm, quiet);
+  if(quiet) printf("\n");
+  for(i=hiqual; i>=qual; i--)
+    dotest(bmpbuf, w, h, pf, bu, TJ_444, i, argv[1], dotile, useppm, quiet);
+
+  if(bmpbuf) free(bmpbuf);
+  return 0;
+}
diff --git a/jquant1.c b/jquant1.c
index b2f96aa..362bb1e 100644
--- a/jquant1.c
+++ b/jquant1.c
@@ -2,6 +2,7 @@
* jquant1.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -193,7 +194,10 @@
int total_colors, iroot, i, j;
boolean changed;
long temp;
- static const int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+ int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+ RGB_order[0] = rgb_green[cinfo->out_color_space];
+ RGB_order[1] = rgb_red[cinfo->out_color_space];
+ RGB_order[2] = rgb_blue[cinfo->out_color_space];
/* We can allocate at least the nc'th root of max_colors per component. */
/* Compute floor(nc'th root of max_colors). */
diff --git a/jquant2.c b/jquant2.c
index af601e3..da964f7 100644
--- a/jquant2.c
+++ b/jquant2.c
@@ -2,6 +2,7 @@
* jquant2.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -74,29 +75,10 @@
#define G_SCALE 3 /* scale G distances by this much */
#define B_SCALE 1 /* and B by this much */
-/* Relabel R/G/B as components 0/1/2, respecting the RGB ordering defined
- * in jmorecfg.h. As the code stands, it will do the right thing for R,G,B
- * and B,G,R orders. If you define some other weird order in jmorecfg.h,
- * you'll get compile errors until you extend this logic. In that case
- * you'll probably want to tweak the histogram sizes too.
- */
-
-#if RGB_RED == 0
-#define C0_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 0
-#define C0_SCALE B_SCALE
-#endif
-#if RGB_GREEN == 1
-#define C1_SCALE G_SCALE
-#endif
-#if RGB_RED == 2
-#define C2_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 2
-#define C2_SCALE B_SCALE
-#endif
-
+static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE}; /* distance weights, stored in R, G, B order */
+#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]] /* run-time lookup, not a constant: only usable where a variable named cinfo is in scope */
+#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]] /* same pattern, keyed on the rgb_green table entry */
+#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]] /* NOTE(review): the rgb_* tables presumably hold each colour's position in a pixel; indexing c_scales by that position yields component N's weight only for self-inverse orders (R,G,B / B,G,R), and any position > 2 would read past c_scales -- confirm which colour spaces reach this code */
/*
* First we have the histogram data structure and routines for creating it.
@@ -454,15 +436,16 @@
/* We want to break any ties in favor of green, then red, blue last.
* This code does the right thing for R,G,B or B,G,R color orders only.
*/
-#if RGB_RED == 0
- cmax = c1; n = 1;
- if (c0 > cmax) { cmax = c0; n = 0; }
- if (c2 > cmax) { n = 2; }
-#else
- cmax = c1; n = 1;
- if (c2 > cmax) { cmax = c2; n = 2; }
- if (c0 > cmax) { n = 0; }
-#endif
+ if (rgb_red[cinfo->out_color_space] == 0) {
+ cmax = c1; n = 1;
+ if (c0 > cmax) { cmax = c0; n = 0; }
+ if (c2 > cmax) { n = 2; }
+ }
+ else {
+ cmax = c1; n = 1;
+ if (c2 > cmax) { cmax = c2; n = 2; }
+ if (c0 > cmax) { n = 0; }
+ }
/* Choose split point along selected axis, and update box bounds.
* Current algorithm: split at halfway point.
* (Since the box has been shrunk to minimum volume,
diff --git a/jsimd.h b/jsimd.h
new file mode 100644
index 0000000..90be19e
--- /dev/null
+++ b/jsimd.h
@@ -0,0 +1,89 @@
+/*
+ * jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES /* when defined, each of the 20 jsimd_* names declared in this header is remapped to a shorter alias */
+#define jsimd_can_rgb_ycc jSCanRgbYcc
+#define jsimd_can_ycc_rgb jSCanYccRgb
+#define jsimd_rgb_ycc_convert jSRgbYccConv
+#define jsimd_ycc_rgb_convert jSYccRgbConv
+#define jsimd_can_h2v2_downsample jSCanH2V2Down
+#define jsimd_can_h2v1_downsample jSCanH2V1Down
+#define jsimd_h2v2_downsample jSH2V2Down
+#define jsimd_h2v1_downsample jSH2V1Down
+#define jsimd_can_h2v2_upsample jSCanH2V2Up
+#define jsimd_can_h2v1_upsample jSCanH2V1Up
+#define jsimd_h2v2_upsample jSH2V2Up
+#define jsimd_h2v1_upsample jSH2V1Up
+#define jsimd_can_h2v2_fancy_upsample jSCanH2V2FUp
+#define jsimd_can_h2v1_fancy_upsample jSCanH2V1FUp
+#define jsimd_h2v2_fancy_upsample jSH2V2FUp
+#define jsimd_h2v1_fancy_upsample jSH2V1FUp
+#define jsimd_can_h2v2_merged_upsample jSCanH2V2MUp
+#define jsimd_can_h2v1_merged_upsample jSCanH2V1MUp
+#define jsimd_h2v2_merged_upsample jSH2V2MUp
+#define jsimd_h2v1_merged_upsample jSH2V1MUp
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_rgb_ycc JPP((void)); /* presumably the availability probe for jsimd_rgb_ycc_convert; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_ycc_rgb JPP((void)); /* presumably the availability probe for jsimd_ycc_rgb_convert; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_rgb_ycc_convert /* compression side: takes a j_compress_ptr; input is a JSAMPARRAY, output a JSAMPIMAGE */
+ JPP((j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_ycc_rgb_convert /* decompression side: takes a j_decompress_ptr; input is a JSAMPIMAGE, output a JSAMPARRAY */
+ JPP((j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(int) jsimd_can_h2v2_downsample JPP((void)); /* presumably the availability probe for jsimd_h2v2_downsample; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_h2v1_downsample JPP((void)); /* presumably the availability probe for jsimd_h2v1_downsample; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_h2v2_downsample /* compression side: takes a j_compress_ptr plus the component being processed */
+ JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample /* same parameter list as jsimd_h2v2_downsample */
+ JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(int) jsimd_can_h2v2_upsample JPP((void)); /* presumably the availability probe for jsimd_h2v2_upsample; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_h2v1_upsample JPP((void)); /* presumably the availability probe for jsimd_h2v1_upsample; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_h2v2_upsample /* decompression side; the output is passed indirectly as a pointer to a JSAMPARRAY */
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample /* same parameter list as jsimd_h2v2_upsample */
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_fancy_upsample JPP((void)); /* presumably the availability probe for jsimd_h2v2_fancy_upsample; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_h2v1_fancy_upsample JPP((void)); /* presumably the availability probe for jsimd_h2v1_fancy_upsample; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_h2v2_fancy_upsample /* decompression side; parameter list is identical to the plain jsimd_h2v2_upsample */
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample /* same parameter list as jsimd_h2v2_fancy_upsample */
+ JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_merged_upsample JPP((void)); /* presumably the availability probe for jsimd_h2v2_merged_upsample; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_h2v1_merged_upsample JPP((void)); /* presumably the availability probe for jsimd_h2v1_merged_upsample; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_h2v2_merged_upsample /* decompression side; unlike the other upsamplers it takes a JSAMPIMAGE and no component pointer */
+ JPP((j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample /* same parameter list as jsimd_h2v2_merged_upsample */
+ JPP((j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf));
+
diff --git a/jsimd_none.c b/jsimd_none.c
new file mode 100644
index 0000000..8960802
--- /dev/null
+++ b/jsimd_none.c
@@ -0,0 +1,299 @@
+/*
+ * jsimd_none.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This file contains stubs for when there is no SIMD support available.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+#include "jdct.h"
+#include "jsimddct.h"
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_rgb_ycc (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_ycc_rgb (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v2_downsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v1_downsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* intentionally empty: every argument is ignored and output_data is left untouched */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{ /* intentionally empty: every argument is ignored and output_data is left untouched */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v2_upsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v1_upsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{ /* intentionally empty: every argument is ignored and *output_data_ptr is never written */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{ /* intentionally empty: every argument is ignored and *output_data_ptr is never written */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v2_fancy_upsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v1_fancy_upsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{ /* intentionally empty: every argument is ignored and *output_data_ptr is never written */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{ /* intentionally empty: every argument is ignored and *output_data_ptr is never written */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v2_merged_upsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimd.h */
+jsimd_can_h2v1_merged_upsample (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimd.h prototype but performs no work */
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_convsamp (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_convsamp_float (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM * workspace)
+{ /* intentionally empty: every argument is ignored and workspace is never written */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT * workspace)
+{ /* intentionally empty: every argument is ignored and workspace is never written */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_fdct_islow (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_fdct_ifast (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_fdct_float (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_fdct_islow (DCTELEM * data)
+{ /* intentionally empty: data is neither read nor modified */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_fdct_ifast (DCTELEM * data)
+{ /* intentionally empty: data is neither read nor modified */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_fdct_float (FAST_FLOAT * data)
+{ /* intentionally empty: data is neither read nor modified */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_quantize (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_quantize_float (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+ DCTELEM * workspace)
+{ /* intentionally empty: every argument is ignored and coef_block is never written */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace)
+{ /* intentionally empty: every argument is ignored and coef_block is never written */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_idct_2x2 (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_idct_4x4 (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_idct_islow (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_idct_ifast (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(int) /* no-SIMD stub of the probe declared in jsimddct.h */
+jsimd_can_idct_float (void)
+{
+ return 0; /* always 0; presumably read by callers as "SIMD path unavailable" -- confirm at call sites */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
+GLOBAL(void) /* no-SIMD stub: matches the jsimddct.h prototype but performs no work */
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{ /* intentionally empty: every argument is ignored and output_buf is left untouched */
+}
+
diff --git a/jsimddct.h b/jsimddct.h
new file mode 100644
index 0000000..d73d0c4
--- /dev/null
+++ b/jsimddct.h
@@ -0,0 +1,101 @@
+/*
+ * jsimddct.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES /* when defined, each of the 24 jsimd_* names declared in this header is remapped to a shorter alias */
+#define jsimd_can_convsamp jSCanConv
+#define jsimd_can_convsamp_float jSCanConvF
+#define jsimd_convsamp jSConv
+#define jsimd_convsamp_float jSConvF
+#define jsimd_can_fdct_islow jSCanFDCTIS
+#define jsimd_can_fdct_ifast jSCanFDCTIF
+#define jsimd_can_fdct_float jSCanFDCTFl
+#define jsimd_fdct_islow jSFDCTIS
+#define jsimd_fdct_ifast jSFDCTIF
+#define jsimd_fdct_float jSFDCTFl
+#define jsimd_can_quantize jSCanQuant
+#define jsimd_can_quantize_float jSCanQuantF
+#define jsimd_quantize jSQuant
+#define jsimd_quantize_float jSQuantF
+#define jsimd_can_idct_2x2 jSCanIDCT22
+#define jsimd_can_idct_4x4 jSCanIDCT44
+#define jsimd_idct_2x2 jSIDCT22
+#define jsimd_idct_4x4 jSIDCT44
+#define jsimd_can_idct_islow jSCanIDCTIS
+#define jsimd_can_idct_ifast jSCanIDCTIF
+#define jsimd_can_idct_float jSCanIDCTFl
+#define jsimd_idct_islow jSIDCTIS
+#define jsimd_idct_ifast jSIDCTIF
+#define jsimd_idct_float jSIDCTFl
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_convsamp JPP((void)); /* presumably the availability probe for jsimd_convsamp; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_convsamp_float JPP((void)); /* presumably the availability probe for jsimd_convsamp_float; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_convsamp JPP((JSAMPARRAY sample_data, /* integer variant: workspace is DCTELEM */
+ JDIMENSION start_col,
+ DCTELEM * workspace));
+EXTERN(void) jsimd_convsamp_float JPP((JSAMPARRAY sample_data, /* float variant: workspace is FAST_FLOAT */
+ JDIMENSION start_col,
+ FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_fdct_islow JPP((void)); /* presumably the availability probe for jsimd_fdct_islow; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_fdct_ifast JPP((void)); /* presumably the availability probe for jsimd_fdct_ifast; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_fdct_float JPP((void)); /* presumably the availability probe for jsimd_fdct_float; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_fdct_islow JPP((DCTELEM * data)); /* single non-const DCTELEM buffer; presumably transformed in place -- confirm */
+EXTERN(void) jsimd_fdct_ifast JPP((DCTELEM * data)); /* same parameter list as jsimd_fdct_islow */
+EXTERN(void) jsimd_fdct_float JPP((FAST_FLOAT * data)); /* float variant: buffer is FAST_FLOAT */
+
+EXTERN(int) jsimd_can_quantize JPP((void)); /* presumably the availability probe for jsimd_quantize; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_quantize_float JPP((void)); /* presumably the availability probe for jsimd_quantize_float; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_quantize JPP((JCOEFPTR coef_block, /* integer variant: divisors and workspace are DCTELEM */
+ DCTELEM * divisors,
+ DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_float JPP((JCOEFPTR coef_block, /* float variant: divisors and workspace are FAST_FLOAT */
+ FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_idct_2x2 JPP((void)); /* presumably the availability probe for jsimd_idct_2x2; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_idct_4x4 JPP((void)); /* presumably the availability probe for jsimd_idct_4x4; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo, /* decompression side: takes a coefficient block, an output sample array and an output column */
+ jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4 JPP((j_decompress_ptr cinfo, /* same parameter list as jsimd_idct_2x2 */
+ jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
+EXTERN(int) jsimd_can_idct_islow JPP((void)); /* presumably the availability probe for jsimd_idct_islow; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_idct_ifast JPP((void)); /* presumably the availability probe for jsimd_idct_ifast; the stub in jsimd_none.c returns 0 */
+EXTERN(int) jsimd_can_idct_float JPP((void)); /* presumably the availability probe for jsimd_idct_float; the stub in jsimd_none.c returns 0 */
+
+EXTERN(void) jsimd_idct_islow JPP((j_decompress_ptr cinfo, /* decompression side: takes a coefficient block, an output sample array and an output column */
+ jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast JPP((j_decompress_ptr cinfo, /* same parameter list as jsimd_idct_islow */
+ jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_float JPP((j_decompress_ptr cinfo, /* same parameter list as jsimd_idct_islow, including the JCOEFPTR/JSAMPARRAY types */
+ jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
diff --git a/ltconfig b/ltconfig
deleted file mode 100755
index 2347e69..0000000
--- a/ltconfig
+++ /dev/null
@@ -1,1512 +0,0 @@
-#! /bin/sh
-
-# ltconfig - Create a system-specific libtool.
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# A lot of this script is taken from autoconf-2.10.
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test "${CDPATH+set}" = set; then CDPATH=; export CDPATH; fi
-
-echo=echo
-if test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then :
-else
- # The Solaris and AIX default echo program unquotes backslashes.
- # This makes it impossible to quote backslashes using
- # echo "$something" | sed 's/\\/\\\\/g'
- # So, we emulate echo with printf '%s\n'
- echo="printf %s\\n"
- if test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then :
- else
- # Oops. We have no working printf. Try to find a not-so-buggy echo.
- echo=echo
- IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}:"
- for dir in $PATH /usr/ucb; do
- if test -f $dir/echo && test "X`$dir/echo '\t'`" = 'X\t'; then
- echo="$dir/echo"
- break
- fi
- done
- IFS="$save_ifs"
- fi
-fi
-
-# Sed substitution that helps us do robust quoting. It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
-sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
-
-# Same as above, but do not quote variable references.
-double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
-
-# The name of this program.
-progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'`
-
-# Constants:
-PROGRAM=ltconfig
-PACKAGE=libtool
-VERSION=1.2
-ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.c 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.c $LIBS 1>&5'
-rm="rm -f"
-
-help="Try \`$progname --help' for more information."
-
-# Global variables:
-can_build_shared=yes
-enable_shared=yes
-# All known linkers require a `.a' archive for static linking.
-enable_static=yes
-ltmain=
-silent=
-srcdir=
-ac_config_guess=
-ac_config_sub=
-host=
-nonopt=
-verify_host=yes
-with_gcc=no
-with_gnu_ld=no
-
-old_AR="$AR"
-old_CC="$CC"
-old_CFLAGS="$CFLAGS"
-old_CPPFLAGS="$CPPFLAGS"
-old_LD="$LD"
-old_LN_S="$LN_S"
-old_NM="$NM"
-old_RANLIB="$RANLIB"
-
-# Parse the command line options.
-args=
-prev=
-for option
-do
- case "$option" in
- -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
- *) optarg= ;;
- esac
-
- # If the previous option needs an argument, assign it.
- if test -n "$prev"; then
- eval "$prev=\$option"
- prev=
- continue
- fi
-
- case "$option" in
- --help) cat <<EOM
-Usage: $progname [OPTION]... LTMAIN [HOST]
-
-Generate a system-specific libtool script.
-
- --disable-shared do not build shared libraries
- --disable-static do not build static libraries
- --help display this help and exit
- --no-verify do not verify that HOST is a valid host type
- --quiet same as \`--silent'
- --silent do not print informational messages
- --srcdir=DIR find \`config.guess' in DIR
- --version output version information and exit
- --with-gcc assume that the GNU C compiler will be used
- --with-gnu-ld assume that the C compiler uses the GNU linker
-
-LTMAIN is the \`ltmain.sh' shell script fragment that provides basic libtool
-functionality.
-
-HOST is the canonical host system name [default=guessed].
-EOM
- exit 0
- ;;
-
- --disable-shared) enable_shared=no ;;
-
- --disable-static) enable_static=no ;;
-
- --quiet | --silent) silent=yes ;;
-
- --srcdir) prev=srcdir ;;
- --srcdir=*) srcdir="$optarg" ;;
-
- --no-verify) verify_host=no ;;
-
- --version) echo "$PROGRAM (GNU $PACKAGE) $VERSION"; exit 0 ;;
-
- --with-gcc) with_gcc=yes ;;
- --with-gnu-ld) with_gnu_ld=yes ;;
-
- -*)
- echo "$progname: unrecognized option \`$option'" 1>&2
- echo "$help" 1>&2
- exit 1
- ;;
-
- *)
- if test -z "$ltmain"; then
- ltmain="$option"
- elif test -z "$host"; then
-# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1
-# if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then
-# echo "$progname: warning \`$option' is not a valid host type" 1>&2
-# fi
- host="$option"
- else
- echo "$progname: too many arguments" 1>&2
- echo "$help" 1>&2
- exit 1
- fi ;;
- esac
-done
-
-if test -z "$ltmain"; then
- echo "$progname: you must specify a LTMAIN file" 1>&2
- echo "$help" 1>&2
- exit 1
-fi
-
-if test -f "$ltmain"; then :
-else
- echo "$progname: \`$ltmain' does not exist" 1>&2
- echo "$help" 1>&2
- exit 1
-fi
-
-# Quote any args containing shell metacharacters.
-ltconfig_args=
-for arg
-do
- case "$arg" in
- *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
- ltconfig_args="$ltconfig_args '$arg'" ;;
- *) ltconfig_args="$ltconfig_args $arg" ;;
- esac
-done
-
-# A relevant subset of AC_INIT.
-
-# File descriptor usage:
-# 0 standard input
-# 1 file creation
-# 2 errors and warnings
-# 3 some systems may open it to /dev/tty
-# 4 used on the Kubota Titan
-# 5 compiler messages saved in config.log
-# 6 checking for... messages and results
-if test "$silent" = yes; then
- exec 6>/dev/null
-else
- exec 6>&1
-fi
-exec 5>>./config.log
-
-# NLS nuisances.
-# Only set LANG and LC_ALL to C if already set.
-# These must not be set unconditionally because not all systems understand
-# e.g. LANG=C (notably SCO).
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LANG+set}" = set; then LANG=C; export LANG; fi
-
-if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
- # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
- if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
- ac_n= ac_c='
-' ac_t=' '
- else
- ac_n=-n ac_c= ac_t=
- fi
-else
- ac_n= ac_c='\c' ac_t=
-fi
-
-if test -z "$srcdir"; then
- # Assume the source directory is the same one as the path to ltmain.sh.
- srcdir=`$echo "$ltmain" | $Xsed -e 's%/[^/]*$%%'`
- test "$srcdir" = "$ltmain" && srcdir=.
-fi
-
-trap "$rm conftest*; exit 1" 1 2 15
-if test "$verify_host" = yes; then
- # Check for config.guess and config.sub.
- ac_aux_dir=
- for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
- if test -f $ac_dir/config.guess; then
- ac_aux_dir=$ac_dir
- break
- fi
- done
- if test -z "$ac_aux_dir"; then
- echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2
- echo "$help" 1>&2
- exit 1
- fi
- ac_config_guess=$ac_aux_dir/config.guess
- ac_config_sub=$ac_aux_dir/config.sub
-
- # Make sure we can run config.sub.
- if $ac_config_sub sun4 >/dev/null 2>&1; then :
- else
- echo "$progname: cannot run $ac_config_sub" 1>&2
- echo "$help" 1>&2
- exit 1
- fi
-
- echo $ac_n "checking host system type""... $ac_c" 1>&6
-
- host_alias=$host
- case "$host_alias" in
- "")
- if host_alias=`$ac_config_guess`; then :
- else
- echo "$progname: cannot guess host type; you must specify one" 1>&2
- echo "$help" 1>&2
- exit 1
- fi ;;
- esac
- host=`$ac_config_sub $host_alias`
- echo "$ac_t$host" 1>&6
-
- # Make sure the host verified.
- test -z "$host" && exit 1
-
-elif test -z "$host"; then
- echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2
- echo "$help" 1>&2
- exit 1
-else
- host_alias=$host
-fi
-
-# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
-case "$host_os" in
-linux-gnu*) ;;
-linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
-esac
-
-host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
-host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
-host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
-
-case "$host_os" in
-aix3*)
- # AIX sometimes has problems with the GCC collect2 program. For some
- # reason, if we set the COLLECT_NAMES environment variable, the problems
- # vanish in a puff of smoke.
- if test "${COLLECT_NAMES+set}" != set; then
- COLLECT_NAMES=
- export COLLECT_NAMES
- fi
- ;;
-esac
-
-# Determine commands to create old-style static archives.
-old_archive_cmds='$AR cru $oldlib$oldobjs'
-old_postinstall_cmds='chmod 644 $oldlib'
-old_postuninstall_cmds=
-
-# Set a sane default for `AR'.
-test -z "$AR" && AR=ar
-
-# If RANLIB is not set, then run the test.
-if test "${RANLIB+set}" != "set"; then
- result=no
-
- echo $ac_n "checking for ranlib... $ac_c" 1>&6
- IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}:"
- for dir in $PATH; do
- test -z "$dir" && dir=.
- if test -f $dir/ranlib; then
- RANLIB="ranlib"
- result="ranlib"
- break
- fi
- done
- IFS="$save_ifs"
-
- echo "$ac_t$result" 1>&6
-fi
-
-if test -n "$RANLIB"; then
- old_archive_cmds="$old_archive_cmds;\$RANLIB \$oldlib"
- old_postinstall_cmds="\$RANLIB \$oldlib;$old_postinstall_cmds"
-fi
-
-# Check to see if we are using GCC.
-if test "$with_gcc" != yes || test -z "$CC"; then
- # If CC is not set, then try to find GCC or a usable CC.
- if test -z "$CC"; then
- echo $ac_n "checking for gcc... $ac_c" 1>&6
- IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}:"
- for dir in $PATH; do
- IFS="$save_ifs"
- test -z "$dir" && dir=.
- if test -f $dir/gcc; then
- CC="gcc"
- break
- fi
- done
- IFS="$save_ifs"
-
- if test -n "$CC"; then
- echo "$ac_t$CC" 1>&6
- else
- echo "$ac_t"no 1>&6
- fi
- fi
-
- # Not "gcc", so try "cc", rejecting "/usr/ucb/cc".
- if test -z "$CC"; then
- echo $ac_n "checking for cc... $ac_c" 1>&6
- IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}:"
- cc_rejected=no
- for dir in $PATH; do
- test -z "$dir" && dir=.
- if test -f $dir/cc; then
- if test "$dir/cc" = "/usr/ucb/cc"; then
- cc_rejected=yes
- continue
- fi
- CC="cc"
- break
- fi
- done
- IFS="$save_ifs"
- if test $cc_rejected = yes; then
- # We found a bogon in the path, so make sure we never use it.
- set dummy $CC
- shift
- if test $# -gt 0; then
- # We chose a different compiler from the bogus one.
- # However, it has the same name, so the bogon will be chosen
- # first if we set CC to just the name; use the full file name.
- shift
- set dummy "$dir/cc" "$@"
- shift
- CC="$@"
- fi
- fi
-
- if test -n "$CC"; then
- echo "$ac_t$CC" 1>&6
- else
- echo "$ac_t"no 1>&6
- fi
-
- if test -z "$CC"; then
- echo "$progname: error: no acceptable cc found in \$PATH" 1>&2
- exit 1
- fi
- fi
-
- # Now see if the compiler is really GCC.
- with_gcc=no
- echo $ac_n "checking whether we are using GNU C... $ac_c" 1>&6
- echo "$progname:424: checking whether we are using GNU C" >&5
-
- $rm conftest.c
- cat > conftest.c <<EOF
-#ifdef __GNUC__
- yes;
-#endif
-EOF
- if { ac_try='${CC-cc} -E conftest.c'; { (eval echo $progname:432: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
- with_gcc=yes
- fi
- $rm conftest.c
- echo "$ac_t$with_gcc" 1>&6
-fi
-
-# Allow CC to be a program name with arguments.
-set dummy $CC
-compiler="$2"
-
-echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6
-pic_flag=
-special_shlib_compile_flags=
-wl=
-link_static_flag=
-no_builtin_flag=
-
-if test "$with_gcc" = yes; then
- wl='-Wl,'
- link_static_flag='-static'
- no_builtin_flag=' -fno-builtin'
-
- case "$host_os" in
- aix3* | aix4* | irix5* | irix6* | osf3* | osf4*)
- # PIC is the default for these OSes.
- ;;
- os2*)
- # We can build DLLs from non-PIC.
- ;;
- amigaos*)
- # FIXME: we need at least 68020 code to build shared libraries, but
- # adding the `-m68020' flag to GCC prevents building anything better,
- # like `-m68040'.
- pic_flag='-m68020 -resident32 -malways-restore-a4'
- ;;
- *)
- pic_flag='-fPIC'
- ;;
- esac
-else
- # PORTME Check for PIC flags for the system compiler.
- case "$host_os" in
- aix3* | aix4*)
- # All AIX code is PIC.
- link_static_flag='-bnso -bI:/lib/syscalls.exp'
- ;;
-
- hpux9* | hpux10*)
- # Is there a better link_static_flag that works with the bundled CC?
- wl='-Wl,'
- link_static_flag="${wl}-a ${wl}archive"
- pic_flag='+Z'
- ;;
-
- irix5* | irix6*)
- wl='-Wl,'
- link_static_flag='-non_shared'
- # PIC (with -KPIC) is the default.
- ;;
-
- os2*)
- # We can build DLLs from non-PIC.
- ;;
-
- osf3* | osf4*)
- # All OSF/1 code is PIC.
- wl='-Wl,'
- link_static_flag='-non_shared'
- ;;
-
- sco3.2v5*)
- pic_flag='-Kpic'
- link_static_flag='-dn'
- special_shlib_compile_flags='-belf'
- ;;
-
- solaris2*)
- pic_flag='-KPIC'
- link_static_flag='-Bstatic'
- wl='-Wl,'
- ;;
-
- sunos4*)
- pic_flag='-PIC'
- link_static_flag='-Bstatic'
- wl='-Qoption ld '
- ;;
-
- sysv4.2uw2*)
- pic_flag='-KPIC'
- link_static_flag='-Bstatic'
- wl='-Wl,'
- ;;
-
- uts4*)
- pic_flag='-pic'
- link_static_flag='-Bstatic'
- ;;
-
- *)
- can_build_shared=no
- ;;
- esac
-fi
-
-if test -n "$pic_flag"; then
- echo "$ac_t$pic_flag" 1>&6
-
- # Check to make sure the pic_flag actually works.
- echo $ac_n "checking if $compiler PIC flag $pic_flag works... $ac_c" 1>&6
- $rm conftest*
- echo > conftest.c
- save_CFLAGS="$CFLAGS"
- CFLAGS="$CFLAGS $pic_flag -DPIC"
- echo "$progname:547: checking if $compiler PIC flag $pic_flag works" >&5
- if { (eval echo $progname:548: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.o; then
- # Append any warnings to the config.log.
- cat conftest.err 1>&5
-
- # On HP-UX, both CC and GCC only warn that PIC is supported... then they
- # create non-PIC objects. So, if there were any warnings, we assume that
- # PIC is not supported.
- if test -s conftest.err; then
- echo "$ac_t"no 1>&6
- can_build_shared=no
- pic_flag=
- else
- echo "$ac_t"yes 1>&6
- pic_flag=" $pic_flag"
- fi
- else
- # Append any errors to the config.log.
- cat conftest.err 1>&5
- can_build_shared=no
- pic_flag=
- echo "$ac_t"no 1>&6
- fi
- CFLAGS="$save_CFLAGS"
- $rm conftest*
-else
- echo "$ac_t"none 1>&6
-fi
-
-# Check for any special shared library compilation flags.
-if test -n "$special_shlib_compile_flags"; then
- echo "$progname: warning: \`$CC' requires \`$special_shlib_compile_flags' to build shared libraries" 1>&2
- if echo "$old_CC $old_CFLAGS " | egrep -e "[ ]$special_shlib_compile_flags[ ]" >/dev/null; then :
- else
- echo "$progname: add \`$special_shlib_compile_flags' to the CC or CFLAGS env variable and reconfigure" 1>&2
- can_build_shared=no
- fi
-fi
-
-echo $ac_n "checking if $compiler static flag $link_static_flag works... $ac_c" 1>&6
-$rm conftest*
-echo 'main(){return(0);}' > conftest.c
-save_LDFLAGS="$LDFLAGS"
-LDFLAGS="$LDFLAGS $link_static_flag"
-echo "$progname:591: checking if $compiler static flag $link_static_flag works" >&5
-if { (eval echo $progname:592: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
- echo "$ac_t$link_static_flag" 1>&6
-else
- echo "$ac_t"none 1>&6
- link_static_flag=
-fi
-LDFLAGS="$save_LDFLAGS"
-$rm conftest*
-
-if test -z "$LN_S"; then
- # Check to see if we can use ln -s, or we need hard links.
- echo $ac_n "checking whether ln -s works... $ac_c" 1>&6
- $rm conftestdata
- if ln -s X conftestdata 2>/dev/null; then
- $rm conftestdata
- LN_S="ln -s"
- else
- LN_S=ln
- fi
- if test "$LN_S" = "ln -s"; then
- echo "$ac_t"yes 1>&6
- else
- echo "$ac_t"no 1>&6
- fi
-fi
-
-# Make sure LD is an absolute path.
-if test -z "$LD"; then
- ac_prog=ld
- if test "$with_gcc" = yes; then
- # Check if gcc -print-prog-name=ld gives a path.
- echo $ac_n "checking for ld used by GCC... $ac_c" 1>&6
- echo "$progname:624: checking for ld used by GCC" >&5
- ac_prog=`($CC -print-prog-name=ld) 2>&5`
- case "$ac_prog" in
- # Accept absolute paths.
- /* | [A-Za-z]:\\*)
- test -z "$LD" && LD="$ac_prog"
- ;;
- "")
- # If it fails, then pretend we are not using GCC.
- ac_prog=ld
- ;;
- *)
- # If it is relative, then search for the first ld in PATH.
- with_gnu_ld=unknown
- ;;
- esac
- elif test "$with_gnu_ld" = yes; then
- echo $ac_n "checking for GNU ld... $ac_c" 1>&6
- echo "$progname:642: checking for GNU ld" >&5
- else
- echo $ac_n "checking for non-GNU ld""... $ac_c" 1>&6
- echo "$progname:645: checking for non-GNU ld" >&5
- fi
-
- if test -z "$LD"; then
- IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:"
- for ac_dir in $PATH; do
- test -z "$ac_dir" && ac_dir=.
- if test -f "$ac_dir/$ac_prog"; then
- LD="$ac_dir/$ac_prog"
- # Check to see if the program is GNU ld. I'd rather use --version,
- # but apparently some GNU ld's only accept -v.
- # Break only if it was the GNU/non-GNU ld that we prefer.
- if "$LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
- test "$with_gnu_ld" != no && break
- else
- test "$with_gnu_ld" != yes && break
- fi
- fi
- done
- IFS="$ac_save_ifs"
- fi
-
- if test -n "$LD"; then
- echo "$ac_t$LD" 1>&6
- else
- echo "$ac_t"no 1>&6
- fi
-
- if test -z "$LD"; then
- echo "$progname: error: no acceptable ld found in \$PATH" 1>&2
- exit 1
- fi
-fi
-
-# Check to see if it really is or is not GNU ld.
-echo $ac_n "checking if the linker ($LD) is GNU ld... $ac_c" 1>&6
-# I'd rather use --version here, but apparently some GNU ld's only accept -v.
-if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
- with_gnu_ld=yes
-else
- with_gnu_ld=no
-fi
-echo "$ac_t$with_gnu_ld" 1>&6
-
-# See if the linker supports building shared libraries.
-echo $ac_n "checking whether the linker ($LD) supports shared libraries... $ac_c" 1>&6
-
-allow_undefined_flag=
-no_undefined_flag=
-archive_cmds=
-old_archive_from_new_cmds=
-export_dynamic_flag_spec=
-hardcode_libdir_flag_spec=
-hardcode_libdir_separator=
-hardcode_direct=no
-hardcode_minus_L=no
-hardcode_shlibpath_var=unsupported
-runpath_var=
-
-case "$host_os" in
-amigaos* | sunos4*)
- # On these operating systems, we should treat GNU ld like the system ld.
- gnu_ld_acts_native=yes
- ;;
-*)
- gnu_ld_acts_native=no
- ;;
-esac
-
-ld_shlibs=yes
-if test "$with_gnu_ld" = yes && test "$gnu_ld_acts_native" != yes; then
-
- # See if GNU ld supports shared libraries.
- if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
- archive_cmds='$CC -shared ${wl}-soname $wl$soname -o $lib$libobjs'
- runpath_var=LD_RUN_PATH
- ld_shlibs=yes
- else
- ld_shlibs=no
- fi
-
- if test "$ld_shlibs" = yes; then
- hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
- export_dynamic_flag_spec='${wl}--export-dynamic'
- fi
-else
- # PORTME fill in a description of your system's linker (not GNU ld)
- case "$host_os" in
- aix3*)
- allow_undefined_flag=unsupported
- archive_cmds='$NM$libobjs | $global_symbol_pipe | sed '\''s/.* //'\'' > $lib.exp;$LD -o $objdir/$soname$libobjs -bE:$lib.exp -T512 -H512 -bM:SRE;$AR cru $lib $objdir/$soname'
- # Note: this linker hardcodes the directories in LIBPATH if there
- # are no directories specified by -L.
- hardcode_minus_L=yes
- if test "$with_gcc" = yes && test -z "$link_static_flag"; then
- # Neither direct hardcoding nor static linking is supported with a
- # broken collect2.
- hardcode_direct=unsupported
- fi
- ;;
-
- aix4*)
- allow_undefined_flag=unsupported
- archive_cmds='$NM$libobjs | $global_symbol_pipe | sed '\''s/.* //'\'' > $lib.exp;$CC -o $objdir/$soname$libobjs ${wl}-bE:$lib.exp ${wl}-bM:SRE ${wl}-bnoentry;$AR cru $lib $objdir/$soname'
- hardcode_direct=yes
- hardcode_minus_L=yes
- ;;
-
- amigaos*)
- archive_cmds='$rm $objdir/a2ixlibrary.data;$echo "#define NAME $libname" > $objdir/a2ixlibrary.data;$echo "#define LIBRARY_ID 1" >> $objdir/a2ixlibrary.data;$echo "#define VERSION $major" >> $objdir/a2ixlibrary.data;$echo "#define REVISION $revision" >> $objdir/a2ixlibrary.data;$AR cru $lib$libobjs;$RANLIB $lib;(cd $objdir && a2ixlibrary -32)'
- hardcode_libdir_flag_spec='-L$libdir'
- hardcode_minus_L=yes
- ;;
-
- # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
- # support. Future versions do this automatically, but an explicit c++rt0.o
- # does not break anything, and helps significantly (at the cost of a little
- # extra space).
- freebsd2.2*)
- archive_cmds='$LD -Bshareable -o $lib$libobjs /usr/lib/c++rt0.o'
- hardcode_libdir_flag_spec='-R$libdir'
- hardcode_direct=yes
- hardcode_minus_L=yes
- hardcode_shlibpath_var=no
- ;;
-
- # Unfortunately, older versions of FreeBSD 2 do not have this feature.
- freebsd2*)
- archive_cmds='$LD -Bshareable -o $lib$libobjs'
- hardcode_direct=yes
- hardcode_minus_L=yes
- hardcode_shlibpath_var=no
- ;;
-
- # FreeBSD 3, at last, uses gcc -shared to do shared libraries.
- freebsd3*)
- archive_cmds='$CC -shared -o $lib$libobjs'
- hardcode_libdir_flag_spec='-R$libdir'
- hardcode_direct=yes
- hardcode_minus_L=yes
- hardcode_shlibpath_var=no
- ;;
-
- hpux9*)
- archive_cmds='$rm $objdir/$soname;$LD -b +s +b $install_libdir -o $objdir/$soname$libobjs;mv $objdir/$soname $lib'
- hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
- hardcode_direct=yes
- hardcode_minus_L=yes
- export_dynamic_flag_spec='${wl}-E'
- ;;
-
- hpux10*)
- archive_cmds='$LD -b +h $soname +s +b $install_libdir -o $lib$libobjs'
- hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
- hardcode_direct=yes
- hardcode_minus_L=yes
- export_dynamic_flag_spec='${wl}-E'
- ;;
-
- irix5* | irix6*)
- archive_cmds='$LD -shared -o $lib -soname $soname -set_version $verstring$libobjs'
- hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
- ;;
-
- netbsd*)
- # Tested with NetBSD 1.2 ld
- archive_cmds='$LD -Bshareable -o $lib$libobjs'
- hardcode_libdir_flag_spec='-R$libdir'
- hardcode_direct=yes
- hardcode_shlibpath_var=no
- ;;
-
- openbsd*)
- archive_cmds='$LD -Bshareable -o $lib$libobjs'
- hardcode_libdir_flag_spec='-R$libdir'
- hardcode_direct=yes
- hardcode_shlibpath_var=no
- ;;
-
- os2*)
- hardcode_libdir_flag_spec='-L$libdir'
- hardcode_minus_L=yes
- allow_undefined_flag=unsupported
- archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $objdir/$libname.def;$echo "DESCRIPTION \"$libname\"" >> $objdir/$libname.def;$echo DATA >> $objdir/$libname.def;$echo " SINGLE NONSHARED" >> $objdir/$libname.def;$echo EXPORTS >> $objdir/$libname.def;emxexp$libobjs >> $objdir/$libname.def;$CC -Zdll -Zcrtdll -o $lib$libobjs $objdir/$libname.def'
- old_archive_from_new_cmds='emximp -o $objdir/$libname.a $objdir/$libname.def'
- ;;
-
- osf3* | osf4*)
- allow_undefined_flag=' -expect_unresolved \*'
- archive_cmds='$LD -shared${allow_undefined_flag} -o $lib -soname $soname -set_version $verstring$libobjs$deplibs'
- hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
- hardcode_libdir_separator=:
- ;;
-
- sco3.2v5*)
- archive_cmds='$LD -G -o $lib$libobjs'
- hardcode_direct=yes
- ;;
-
- solaris2*)
- no_undefined_flag=' -z text'
- archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib$libobjs'
- hardcode_libdir_flag_spec='-R$libdir'
- hardcode_shlibpath_var=no
-
- # Solaris 2 before 2.5 hardcodes -L paths.
- case "$host_os" in
- solaris2.[0-4]*)
- hardcode_minus_L=yes
- ;;
- esac
- ;;
-
- sunos4*)
- if test "$with_gcc" = yes; then
- archive_cmds='$CC -shared -o $lib$libobjs'
- else
- archive_cmds='$LD -assert pure-text -Bstatic -o $lib$libobjs'
- fi
-
- if test "$with_gnu_ld" = yes; then
- export_dynamic_flag_spec='${wl}-export-dynamic'
- fi
- hardcode_libdir_flag_spec='-L$libdir'
- hardcode_direct=yes
- hardcode_minus_L=yes
- hardcode_shlibpath_var=no
- ;;
-
- uts4*)
- archive_cmds='$LD -G -h $soname -o $lib$libobjs'
- hardcode_libdir_flag_spec='-L$libdir'
- hardcode_direct=no
- hardcode_minus_L=no
- hardcode_shlibpath_var=no
- ;;
-
- *)
- ld_shlibs=no
- can_build_shared=no
- ;;
- esac
-fi
-echo "$ac_t$ld_shlibs" 1>&6
-
-if test -z "$NM"; then
- echo $ac_n "checking for BSD-compatible nm... $ac_c" 1>&6
- case "$NM" in
- /* | [A-Za-z]:\\*) ;; # Let the user override the test with a path.
- *)
- IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:"
- for ac_dir in /usr/ucb /usr/ccs/bin $PATH /bin; do
- test -z "$ac_dir" && ac_dir=.
- if test -f $ac_dir/nm; then
- # Check to see if the nm accepts a BSD-compat flag.
- # Adding the `sed 1q' prevents false positives on HP-UX, which says:
- # nm: unknown option "B" ignored
- if ($ac_dir/nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
- NM="$ac_dir/nm -B"
- elif ($ac_dir/nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
- NM="$ac_dir/nm -p"
- else
- NM="$ac_dir/nm"
- fi
- break
- fi
- done
- IFS="$ac_save_ifs"
- test -z "$NM" && NM=nm
- ;;
- esac
- echo "$ac_t$NM" 1>&6
-fi
-
-# Check for command to grab the raw symbol name followed by C symbol from nm.
-echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6
-
-# These are sane defaults that work on at least a few old systems.
-# [They come from Ultrix. What could be older than Ultrix?!! ;)]
-
-# Character class describing NM global symbol codes.
-symcode='[BCDEGRSTU]'
-
-# Regexp to match symbols that can be accessed directly from C.
-sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
-
-# Transform the above into a raw symbol and a C symbol.
-symxfrm='\1 \1'
-
-# Define system-specific variables.
-case "$host_os" in
-aix*)
- symcode='[BCDTU]'
- ;;
-irix*)
- # Cannot use undefined symbols on IRIX because inlined functions mess us up.
- symcode='[BCDEGRST]'
- ;;
-solaris2*)
- symcode='[BDTU]'
- ;;
-esac
-
-# If we're using GNU nm, then use its standard symbol codes.
-if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
- symcode='[ABCDGISTUW]'
-fi
-
-# Write the raw and C identifiers.
-global_symbol_pipe="sed -n -e 's/^.* $symcode $sympat$/$symxfrm/p'"
-
-# Check to see that the pipe works correctly.
-pipe_works=no
-$rm conftest*
-cat > conftest.c <<EOF
-#ifdef __cplusplus
-extern "C" {
-#endif
-char nm_test_var;
-void nm_test_func(){}
-#ifdef __cplusplus
-}
-#endif
-main(){nm_test_var='a';nm_test_func();return(0);}
-EOF
-
-echo "$progname:971: checking if global_symbol_pipe works" >&5
-if { (eval echo $progname:972: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.o; then
- # Now try to grab the symbols.
- nlist=conftest.nm
- if { echo "$progname:975: eval \"$NM conftest.o | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.o | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then
-
- # Try sorting and uniquifying the output.
- if sort "$nlist" | uniq > "$nlist"T; then
- mv -f "$nlist"T "$nlist"
- wcout=`wc "$nlist" 2>/dev/null`
- count=`$echo "X$wcout" | $Xsed -e 's/^[ ]*\([0-9][0-9]*\).*$/\1/'`
- (test "$count" -ge 0) 2>/dev/null || count=-1
- else
- rm -f "$nlist"T
- count=-1
- fi
-
- # Make sure that we snagged all the symbols we need.
- if egrep ' nm_test_var$' "$nlist" >/dev/null; then
- if egrep ' nm_test_func$' "$nlist" >/dev/null; then
- cat <<EOF > conftest.c
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-EOF
- # Now generate the symbol file.
- sed 's/^.* \(.*\)$/extern char \1;/' < "$nlist" >> conftest.c
-
- cat <<EOF >> conftest.c
-#if defined (__STDC__) && __STDC__
-# define __ptr_t void *
-#else
-# define __ptr_t char *
-#endif
-
-/* The number of symbols in dld_preloaded_symbols, -1 if unsorted. */
-int dld_preloaded_symbol_count = $count;
-
-/* The mapping between symbol names and symbols. */
-struct {
- char *name;
- __ptr_t address;
-}
-dld_preloaded_symbols[] =
-{
-EOF
- sed 's/^\(.*\) \(.*\)$/ {"\1", (__ptr_t) \&\2},/' < "$nlist" >> conftest.c
- cat <<\EOF >> conftest.c
- {0, (__ptr_t) 0}
-};
-
-#ifdef __cplusplus
-}
-#endif
-EOF
- # Now try linking the two files.
- mv conftest.o conftestm.o
- save_LIBS="$LIBS"
- save_CFLAGS="$CFLAGS"
- LIBS='conftestm.o'
- CFLAGS="$CFLAGS$no_builtin_flag"
- if { (eval echo $progname:1033: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
- pipe_works=yes
- else
- echo "$progname: failed program was:" >&5
- cat conftest.c >&5
- fi
- LIBS="$save_LIBS"
- else
- echo "cannot find nm_test_func in $nlist" >&5
- fi
- else
- echo "cannot find nm_test_var in $nlist" >&5
- fi
- else
- echo "cannot run $global_symbol_pipe" >&5
- fi
-else
- echo "$progname: failed program was:" >&5
- cat conftest.c >&5
-fi
-$rm conftest*
-
-# Do not use the global_symbol_pipe unless it works.
-echo "$ac_t$pipe_works" 1>&6
-test "$pipe_works" = yes || global_symbol_pipe=
-
-# Check hardcoding attributes.
-echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6
-hardcode_action=
-if test -n "$hardcode_libdir_flag_spec" || \
- test -n "$runpath_var"; then
-
- # We can hardcode non-existant directories.
- if test "$hardcode_direct" != no && \
- test "$hardcode_minus_L" != no && \
- test "$hardcode_shlibpath_var" != no; then
-
- # Linking always hardcodes the temporary library directory.
- hardcode_action=relink
- else
- # We can link without hardcoding, and we can hardcode nonexisting dirs.
- hardcode_action=immediate
- fi
-elif test "$hardcode_direct" != yes && \
- test "$hardcode_minus_L" != yes && \
- test "$hardcode_shlibpath_var" != yes; then
- # We cannot hardcode anything.
- hardcode_action=unsupported
-else
- # We can only hardcode existing directories.
- hardcode_action=relink
-fi
-echo "$ac_t$hardcode_action" 1>&6
-test "$hardcode_action" = unsupported && can_build_shared=no
-
-
-reload_flag=
-reload_cmds='$LD$reload_flag -o $output$reload_objs'
-echo $ac_n "checking for $LD option to reload object files... $ac_c" 1>&6
-# PORTME Some linker may need a different reload flag.
-reload_flag='-r'
-echo "$ac_t$reload_flag"
-test -n "$reload_flag" && reload_flag=" $reload_flag"
-
-# PORTME Fill in your ld.so characteristics
-library_names_spec=
-libname_spec='lib$name'
-soname_spec=
-postinstall_cmds=
-postuninstall_cmds=
-finish_cmds=
-finish_eval=
-shlibpath_var=
-version_type=none
-dynamic_linker="$host_os ld.so"
-
-echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6
-case "$host_os" in
-aix3* | aix4*)
- version_type=linux
- library_names_spec='${libname}${release}.so.$versuffix $libname.a'
- shlibpath_var=LIBPATH
-
- # AIX has no versioning support, so we append a major version to the name.
- soname_spec='${libname}${release}.so.$major'
- ;;
-
-amigaos*)
- library_names_spec='$libname.ixlibrary $libname.a'
- # Create ${libname}_ixlibrary.a entries in /sys/libs.
- finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
- ;;
-
-freebsd2* | freebsd3*)
- version_type=sunos
- library_names_spec='${libname}${release}.so.$versuffix $libname.so'
- finish_cmds='PATH="$PATH:/sbin" ldconfig -m $libdir'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-gnu*)
- version_type=sunos
- library_names_spec='${libname}${release}.so.$versuffix'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-hpux9* | hpux10*)
- # Give a soname corresponding to the major version so that dld.sl refuses to
- # link against other versions.
- dynamic_linker="$host_os dld.sl"
- version_type=sunos
- shlibpath_var=SHLIB_PATH
- library_names_spec='${libname}${release}.sl.$versuffix ${libname}${release}.sl.$major $libname.sl'
- soname_spec='${libname}${release}.sl.$major'
- # HP-UX runs *really* slowly unless shared libraries are mode 555.
- postinstall_cmds='chmod 555 $lib'
- ;;
-
-irix5* | irix6*)
- version_type=osf
- soname_spec='${libname}${release}.so'
- library_names_spec='${libname}${release}.so.$versuffix $libname.so'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-# No shared lib support for Linux oldld, aout, or coff.
-linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
- dynamic_linker=no
- ;;
-
-# This must be Linux ELF.
-linux-gnu*)
- version_type=linux
- library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
- soname_spec='${libname}${release}.so.$major'
- finish_cmds='PATH="$PATH:/sbin" ldconfig -n $libdir'
- shlibpath_var=LD_LIBRARY_PATH
-
- if test -f /lib/ld.so.1; then
- dynamic_linker='GNU ld.so'
- else
- # Only the GNU ld.so supports shared libraries on MkLinux.
- case "$host_cpu" in
- powerpc*) dynamic_linker=no ;;
- *) dynamic_linker='Linux ld.so' ;;
- esac
- fi
- ;;
-
-netbsd* | openbsd*)
- version_type=sunos
- library_names_spec='${libname}${release}.so.$versuffix'
- finish_cmds='PATH="$PATH:/sbin" ldconfig -m $libdir'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-os2*)
- libname_spec='$name'
- library_names_spec='$libname.dll $libname.a'
- dynamic_linker='OS/2 ld.exe'
- shlibpath_var=LIBPATH
- ;;
-
-osf3* | osf4*)
- version_type=osf
- soname_spec='${libname}${release}.so'
- library_names_spec='${libname}${release}.so.$versuffix $libname.so'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-sco3.2v5*)
- version_type=osf
- soname_spec='${libname}${release}.so.$major'
- library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-solaris2*)
- version_type=linux
- library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
- soname_spec='${libname}${release}.so.$major'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-sunos4*)
- version_type=sunos
- library_names_spec='${libname}${release}.so.$versuffix'
- finish_cmds='PATH="$PATH:/usr/etc" ldconfig $libdir'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-sysv4.2uw2*)
- version_type=linux
- library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
- soname_spec='${libname}${release}.so.$major'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-uts4*)
- version_type=linux
- library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
- soname_spec='${libname}${release}.so.$major'
- shlibpath_var=LD_LIBRARY_PATH
- ;;
-
-*)
- dynamic_linker=no
- ;;
-esac
-echo "$ac_t$dynamic_linker"
-test "$dynamic_linker" = no && can_build_shared=no
-
-# Report the final consequences.
-echo "checking if libtool supports shared libraries... $can_build_shared" 1>&6
-
-echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6
-test "$can_build_shared" = "no" && enable_shared=no
-
-# On AIX, shared libraries and static libraries use the same namespace, and
-# are all built from PIC.
-case "$host_os" in
-aix*)
- test "$enable_shared" = yes && enable_static=no
- if test -n "$RANLIB"; then
- archive_cmds="$archive_cmds;\$RANLIB \$lib"
- postinstall_cmds='$RANLIB $lib'
- fi
- ;;
-esac
-
-echo "$ac_t$enable_shared" 1>&6
-
-# Make sure either enable_shared or enable_static is yes.
-test "$enable_shared" = yes || enable_static=yes
-
-echo "checking whether to build static libraries... $enable_static" 1>&6
-
-echo $ac_n "checking for objdir... $ac_c" 1>&6
-rm -f .libs 2>/dev/null
-mkdir .libs 2>/dev/null
-if test -d .libs; then
- objdir=.libs
-else
- # MS-DOS does not allow filenames that begin with a dot.
- objdir=_libs
-fi
-rmdir .libs 2>/dev/null
-echo "$ac_t$objdir" 1>&6
-
-# Copy echo and quote the copy, instead of the original, because it is
-# used later.
-ltecho="$echo"
-
-# Now quote all the things that may contain metacharacters.
-for var in ltecho old_CC old_CFLAGS old_CPPFLAGS old_LD old_NM old_RANLIB \
- old_LN_S AR CC LD LN_S NM reload_flag reload_cmds wl pic_flag \
- link_static_flag no_builtin_flag export_dynamic_flag_spec \
- libname_spec library_names_spec soname_spec RANLIB \
- old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
- old_postuninstall_cmds archive_cmds postinstall_cmds postuninstall_cmds \
- allow_undefined_flag no_undefined_flag \
- finish_cmds finish_eval global_symbol_pipe \
- hardcode_libdir_flag_spec hardcode_libdir_separator; do
-
- case "$var" in
- reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
- old_postinstall_cmds | old_postuninstall_cmds | archive_cmds | \
- postinstall_cmds | postuninstall_cmds | finish_cmds)
- # Double-quote double-evaled strings.
- eval "$var=\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\"\`"
- ;;
- *)
- eval "$var=\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`"
- ;;
- esac
-done
-
-ofile=libtool
-trap "$rm $ofile; exit 1" 1 2 15
-echo creating $ofile
-$rm $ofile
-cat <<EOF > $ofile
-#! /bin/sh
-
-# libtool - Provide generalized library-building support services.
-# Generated automatically by $PROGRAM - GNU $PACKAGE $VERSION
-# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh.
-#
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This program was configured as follows,
-# on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
-#
-# CC="$old_CC" CFLAGS="$old_CFLAGS" CPPFLAGS="$old_CPPFLAGS" \\
-# LD="$old_LD" NM="$old_NM" RANLIB="$old_RANLIB" LN_S="$old_LN_S" \\
-# $0$ltconfig_args
-#
-# Compiler and other test output produced by $progname, useful for
-# debugging $progname, is in ./config.log if it exists.
-
-# Sed that helps us avoid accidentally triggering echo(1) options like -n.
-Xsed="sed -e s/^X//"
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test "\${CDPATH+set}" = set; then CDPATH=; export CDPATH; fi
-
-# An echo program that does not interpret backslashes.
-echo="$ltecho"
-
-# The version of $progname that generated this script.
-LTCONFIG_VERSION="$VERSION"
-
-# Shell to use when invoking shell scripts.
-SHELL=${CONFIG_SHELL-/bin/sh}
-
-# Whether or not to build libtool libraries.
-build_libtool_libs=$enable_shared
-
-# Whether or not to build old-style libraries.
-build_old_libs=$enable_static
-
-# The host system.
-host_alias="$host_alias"
-host="$host"
-
-# The archiver.
-AR="$AR"
-
-# The default C compiler.
-CC="$CC"
-
-# The linker used to build libraries.
-LD="$LD"
-
-# Whether we need hard or soft links.
-LN_S="$LN_S"
-
-# A BSD-compatible nm program.
-NM="$NM"
-
-# The name of the directory that contains temporary libtool files.
-objdir="$objdir"
-
-# How to create reloadable object files.
-reload_flag="$reload_flag"
-reload_cmds="$reload_cmds"
-
-# How to pass a linker flag through the compiler.
-wl="$wl"
-
-# Additional compiler flags for building library objects.
-pic_flag="$pic_flag"
-
-# Compiler flag to prevent dynamic linking.
-link_static_flag="$link_static_flag"
-
-# Compiler flag to turn off builtin functions.
-no_builtin_flag="$no_builtin_flag"
-
-# Compiler flag to allow reflexive dlopens.
-export_dynamic_flag_spec="$export_dynamic_flag_spec"
-
-# Library versioning type.
-version_type=$version_type
-
-# Format of library name prefix.
-libname_spec="$libname_spec"
-
-# List of archive names. First name is the real one, the rest are links.
-# The last name is the one that the linker finds with -lNAME.
-library_names_spec="$library_names_spec"
-
-# The coded name of the library, if different from the real name.
-soname_spec="$soname_spec"
-
-# Commands used to build and install an old-style archive.
-RANLIB="$RANLIB"
-old_archive_cmds="$old_archive_cmds"
-old_postinstall_cmds="$old_postinstall_cmds"
-old_postuninstall_cmds="$old_postuninstall_cmds"
-
-# Create an old-style archive from a shared archive.
-old_archive_from_new_cmds="$old_archive_from_new_cmds"
-
-# Commands used to build and install a shared archive.
-archive_cmds="$archive_cmds"
-postinstall_cmds="$postinstall_cmds"
-postuninstall_cmds="$postuninstall_cmds"
-
-# Flag that allows shared libraries with undefined symbols to be built.
-allow_undefined_flag="$allow_undefined_flag"
-
-# Flag that forces no undefined symbols.
-no_undefined_flag="$no_undefined_flag"
-
-# Commands used to finish a libtool library installation in a directory.
-finish_cmds="$finish_cmds"
-
-# Same as above, but a single script fragment to be evaled but not shown.
-finish_eval="$finish_eval"
-
-# Take the output of nm and produce a listing of raw symbols and C names.
-global_symbol_pipe="$global_symbol_pipe"
-
-# This is the shared library runtime path variable.
-runpath_var=$runpath_var
-
-# This is the shared library path variable.
-shlibpath_var=$shlibpath_var
-
-# How to hardcode a shared library path into an executable.
-hardcode_action=$hardcode_action
-
-# Flag to hardcode \$libdir into a binary during linking.
-# This must work even if \$libdir does not exist.
-hardcode_libdir_flag_spec="$hardcode_libdir_flag_spec"
-
-# Whether we need a single -rpath flag with a separated argument.
-hardcode_libdir_separator="$hardcode_libdir_separator"
-
-# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
-# resulting binary.
-hardcode_direct=$hardcode_direct
-
-# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
-# resulting binary.
-hardcode_minus_L=$hardcode_minus_L
-
-# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
-# the resulting binary.
-hardcode_shlibpath_var=$hardcode_shlibpath_var
-
-EOF
-
-case "$host_os" in
-aix3*)
- cat <<\EOF >> $ofile
-# AIX sometimes has problems with the GCC collect2 program. For some
-# reason, if we set the COLLECT_NAMES environment variable, the problems
-# vanish in a puff of smoke.
-if test "${COLLECT_NAMES+set}" != set; then
- COLLECT_NAMES=
- export COLLECT_NAMES
-fi
-
-EOF
- ;;
-esac
-
-# Append the ltmain.sh script.
-cat "$ltmain" >> $ofile || (rm -f $ofile; exit 1)
-
-chmod +x $ofile
-exit 0
-
-# Local Variables:
-# mode:shell-script
-# sh-indentation:2
-# End:
diff --git a/ltmain.sh b/ltmain.sh
deleted file mode 100644
index e9350b3..0000000
--- a/ltmain.sh
+++ /dev/null
@@ -1,2453 +0,0 @@
-# ltmain.sh - Provide generalized library-building support services.
-# NOTE: Changing this file will not affect anything until you rerun ltconfig.
-#
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# The name of this program.
-progname=`$echo "$0" | sed 's%^.*/%%'`
-modename="$progname"
-
-# Constants.
-PROGRAM=ltmain.sh
-PACKAGE=libtool
-VERSION=1.2
-
-default_mode=
-help="Try \`$progname --help' for more information."
-magic="%%%MAGIC variable%%%"
-mkdir="mkdir"
-mv="mv -f"
-rm="rm -f"
-
-# Sed substitution that helps us do robust quoting. It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
-sed_quote_subst='s/\([\\`\\"$\\\\]\)/\\\1/g'
-
-# NLS nuisances.
-# Only set LANG and LC_ALL to C if already set.
-# These must not be set unconditionally because not all systems understand
-# e.g. LANG=C (notably SCO).
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LANG+set}" = set; then LANG=C; export LANG; fi
-
-if test "$LTCONFIG_VERSION" != "$VERSION"; then
- echo "$modename: ltconfig version \`$LTCONFIG_VERSION' does not match $PROGRAM version \`$VERSION'" 1>&2
- echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
- exit 1
-fi
-
-if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
- echo "$modename: not configured to build any kind of library" 1>&2
- echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
- exit 1
-fi
-
-# Global variables.
-mode=$default_mode
-nonopt=
-prev=
-prevopt=
-run=
-show="$echo"
-show_help=
-execute_dlfiles=
-
-# Parse our command line options once, thoroughly.
-while test $# -gt 0
-do
- arg="$1"
- shift
-
- case "$arg" in
- -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;;
- *) optarg= ;;
- esac
-
- # If the previous option needs an argument, assign it.
- if test -n "$prev"; then
- case "$prev" in
- execute_dlfiles)
- eval "$prev=\"\$$prev \$arg\""
- ;;
- *)
- eval "$prev=\$arg"
- ;;
- esac
-
- prev=
- prevopt=
- continue
- fi
-
- # Have we seen a non-optional argument yet?
- case "$arg" in
- --help)
- show_help=yes
- ;;
-
- --version)
- echo "$PROGRAM (GNU $PACKAGE) $VERSION"
- exit 0
- ;;
-
- --dry-run | -n)
- run=:
- ;;
-
- --features)
- echo "host: $host"
- if test "$build_libtool_libs" = yes; then
- echo "enable shared libraries"
- else
- echo "disable shared libraries"
- fi
- if test "$build_old_libs" = yes; then
- echo "enable static libraries"
- else
- echo "disable static libraries"
- fi
- exit 0
- ;;
-
- --finish) mode="finish" ;;
-
- --mode) prevopt="--mode" prev=mode ;;
- --mode=*) mode="$optarg" ;;
-
- --quiet | --silent)
- show=:
- ;;
-
- -dlopen)
- prevopt="-dlopen"
- prev=execute_dlfiles
- ;;
-
- -*)
- $echo "$modename: unrecognized option \`$arg'" 1>&2
- $echo "$help" 1>&2
- exit 1
- ;;
-
- *)
- nonopt="$arg"
- break
- ;;
- esac
-done
-
-if test -n "$prevopt"; then
- $echo "$modename: option \`$prevopt' requires an argument" 1>&2
- $echo "$help" 1>&2
- exit 1
-fi
-
-if test -z "$show_help"; then
-
- # Infer the operation mode.
- if test -z "$mode"; then
- case "$nonopt" in
- *cc | *++ | gcc* | *-gcc*)
- mode=link
- for arg
- do
- case "$arg" in
- -c)
- mode=compile
- break
- ;;
- esac
- done
- ;;
- *db | *dbx)
- mode=execute
- ;;
- *install*|cp|mv)
- mode=install
- ;;
- *rm)
- mode=uninstall
- ;;
- *)
- # If we have no mode, but dlfiles were specified, then do execute mode.
- test -n "$execute_dlfiles" && mode=execute
-
- # Just use the default operation mode.
- if test -z "$mode"; then
- if test -n "$nonopt"; then
- $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
- else
- $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
- fi
- fi
- ;;
- esac
- fi
-
- # Only execute mode is allowed to have -dlopen flags.
- if test -n "$execute_dlfiles" && test "$mode" != execute; then
- $echo "$modename: unrecognized option \`-dlopen'" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- # Change the help message to a mode-specific one.
- generic_help="$help"
- help="Try \`$modename --help --mode=$mode' for more information."
-
- # These modes are in order of execution frequency so that they run quickly.
- case "$mode" in
- # libtool compile mode
- compile)
- modename="$modename: compile"
- # Get the compilation command and the source file.
- base_compile=
- lastarg=
- srcfile="$nonopt"
- suppress_output=
-
- for arg
- do
- # Accept any command-line options.
- case "$arg" in
- -o)
- $echo "$modename: you cannot specify the output filename with \`-o'" 1>&2
- $echo "$help" 1>&2
- exit 1
- ;;
-
- -static)
- build_libtool_libs=no
- build_old_libs=yes
- continue
- ;;
- esac
-
- # Accept the current argument as the source file.
- lastarg="$srcfile"
- srcfile="$arg"
-
- # Aesthetically quote the previous argument.
-
- # Backslashify any backslashes, double quotes, and dollar signs.
- # These are the only characters that are still specially
- # interpreted inside of double-quoted scrings.
- lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"`
-
- # Double-quote args containing other shell metacharacters.
- # Many Bourne shells cannot handle close brackets correctly in scan
- # sets, so we specify it separately.
- case "$lastarg" in
- *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
- lastarg="\"$lastarg\""
- ;;
- esac
-
- # Add the previous argument to base_compile.
- if test -z "$base_compile"; then
- base_compile="$lastarg"
- else
- base_compile="$base_compile $lastarg"
- fi
- done
-
- # Get the name of the library object.
- libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
-
- # Recognize several different file suffixes.
- xform='[cCFSfms]'
- case "$libobj" in
- *.ada) xform=ada ;;
- *.adb) xform=adb ;;
- *.ads) xform=ads ;;
- *.asm) xform=asm ;;
- *.c++) xform=c++ ;;
- *.cc) xform=cc ;;
- *.cpp) xform=cpp ;;
- *.cxx) xform=cxx ;;
- *.f90) xform=f90 ;;
- *.for) xform=for ;;
- esac
-
- libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"`
-
- case "$libobj" in
- *.lo) obj=`$echo "X$libobj" | $Xsed -e 's/\.lo$/.o/'` ;;
- *)
- $echo "$modename: cannot determine name of library object from \`$srcfile'" 1>&2
- exit 1
- ;;
- esac
-
- if test -z "$base_compile"; then
- $echo "$modename: you must specify a compilation command" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- # Delete any leftover library objects.
- if test "$build_old_libs" = yes; then
- $run $rm $obj $libobj
- trap "$run $rm $obj $libobj; exit 1" 1 2 15
- else
- $run $rm $libobj
- trap "$run $rm $libobj; exit 1" 1 2 15
- fi
-
- # Only build a PIC object if we are building libtool libraries.
- if test "$build_libtool_libs" = yes; then
- # Without this assignment, base_compile gets emptied.
- fbsd_hideous_sh_bug=$base_compile
-
- # All platforms use -DPIC, to notify preprocessed assembler code.
- $show "$base_compile$pic_flag -DPIC $srcfile"
- if $run eval "$base_compile\$pic_flag -DPIC \$srcfile"; then :
- else
- test -n "$obj" && $run $rm $obj
- exit 1
- fi
-
- # If we have no pic_flag, then copy the object into place and finish.
- if test -z "$pic_flag"; then
- $show "$LN_S $obj $libobj"
- $run $LN_S $obj $libobj
- exit $?
- fi
-
- # Just move the object, then go on to compile the next one
- $show "$mv $obj $libobj"
- $run $mv $obj $libobj || exit 1
-
- # Allow error messages only from the first compilation.
- suppress_output=' >/dev/null 2>&1'
- fi
-
- # Only build a position-dependent object if we build old libraries.
- if test "$build_old_libs" = yes; then
- # Suppress compiler output if we already did a PIC compilation.
- $show "$base_compile $srcfile$suppress_output"
- if $run eval "$base_compile \$srcfile$suppress_output"; then :
- else
- $run $rm $obj $libobj
- exit 1
- fi
- fi
-
- # Create an invalid libtool object if no PIC, so that we do not
- # accidentally link it into a program.
- if test "$build_libtool_libs" != yes; then
- $show "echo timestamp > $libobj"
- $run eval "echo timestamp > \$libobj" || exit $?
- fi
-
- exit 0
- ;;
-
- # libtool link mode
- link)
- modename="$modename: link"
- CC="$nonopt"
- allow_undefined=yes
- compile_command="$CC"
- finalize_command="$CC"
-
- compile_shlibpath=
- finalize_shlibpath=
- deplibs=
- dlfiles=
- dlprefiles=
- export_dynamic=no
- hardcode_libdirs=
- libobjs=
- link_against_libtool_libs=
- ltlibs=
- objs=
- prev=
- prevarg=
- release=
- rpath=
- perm_rpath=
- temp_rpath=
- vinfo=
-
- # We need to know -static, to get the right output filenames.
- for arg
- do
- case "$arg" in
- -all-static | -static)
- if test "X$arg" = "X-all-static" && test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
- $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2
- fi
- build_libtool_libs=no
- build_old_libs=yes
- break
- ;;
- esac
- done
-
- # See if our shared archives depend on static archives.
- test -n "$old_archive_from_new_cmds" && build_old_libs=yes
-
- # Go through the arguments, transforming them on the way.
- for arg
- do
- # If the previous option needs an argument, assign it.
- if test -n "$prev"; then
- case "$prev" in
- output)
- compile_command="$compile_command @OUTPUT@"
- finalize_command="$finalize_command @OUTPUT@"
- ;;
- esac
-
- case "$prev" in
- dlfiles|dlprefiles)
- case "$arg" in
- *.la | *.lo) ;; # We handle these cases below.
- *)
- dlprefiles="$dlprefiles $arg"
- test "$prev" = dlfiles && dlfiles="$dlfiles $arg"
- prev=
- ;;
- esac
- ;;
- release)
- release="-$arg"
- prev=
- continue
- ;;
- rpath)
- rpath="$rpath $arg"
- prev=
- continue
- ;;
- *)
- eval "$prev=\"\$arg\""
- prev=
- continue
- ;;
- esac
- fi
-
- prevarg="$arg"
-
- case "$arg" in
- -all-static)
- if test -n "$link_static_flag"; then
- compile_command="$compile_command $link_static_flag"
- finalize_command="$finalize_command $link_static_flag"
- fi
- continue
- ;;
-
- -allow-undefined)
- # FIXME: remove this flag sometime in the future.
- $echo "$modename: \`-allow-undefined' is deprecated because it is the default" 1>&2
- continue
- ;;
-
- -dlopen)
- prev=dlfiles
- continue
- ;;
-
- -dlpreopen)
- prev=dlprefiles
- continue
- ;;
-
- -export-dynamic)
- if test "$export_dynamic" != yes; then
- export_dynamic=yes
- if test -n "$export_dynamic_flag_spec"; then
- eval arg=\"$export_dynamic_flag_spec\"
- else
- arg=
- fi
-
- # Add the symbol object into the linking commands.
- compile_command="$compile_command @SYMFILE@"
- finalize_command="$finalize_command @SYMFILE@"
- fi
- ;;
-
- -L*)
- dir=`$echo "X$arg" | $Xsed -e 's%^-L\(.*\)$%\1%'`
- case "$dir" in
- /* | [A-Za-z]:\\*)
- # Add the corresponding hardcode_libdir_flag, if it is not identical.
- ;;
- *)
- $echo "$modename: \`-L$dir' cannot specify a relative directory" 1>&2
- exit 1
- ;;
- esac
- deplibs="$deplibs $arg"
- ;;
-
- -l*) deplibs="$deplibs $arg" ;;
-
- -no-undefined)
- allow_undefined=no
- continue
- ;;
-
- -o) prev=output ;;
-
- -release)
- prev=release
- continue
- ;;
-
- -rpath)
- prev=rpath
- continue
- ;;
-
- -static)
- # If we have no pic_flag, then this is the same as -all-static.
- if test -z "$pic_flag" && test -n "$link_static_flag"; then
- compile_command="$compile_command $link_static_flag"
- finalize_command="$finalize_command $link_static_flag"
- fi
- continue
- ;;
-
- -version-info)
- prev=vinfo
- continue
- ;;
-
- # Some other compiler flag.
- -* | +*)
- # Unknown arguments in both finalize_command and compile_command need
- # to be aesthetically quoted because they are evaled later.
- arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
- case "$arg" in
- *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
- arg="\"$arg\""
- ;;
- esac
- ;;
-
- *.o | *.a)
- # A standard object.
- objs="$objs $arg"
- ;;
-
- *.lo)
- # A library object.
- if test "$prev" = dlfiles; then
- dlfiles="$dlfiles $arg"
- if test "$build_libtool_libs" = yes; then
- prev=
- continue
- else
- # If libtool objects are unsupported, then we need to preload.
- prev=dlprefiles
- fi
- fi
-
- if test "$prev" = dlprefiles; then
- # Preload the old-style object.
- dlprefiles="$dlprefiles "`$echo "X$arg" | $Xsed -e 's/\.lo$/\.o/'`
- prev=
- fi
- libobjs="$libobjs $arg"
- ;;
-
- *.la)
- # A libtool-controlled library.
-
- dlname=
- libdir=
- library_names=
- old_library=
-
- # Check to see that this really is a libtool archive.
- if (sed -e '2q' $arg | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
- else
- $echo "$modename: \`$arg' is not a valid libtool archive" 1>&2
- exit 1
- fi
-
- # If there is no directory component, then add one.
- case "$arg" in
- */* | *\\*) . $arg ;;
- *) . ./$arg ;;
- esac
-
- if test -z "$libdir"; then
- $echo "$modename: \`$arg' contains no -rpath information" 1>&2
- exit 1
- fi
-
- # Get the name of the library we link against.
- linklib=
- for l in $old_library $library_names; do
- linklib="$l"
- done
-
- if test -z "$linklib"; then
- $echo "$modename: cannot find name of link library for \`$arg'" 1>&2
- exit 1
- fi
-
- # Find the relevant object directory and library name.
- name=`$echo "X$arg" | $Xsed -e 's%^.*/%%' -e 's/\.la$//' -e 's/^lib//'`
- dir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
- if test "X$dir" = "X$arg"; then
- dir="$objdir"
- else
- dir="$dir/$objdir"
- fi
-
- # This library was specified with -dlopen.
- if test "$prev" = dlfiles; then
- dlfiles="$dlfiles $arg"
- if test -z "$dlname"; then
- # If there is no dlname, we need to preload.
- prev=dlprefiles
- else
- # We should not create a dependency on this library, but we
- # may need any libraries it requires.
- compile_command="$compile_command$dependency_libs"
- finalize_command="$finalize_command$dependency_libs"
- prev=
- continue
- fi
- fi
-
- # The library was specified with -dlpreopen.
- if test "$prev" = dlprefiles; then
- # Prefer using a static library (so that no silly _DYNAMIC symbols
- # are required to link).
- if test -n "$old_library"; then
- dlprefiles="$dlprefiles $dir/$old_library"
- else
- dlprefiles="$dlprefiles $dir/$linklib"
- fi
- prev=
- fi
-
- if test "$build_libtool_libs" = yes && test -n "$library_names"; then
- link_against_libtool_libs="$link_against_libtool_libs $arg"
- if test -n "$shlibpath_var"; then
- # Make sure the rpath contains only unique directories.
- case "$temp_rpath " in
- *" $dir "*) ;;
- *) temp_rpath="$temp_rpath $dir" ;;
- esac
- fi
-
- # This is the magic to use -rpath.
- if test -n "$hardcode_libdir_flag_spec"; then
- if test -n "$hardcode_libdir_separator"; then
- if test -z "$hardcode_libdirs"; then
- # Put the magic libdir with the hardcode flag.
- hardcode_libdirs="$libdir"
- libdir="@HARDCODE_LIBDIRS@"
- else
- # Just accumulate the unique libdirs.
- case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
- *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
- ;;
- *)
- hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
- ;;
- esac
- libdir=
- fi
- fi
-
- if test -n "$libdir"; then
- eval flag=\"$hardcode_libdir_flag_spec\"
-
- compile_command="$compile_command $flag"
- finalize_command="$finalize_command $flag"
- fi
- elif test -n "$runpath_var"; then
- # Do the same for the permanent run path.
- case "$perm_rpath " in
- *" $libdir "*) ;;
- *) perm_rpath="$perm_rpath $libdir" ;;
- esac
- fi
-
-
- case "$hardcode_action" in
- immediate)
- if test "$hardcode_direct" = no; then
- compile_command="$compile_command $dir/$linklib"
- elif test "$hardcode_minus_L" = no; then
- compile_command="$compile_command -L$dir -l$name"
- elif test "$hardcode_shlibpath_var" = no; then
- compile_shlibpath="$compile_shlibpath$dir:"
- compile_command="$compile_command -l$name"
- fi
- ;;
-
- relink)
- # We need an absolute path.
- case "$dir" in
- /* | [A-Za-z]:\\*) ;;
- *)
- absdir=`cd "$dir" && pwd`
- if test -z "$absdir"; then
- $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
- exit 1
- fi
- dir="$absdir"
- ;;
- esac
-
- if test "$hardcode_direct" = yes; then
- compile_command="$compile_command $dir/$linklib"
- elif test "$hardcode_minus_L" = yes; then
- compile_command="$compile_command -L$dir -l$name"
- elif test "$hardcode_shlibpath_var" = yes; then
- compile_shlibpath="$compile_shlibpath$dir:"
- compile_command="$compile_command -l$name"
- fi
- ;;
-
- *)
- $echo "$modename: \`$hardcode_action' is an unknown hardcode action" 1>&2
- exit 1
- ;;
- esac
-
- # Finalize command for both is simple: just hardcode it.
- if test "$hardcode_direct" = yes; then
- finalize_command="$finalize_command $libdir/$linklib"
- elif test "$hardcode_minus_L" = yes; then
- finalize_command="$finalize_command -L$libdir -l$name"
- elif test "$hardcode_shlibpath_var" = yes; then
- finalize_shlibpath="$finalize_shlibpath$libdir:"
- finalize_command="$finalize_command -l$name"
- else
- # We cannot seem to hardcode it, guess we'll fake it.
- finalize_command="$finalize_command -L$libdir -l$name"
- fi
- else
- # Transform directly to old archives if we don't build new libraries.
- if test -n "$pic_flag" && test -z "$old_library"; then
- $echo "$modename: cannot find static library for \`$arg'" 1>&2
- exit 1
- fi
-
- # Here we assume that one of hardcode_direct or hardcode_minus_L
- # is not unsupported. This is valid on all known static and
- # shared platforms.
- if test "$hardcode_direct" != unsupported; then
- test -n "$old_library" && linklib="$old_library"
- compile_command="$compile_command $dir/$linklib"
- finalize_command="$finalize_command $dir/$linklib"
- else
- compile_command="$compile_command -L$dir -l$name"
- finalize_command="$finalize_command -L$dir -l$name"
- fi
- fi
-
- # Add in any libraries that this one depends upon.
- compile_command="$compile_command$dependency_libs"
- finalize_command="$finalize_command$dependency_libs"
- continue
- ;;
-
- # Some other compiler argument.
- *)
- # Unknown arguments in both finalize_command and compile_command need
- # to be aesthetically quoted because they are evaled later.
- arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
- case "$arg" in
- *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
- arg="\"$arg\""
- ;;
- esac
- ;;
- esac
-
- # Now actually substitute the argument into the commands.
- if test -n "$arg"; then
- compile_command="$compile_command $arg"
- finalize_command="$finalize_command $arg"
- fi
- done
-
- if test -n "$prev"; then
- $echo "$modename: the \`$prevarg' option requires an argument" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- if test -n "$vinfo" && test -n "$release"; then
- $echo "$modename: you cannot specify both \`-version-info' and \`-release'" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- oldlib=
- oldobjs=
- case "$output" in
- "")
- $echo "$modename: you must specify an output file" 1>&2
- $echo "$help" 1>&2
- exit 1
- ;;
-
- */* | *\\*)
- $echo "$modename: output file \`$output' must have no directory components" 1>&2
- exit 1
- ;;
-
- *.a)
- # Now set the variables for building old libraries.
- build_libtool_libs=no
- build_old_libs=yes
- oldlib="$output"
- $show "$rm $oldlib"
- $run $rm $oldlib
- ;;
-
- *.la)
- # Make sure we only generate libraries of the form `libNAME.la'.
- case "$output" in
- lib*) ;;
- *)
- $echo "$modename: libtool library \`$arg' must begin with \`lib'" 1>&2
- $echo "$help" 1>&2
- exit 1
- ;;
- esac
-
- name=`$echo "X$output" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
- eval libname=\"$libname_spec\"
-
- # All the library-specific variables (install_libdir is set above).
- library_names=
- old_library=
- dlname=
- current=0
- revision=0
- age=0
-
- if test -n "$objs"; then
- $echo "$modename: cannot build libtool library \`$output' from non-libtool objects:$objs" 2>&1
- exit 1
- fi
-
- # How the heck are we supposed to write a wrapper for a shared library?
- if test -n "$link_against_libtool_libs"; then
- $echo "$modename: libtool library \`$output' may not depend on uninstalled libraries:$link_against_libtool_libs" 1>&2
- exit 1
- fi
-
- if test -n "$dlfiles$dlprefiles"; then
- $echo "$modename: warning: \`-dlopen' is ignored while creating libtool libraries" 1>&2
- # Nullify the symbol file.
- compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
- finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
- fi
-
- if test -z "$rpath"; then
- $echo "$modename: you must specify an installation directory with \`-rpath'" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- set dummy $rpath
- if test $# -gt 2; then
- $echo "$modename: warning: ignoring multiple \`-rpath's for a libtool library" 1>&2
- fi
- install_libdir="$2"
-
- # Parse the version information argument.
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=':'
- set dummy $vinfo
- IFS="$save_ifs"
-
- if test -n "$5"; then
- $echo "$modename: too many parameters to \`-version-info'" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- test -n "$2" && current="$2"
- test -n "$3" && revision="$3"
- test -n "$4" && age="$4"
-
- # Check that each of the things are valid numbers.
- case "$current" in
- 0 | [1-9] | [1-9][0-9]*) ;;
- *)
- $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
- $echo "$modename: \`$vinfo' is not valid version information" 1>&2
- exit 1
- ;;
- esac
-
- case "$revision" in
- 0 | [1-9] | [1-9][0-9]*) ;;
- *)
- $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
- $echo "$modename: \`$vinfo' is not valid version information" 1>&2
- exit 1
- ;;
- esac
-
- case "$age" in
- 0 | [1-9] | [1-9][0-9]*) ;;
- *)
- $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
- $echo "$modename: \`$vinfo' is not valid version information" 1>&2
- exit 1
- ;;
- esac
-
- if test $age -gt $current; then
- $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
- $echo "$modename: \`$vinfo' is not valid version information" 1>&2
- exit 1
- fi
-
- # Calculate the version variables.
- version_vars="version_type current age revision"
- case "$version_type" in
- none) ;;
-
- linux)
- version_vars="$version_vars major versuffix"
- major=`expr $current - $age`
- versuffix="$major.$age.$revision"
- ;;
-
- osf)
- version_vars="$version_vars versuffix verstring"
- major=`expr $current - $age`
- versuffix="$current.$age.$revision"
- verstring="$versuffix"
-
- # Add in all the interfaces that we are compatible with.
- loop=$age
- while test $loop != 0; do
- iface=`expr $current - $loop`
- loop=`expr $loop - 1`
- verstring="$verstring:${iface}.0"
- done
-
- # Make executables depend on our current version.
- verstring="$verstring:${current}.0"
- ;;
-
- sunos)
- version_vars="$version_vars major versuffix"
- major="$current"
- versuffix="$current.$revision"
- ;;
-
- *)
- $echo "$modename: unknown library version type \`$version_type'" 1>&2
- echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
- exit 1
- ;;
- esac
-
- # Create the output directory, or remove our outputs if we need to.
- if test -d $objdir; then
- $show "$rm $objdir/$output $objdir/$libname.* $objdir/${libname}${release}.*"
- $run $rm $objdir/$output $objdir/$libname.* $objdir/${libname}${release}.*
- else
- $show "$mkdir $objdir"
- $run $mkdir $objdir
- status=$?
- if test $status -eq 0 || test -d $objdir; then :
- else
- exit $status
- fi
- fi
-
- # Check to see if the archive will have undefined symbols.
- if test "$allow_undefined" = yes; then
- if test "$allow_undefined_flag" = unsupported; then
- $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
- build_libtool_libs=no
- build_old_libs=yes
- fi
- else
- # Don't allow undefined symbols.
- allow_undefined_flag="$no_undefined_flag"
- fi
-
- # Add libc to deplibs on all systems.
- dependency_libs="$deplibs"
- deplibs="$deplibs -lc"
-
- if test "$build_libtool_libs" = yes; then
- # Get the real and link names of the library.
- eval library_names=\"$library_names_spec\"
- set dummy $library_names
- realname="$2"
- shift; shift
-
- if test -n "$soname_spec"; then
- eval soname=\"$soname_spec\"
- else
- soname="$realname"
- fi
-
- lib="$objdir/$realname"
- for link
- do
- linknames="$linknames $link"
- done
-
- # Use standard objects if they are PIC.
- test -z "$pic_flag" && libobjs=`$echo "X$libobjs " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//g'`
-
- # Do each of the archive commands.
- eval cmds=\"$archive_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd" || exit $?
- done
- IFS="$save_ifs"
-
- # Create links to the real library.
- for linkname in $linknames; do
- $show "(cd $objdir && $LN_S $realname $linkname)"
- $run eval '(cd $objdir && $LN_S $realname $linkname)' || exit $?
- done
-
- # If -export-dynamic was specified, set the dlname.
- if test "$export_dynamic" = yes; then
- # On all known operating systems, these are identical.
- dlname="$soname"
- fi
- fi
-
- # Now set the variables for building old libraries.
- oldlib="$objdir/$libname.a"
- ;;
-
- *.lo | *.o)
- if test -n "$link_against_libtool_libs"; then
- $echo "$modename: error: cannot link libtool libraries into reloadable objects" 1>&2
- exit 1
- fi
-
- if test -n "$deplibs"; then
- $echo "$modename: warning: \`-l' and \`-L' are ignored while creating objects" 1>&2
- fi
-
- if test -n "$dlfiles$dlprefiles"; then
- $echo "$modename: warning: \`-dlopen' is ignored while creating objects" 1>&2
- # Nullify the symbol file.
- compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
- finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
- fi
-
- if test -n "$rpath"; then
- $echo "$modename: warning: \`-rpath' is ignored while creating objects" 1>&2
- fi
-
- if test -n "$vinfo"; then
- $echo "$modename: warning: \`-version-info' is ignored while creating objects" 1>&2
- fi
-
- if test -n "$release"; then
- $echo "$modename: warning: \`-release' is ignored while creating objects" 1>&2
- fi
-
- case "$output" in
- *.lo)
- if test -n "$objs"; then
- $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
- exit 1
- fi
- libobj="$output"
- obj=`$echo "X$output" | $Xsed -e 's/\.lo$/.o/'`
- ;;
- *)
- libobj=
- obj="$output"
- ;;
- esac
-
- # Delete the old objects.
- $run $rm $obj $libobj
-
- # Create the old-style object.
- reload_objs="$objs"`$echo "X$libobjs " | $Xsed -e 's/[^ ]*\.a //g' -e 's/\.lo /.o /g' -e 's/ $//g'`
-
- output="$obj"
- eval cmds=\"$reload_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd" || exit $?
- done
- IFS="$save_ifs"
-
- # Exit if we aren't doing a library object file.
- test -z "$libobj" && exit 0
-
- if test "$build_libtool_libs" != yes; then
- # Create an invalid libtool object if no PIC, so that we don't
- # accidentally link it into a program.
- $show "echo timestamp > $libobj"
- $run eval "echo timestamp > $libobj" || exit $?
- exit 0
- fi
-
- if test -n "$pic_flag"; then
- # Only do commands if we really have different PIC objects.
- reload_objs="$libobjs"
- output="$libobj"
- eval cmds=\"$reload_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd" || exit $?
- done
- IFS="$save_ifs"
- else
- # Just create a symlink.
- $show "$LN_S $obj $libobj"
- $run $LN_S $obj $libobj || exit 1
- fi
-
- exit 0
- ;;
-
- *)
- if test -n "$vinfo"; then
- $echo "$modename: warning: \`-version-info' is ignored while linking programs" 1>&2
- fi
-
- if test -n "$release"; then
- $echo "$modename: warning: \`-release' is ignored while creating objects" 1>&2
- fi
-
- if test -n "$rpath"; then
- # If the user specified any rpath flags, then add them.
- for libdir in $rpath; do
- if test -n "$hardcode_libdir_flag_spec"; then
- if test -n "$hardcode_libdir_separator"; then
- if test -z "$hardcode_libdirs"; then
- # Put the magic libdir with the hardcode flag.
- hardcode_libdirs="$libdir"
- libdir="@HARDCODE_LIBDIRS@"
- else
- # Just accumulate the unique libdirs.
- case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
- *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
- ;;
- *)
- hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
- ;;
- esac
- libdir=
- fi
- fi
-
- if test -n "$libdir"; then
- eval flag=\"$hardcode_libdir_flag_spec\"
-
- compile_command="$compile_command $flag"
- finalize_command="$finalize_command $flag"
- fi
- elif test -n "$runpath_var"; then
- case "$perm_rpath " in
- *" $libdir "*) ;;
- *) perm_rpath="$perm_rpath $libdir" ;;
- esac
- fi
- done
- fi
-
- # Substitute the hardcoded libdirs into the compile commands.
- if test -n "$hardcode_libdir_separator"; then
- compile_command=`$echo "X$compile_command" | $Xsed -e "s%@HARDCODE_LIBDIRS@%$hardcode_libdirs%g"`
- finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@HARDCODE_LIBDIRS@%$hardcode_libdirs%g"`
- fi
-
- if test -n "$libobjs" && test "$build_old_libs" = yes; then
- # Transform all the library objects into standard objects.
- compile_command=`$echo "X$compile_command " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
- finalize_command=`$echo "X$finalize_command " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
- fi
-
- if test "$export_dynamic" = yes && test -n "$NM" && test -n "$global_symbol_pipe"; then
- dlsyms="${output}S.c"
- else
- dlsyms=
- fi
-
- if test -n "$dlsyms"; then
- # Add our own program objects to the preloaded list.
- dlprefiles=`$echo "X$objs$dlprefiles " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
-
- # Discover the nlist of each of the dlfiles.
- nlist="$objdir/${output}.nm"
-
- if test -d $objdir; then
- $show "$rm $nlist ${nlist}T"
- $run $rm "$nlist" "${nlist}T"
- else
- $show "$mkdir $objdir"
- $run $mkdir $objdir
- status=$?
- if test $status -eq 0 || test -d $objdir; then :
- else
- exit $status
- fi
- fi
-
- for arg in $dlprefiles; do
- $show "extracting global C symbols from \`$arg'"
- $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
- done
-
- # Parse the name list into a source file.
- $show "creating $objdir/$dlsyms"
- if test -z "$run"; then
- # Make sure we at least have an empty file.
- test -f "$nlist" || : > "$nlist"
-
- # Try sorting and uniquifying the output.
- if sort "$nlist" | uniq > "$nlist"T; then
- mv -f "$nlist"T "$nlist"
- wcout=`wc "$nlist" 2>/dev/null`
- count=`echo "X$wcout" | $Xsed -e 's/^[ ]*\([0-9][0-9]*\).*$/\1/'`
- (test "$count" -ge 0) 2>/dev/null || count=-1
- else
- $rm "$nlist"T
- count=-1
- fi
-
- case "$dlsyms" in
- "") ;;
- *.c)
- $echo > "$objdir/$dlsyms" "\
-/* $dlsyms - symbol resolution table for \`$output' dlsym emulation. */
-/* Generated by $PROGRAM - GNU $PACKAGE $VERSION */
-
-#ifdef __cplusplus
-extern \"C\" {
-#endif
-
-/* Prevent the only kind of declaration conflicts we can make. */
-#define dld_preloaded_symbol_count some_other_symbol
-#define dld_preloaded_symbols some_other_symbol
-
-/* External symbol declarations for the compiler. */\
-"
-
- if test -f "$nlist"; then
- sed -e 's/^.* \(.*\)$/extern char \1;/' < "$nlist" >> "$objdir/$dlsyms"
- else
- echo '/* NONE */' >> "$objdir/$dlsyms"
- fi
-
- $echo >> "$objdir/$dlsyms" "\
-
-#undef dld_preloaded_symbol_count
-#undef dld_preloaded_symbols
-
-#if defined (__STDC__) && __STDC__
-# define __ptr_t void *
-#else
-# define __ptr_t char *
-#endif
-
-/* The number of symbols in dld_preloaded_symbols, -1 if unsorted. */
-int dld_preloaded_symbol_count = $count;
-
-/* The mapping between symbol names and symbols. */
-struct {
- char *name;
- __ptr_t address;
-}
-dld_preloaded_symbols[] =
-{\
-"
-
- if test -f "$nlist"; then
- sed 's/^\(.*\) \(.*\)$/ {"\1", (__ptr_t) \&\2},/' < "$nlist" >> "$objdir/$dlsyms"
- fi
-
- $echo >> "$objdir/$dlsyms" "\
- {0, (__ptr_t) 0}
-};
-
-#ifdef __cplusplus
-}
-#endif\
-"
- ;;
-
- *)
- $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
- exit 1
- ;;
- esac
- fi
-
- # Now compile the dynamic symbol file.
- $show "(cd $objdir && $CC -c$no_builtin_flag \"$dlsyms\")"
- $run eval '(cd $objdir && $CC -c$no_builtin_flag "$dlsyms")' || exit $?
-
- # Transform the symbol file into the correct name.
- compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$objdir/${output}S.o%"`
- finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$objdir/${output}S.o%"`
- elif test "$export_dynamic" != yes; then
- test -n "$dlfiles$dlprefiles" && $echo "$modename: warning: \`-dlopen' and \`-dlpreopen' are ignored without \`-export-dynamic'" 1>&2
- else
- # We keep going just in case the user didn't refer to
- # dld_preloaded_symbols. The linker will fail if global_symbol_pipe
- # really was required.
- $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
-
- # Nullify the symbol file.
- compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
- finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
- fi
-
- if test -z "$link_against_libtool_libs" || test "$build_libtool_libs" != yes; then
- # Replace the output file specification.
- compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
- finalize_command=`$echo "X$finalize_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
-
- # We have no uninstalled library dependencies, so finalize right now.
- $show "$compile_command"
- $run eval "$compile_command"
- exit $?
- fi
-
- # Replace the output file specification.
- compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$objdir/$output"'%g'`
- finalize_command=`$echo "X$finalize_command" | $Xsed -e 's%@OUTPUT@%'"$objdir/$output"'T%g'`
-
- # Create the binary in the object directory, then wrap it.
- if test -d $objdir; then :
- else
- $show "$mkdir $objdir"
- $run $mkdir $objdir
- status=$?
- if test $status -eq 0 || test -d $objdir; then :
- else
- exit $status
- fi
- fi
-
- if test -n "$shlibpath_var"; then
- # We should set the shlibpath_var
- rpath=
- for dir in $temp_rpath; do
- case "$dir" in
- /* | [A-Za-z]:\\*)
- # Absolute path.
- rpath="$rpath$dir:"
- ;;
- *)
- # Relative path: add a thisdir entry.
- rpath="$rpath\$thisdir/$dir:"
- ;;
- esac
- done
- temp_rpath="$rpath"
- fi
-
- # Delete the old output file.
- $run $rm $output
-
- if test -n "$compile_shlibpath"; then
- compile_command="$shlibpath_var=\"$compile_shlibpath\$$shlibpath_var\" $compile_command"
- fi
- if test -n "$finalize_shlibpath"; then
- finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
- fi
-
- if test -n "$runpath_var" && test -n "$perm_rpath"; then
- # We should set the runpath_var.
- rpath=
- for dir in $perm_rpath; do
- rpath="$rpath$dir:"
- done
- compile_command="$runpath_var=\"$rpath\$$runpath_var\" $compile_command"
- finalize_command="$runpath_var=\"$rpath\$$runpath_var\" $finalize_command"
- fi
-
- case "$hardcode_action" in
- relink)
- # AGH! Flame the AIX and HP-UX people for me, will ya?
- $echo "$modename: warning: using a buggy system linker" 1>&2
- $echo "$modename: relinking will be required before \`$output' can be installed" 1>&2
- ;;
- esac
-
- $show "$compile_command"
- $run eval "$compile_command" || exit $?
-
- # Now create the wrapper script.
- $show "creating $output"
-
- # Quote the finalize command for shipping.
- finalize_command=`$echo "X$finalize_command" | $Xsed -e "$sed_quote_subst"`
-
- # Quote $echo for shipping.
- qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
-
- # Only actually do things if our run command is non-null.
- if test -z "$run"; then
- $rm $output
- trap "$rm $output; exit 1" 1 2 15
-
- $echo > $output "\
-#! /bin/sh
-
-# $output - temporary wrapper script for $objdir/$output
-# Generated by ltmain.sh - GNU $PACKAGE $VERSION
-#
-# The $output program cannot be directly executed until all the libtool
-# libraries that it depends on are installed.
-#
-# This wrapper script should never be moved out of \``pwd`'.
-# If it is, it will not operate correctly.
-
-# Sed substitution that helps us do robust quoting. It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
-sed_quote_subst='$sed_quote_subst'
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test \"\${CDPATH+set}\" = set; then CDPATH=; export CDPATH; fi
-
-# This environment variable determines our operation mode.
-if test \"\$libtool_install_magic\" = \"$magic\"; then
- # install mode needs the following variables:
- link_against_libtool_libs='$link_against_libtool_libs'
- finalize_command=\"$finalize_command\"
-else
- # When we are sourced in execute mode, \$file and \$echo are already set.
- if test \"\$libtool_execute_magic\" = \"$magic\"; then :
- else
- echo=\"$qecho\"
- file=\"\$0\"
- fi\
-"
- $echo >> $output "\
-
- # Find the directory that this script lives in.
- thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\`
- test \"x\$thisdir\" = \"x\$file\" && thisdir=.
-
- # Follow symbolic links until we get to the real thisdir.
- file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\`
- while test -n \"\$file\"; do
- destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\`
-
- # If there was a directory component, then change thisdir.
- if test \"x\$destdir\" != \"x\$file\"; then
- case \"\$destdir\" in
- /* | [A-Za-z]:\\*) thisdir=\"\$destdir\" ;;
- *) thisdir=\"\$thisdir/\$destdir\" ;;
- esac
- fi
-
- file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\`
- file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\`
- done
-
- # Try to get the absolute directory name.
- absdir=\`cd \"\$thisdir\" && pwd\`
- test -n \"\$absdir\" && thisdir=\"\$absdir\"
-
- progdir=\"\$thisdir/$objdir\"
- program='$output'
-
- if test -f \"\$progdir/\$program\"; then"
-
- # Export our shlibpath_var if we have one.
- if test -n "$shlibpath_var" && test -n "$temp_rpath"; then
- $echo >> $output "\
- # Add our own library path to $shlibpath_var
- $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
-
- # Some systems cannot cope with colon-terminated $shlibpath_var
- $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/:*\$//'\`
-
- export $shlibpath_var
-"
- fi
-
- $echo >> $output "\
- if test \"\$libtool_execute_magic\" != \"$magic\"; then
- # Run the actual program with our arguments.
-
- # Export the path to the program.
- PATH=\"\$progdir:\$PATH\"
- export PATH
-
- exec \$program \${1+\"\$@\"}
-
- \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\"
- exit 1
- fi
- else
- # The program doesn't exist.
- \$echo \"\$0: error: \$progdir/\$program does not exist\" 1>&2
- \$echo \"This script is just a wrapper for \$program.\" 1>&2
- echo \"See the $PACKAGE documentation for more information.\" 1>&2
- exit 1
- fi
-fi\
-"
- chmod +x $output
- fi
- exit 0
- ;;
- esac
-
- # See if we need to build an old-fashioned archive.
- if test "$build_old_libs" = "yes"; then
- # Transform .lo files to .o files.
- oldobjs="$objs"`$echo "X$libobjs " | $Xsed -e 's/[^ ]*\.a //g' -e 's/\.lo /.o /g' -e 's/ $//g'`
-
- # Do each command in the archive commands.
- if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
- eval cmds=\"$old_archive_from_new_cmds\"
- else
- eval cmds=\"$old_archive_cmds\"
- fi
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd" || exit $?
- done
- IFS="$save_ifs"
- fi
-
- # Now create the libtool archive.
- case "$output" in
- *.la)
- old_library=
- test "$build_old_libs" = yes && old_library="$libname.a"
-
- $show "creating $output"
-
- # Only create the output if not a dry run.
- if test -z "$run"; then
- $echo > $output "\
-# $output - a libtool library file
-# Generated by ltmain.sh - GNU $PACKAGE $VERSION
-
-# The name that we can dlopen(3).
-dlname='$dlname'
-
-# Names of this library.
-library_names='$library_names'
-
-# The name of the static archive.
-old_library='$old_library'
-
-# Libraries that this one depends upon.
-dependency_libs='$dependency_libs'
-
-# Version information for $libname.
-current=$current
-age=$age
-revision=$revision
-
-# Directory that this library needs to be installed in:
-libdir='$install_libdir'\
-"
- fi
-
- # Do a symbolic link so that the libtool archive can be found in
- # LD_LIBRARY_PATH before the program is installed.
- $show "(cd $objdir && $LN_S ../$output $output)"
- $run eval "(cd $objdir && $LN_S ../$output $output)" || exit 1
- ;;
- esac
- exit 0
- ;;
-
- # libtool install mode
- install)
- modename="$modename: install"
-
- # There may be an optional /bin/sh argument at the beginning of
- # install_prog (especially on Windows NT).
- if test "$nonopt" = "$SHELL"; then
- # Aesthetically quote it.
- arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"`
- case "$arg" in
- *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
- arg="\"$arg\""
- ;;
- esac
- install_prog="$arg "
- arg="$1"
- shift
- else
- install_prog=
- arg="$nonopt"
- fi
-
- # The real first argument should be the name of the installation program.
- # Aesthetically quote it.
- arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
- case "$arg" in
- *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
- arg="\"$arg\""
- ;;
- esac
- install_prog="$install_prog$arg"
-
- # We need to accept at least all the BSD install flags.
- dest=
- files=
- opts=
- prev=
- install_type=
- isdir=
- stripme=
- for arg
- do
- if test -n "$dest"; then
- files="$files $dest"
- dest="$arg"
- continue
- fi
-
- case "$arg" in
- -d) isdir=yes ;;
- -f) prev="-f" ;;
- -g) prev="-g" ;;
- -m) prev="-m" ;;
- -o) prev="-o" ;;
- -s)
- stripme=" -s"
- continue
- ;;
- -*) ;;
-
- *)
- # If the previous option needed an argument, then skip it.
- if test -n "$prev"; then
- prev=
- else
- dest="$arg"
- continue
- fi
- ;;
- esac
-
- # Aesthetically quote the argument.
- arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
- case "$arg" in
- *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
- arg="\"$arg\""
- ;;
- esac
- install_prog="$install_prog $arg"
- done
-
- if test -z "$install_prog"; then
- $echo "$modename: you must specify an install program" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- if test -n "$prev"; then
- $echo "$modename: the \`$prev' option requires an argument" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- if test -z "$files"; then
- if test -z "$dest"; then
- $echo "$modename: no file or destination specified" 1>&2
- else
- $echo "$modename: you must specify a destination" 1>&2
- fi
- $echo "$help" 1>&2
- exit 1
- fi
-
- # Strip any trailing slash from the destination.
- dest=`$echo "X$dest" | $Xsed -e 's%/$%%'`
-
- # Check to see that the destination is a directory.
- test -d "$dest" && isdir=yes
- if test -n "$isdir"; then
- destdir="$dest"
- destname=
- else
- destdir=`$echo "X$dest" | $Xsed -e 's%/[^/]*$%%'`
- test "X$destdir" = "X$dest" && destdir=.
- destname=`$echo "X$dest" | $Xsed -e 's%^.*/%%'`
-
- # Not a directory, so check to see that there is only one file specified.
- set dummy $files
- if test $# -gt 2; then
- $echo "$modename: \`$dest' is not a directory" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
- fi
- case "$destdir" in
- /* | [A-Za-z]:\\*) ;;
- *)
- for file in $files; do
- case "$file" in
- *.lo) ;;
- *)
- $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
- $echo "$help" 1>&2
- exit 1
- ;;
- esac
- done
- ;;
- esac
-
- # This variable tells wrapper scripts just to set variables rather
- # than running their programs.
- libtool_install_magic="$magic"
-
- staticlibs=
- future_libdirs=
- current_libdirs=
- for file in $files; do
-
- # Do each installation.
- case "$file" in
- *.a)
- # Do the static libraries later.
- staticlibs="$staticlibs $file"
- ;;
-
- *.la)
- # Check to see that this really is a libtool archive.
- if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
- else
- $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- library_names=
- old_library=
- # If there is no directory component, then add one.
- case "$file" in
- */* | *\\*) . $file ;;
- *) . ./$file ;;
- esac
-
- # Add the libdir to current_libdirs if it is the destination.
- if test "X$destdir" = "X$libdir"; then
- case "$current_libdirs " in
- *" $libdir "*) ;;
- *) current_libdirs="$current_libdirs $libdir" ;;
- esac
- else
- # Note the libdir as a future libdir.
- case "$future_libdirs " in
- *" $libdir "*) ;;
- *) future_libdirs="$future_libdirs $libdir" ;;
- esac
- fi
-
- dir="`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/"
- test "X$dir" = "X$file/" && dir=
- dir="$dir$objdir"
-
- # See the names of the shared library.
- set dummy $library_names
- if test -n "$2"; then
- realname="$2"
- shift
- shift
-
- # Install the shared library and build the symlinks.
- $show "$install_prog $dir/$realname $destdir/$realname"
- $run eval "$install_prog $dir/$realname $destdir/$realname" || exit $?
- test "X$dlname" = "X$realname" && dlname=
-
- if test $# -gt 0; then
- # Delete the old symlinks.
- rmcmd="$rm"
- for linkname
- do
- rmcmd="$rmcmd $destdir/$linkname"
- done
- $show "$rmcmd"
- $run $rmcmd
-
- # ... and create new ones.
- for linkname
- do
- test "X$dlname" = "X$linkname" && dlname=
- $show "(cd $destdir && $LN_S $realname $linkname)"
- $run eval "(cd $destdir && $LN_S $realname $linkname)"
- done
- fi
-
- if test -n "$dlname"; then
- # Install the dynamically-loadable library.
- $show "$install_prog $dir/$dlname $destdir/$dlname"
- $run eval "$install_prog $dir/$dlname $destdir/$dlname" || exit $?
- fi
-
- # Do each command in the postinstall commands.
- lib="$destdir/$realname"
- eval cmds=\"$postinstall_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd" || exit $?
- done
- IFS="$save_ifs"
- fi
-
- # Install the pseudo-library for information purposes.
- name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
- $show "$install_prog $file $destdir/$name"
- $run eval "$install_prog $file $destdir/$name" || exit $?
-
- # Maybe install the static library, too.
- test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
- ;;
-
- *.lo)
- # Install (i.e. copy) a libtool object.
-
- # Figure out destination file name, if it wasn't already specified.
- if test -n "$destname"; then
- destfile="$destdir/$destname"
- else
- destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
- destfile="$destdir/$destfile"
- fi
-
- # Deduce the name of the destination old-style object file.
- case "$destfile" in
- *.lo)
- staticdest=`$echo "X$destfile" | $Xsed -e 's/\.lo$/\.o/'`
- ;;
- *.o)
- staticdest="$destfile"
- destfile=
- ;;
- *)
- $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
- $echo "$help" 1>&2
- exit 1
- ;;
- esac
-
- # Install the libtool object if requested.
- if test -n "$destfile"; then
- $show "$install_prog $file $destfile"
- $run eval "$install_prog $file $destfile" || exit $?
- fi
-
- # Install the old object if enabled.
- if test "$build_old_libs" = yes; then
- # Deduce the name of the old-style object file.
- staticobj=`$echo "X$file" | $Xsed -e 's/\.lo$/\.o/'`
-
- $show "$install_prog $staticobj $staticdest"
- $run eval "$install_prog \$staticobj \$staticdest" || exit $?
- fi
- exit 0
- ;;
-
- *)
- # Do a test to see if this is really a libtool program.
- if (sed -e '4q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
- link_against_libtool_libs=
- finalize_command=
-
- # If there is no directory component, then add one.
- case "$file" in
- */* | *\\*) . $file ;;
- *) . ./$file ;;
- esac
-
- # Check the variables that should have been set.
- if test -z "$link_against_libtool_libs" || test -z "$finalize_command"; then
- $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2
- exit 1
- fi
-
- finalize=yes
- for lib in $link_against_libtool_libs; do
- # Check to see that each library is installed.
- libdir=
- if test -f "$lib"; then
- # If there is no directory component, then add one.
- case "$lib" in
- */* | *\\*) . $lib ;;
- *) . ./$lib ;;
- esac
- fi
- libfile="$libdir/`$echo "X$lib" | $Xsed -e 's%^.*/%%g'`"
- if test -z "$libdir"; then
- $echo "$modename: warning: \`$lib' contains no -rpath information" 1>&2
- elif test -f "$libfile"; then :
- else
- $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
- finalize=no
- fi
- done
-
- if test "$hardcode_action" = relink; then
- if test "$finalize" = yes; then
- $echo "$modename: warning: relinking \`$file' on behalf of your buggy system linker" 1>&2
- $show "$finalize_command"
- if $run eval "$finalize_command"; then :
- else
- $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
- continue
- fi
- file="$objdir/$file"T
- else
- $echo "$modename: warning: cannot relink \`$file' on behalf of your buggy system linker" 1>&2
- fi
- else
- # Install the binary that we compiled earlier.
- file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"`
- fi
- fi
-
- $show "$install_prog$stripme $file $dest"
- $run eval "$install_prog\$stripme \$file \$dest" || exit $?
- ;;
- esac
- done
-
- for file in $staticlibs; do
- name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-
- # Set up the ranlib parameters.
- oldlib="$destdir/$name"
-
- $show "$install_prog $file $oldlib"
- $run eval "$install_prog \$file \$oldlib" || exit $?
-
- # Do each command in the postinstall commands.
- eval cmds=\"$old_postinstall_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd" || exit $?
- done
- IFS="$save_ifs"
- done
-
- if test -n "$future_libdirs"; then
- $echo "$modename: warning: remember to run \`$progname --finish$future_libdirs'" 1>&2
- fi
-
- if test -n "$current_libdirs"; then
- # Maybe just do a dry run.
- test -n "$run" && current_libdirs=" -n$current_libdirs"
- exec $SHELL $0 --finish$current_libdirs
- exit 1
- fi
-
- exit 0
- ;;
-
- # libtool finish mode
- finish)
- modename="$modename: finish"
- libdirs="$nonopt"
-
- if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
- for dir
- do
- libdirs="$libdirs $dir"
- done
-
- for libdir in $libdirs; do
- if test -n "$finish_cmds"; then
- # Do each command in the finish commands.
- eval cmds=\"$finish_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd"
- done
- IFS="$save_ifs"
- fi
- if test -n "$finish_eval"; then
- # Do the single finish_eval.
- eval cmds=\"$finish_eval\"
- $run eval "$cmds"
- fi
- done
- fi
-
- echo "------------------------------------------------------------------------------"
- echo "Libraries have been installed in:"
- for libdir in $libdirs; do
- echo " $libdir"
- done
- echo
- echo "To link against installed libraries in a given directory, LIBDIR,"
- echo "you must use the \`-LLIBDIR' flag during linking."
- echo
- echo " You will also need to do one of the following:"
- if test -n "$shlibpath_var"; then
- echo " - add LIBDIR to the \`$shlibpath_var' environment variable"
- echo " during execution"
- fi
- if test -n "$runpath_var"; then
- echo " - add LIBDIR to the \`$runpath_var' environment variable"
- echo " during linking"
- fi
- if test -n "$hardcode_libdir_flag_spec"; then
- libdir=LIBDIR
- eval flag=\"$hardcode_libdir_flag_spec\"
-
- echo " - use the \`$flag' linker flag"
- fi
- if test -f /etc/ld.so.conf; then
- echo " - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
- fi
- echo
- echo "See any operating system documentation about shared libraries for"
- echo "more information, such as the ld(1) and ld.so(8) manual pages."
- echo "------------------------------------------------------------------------------"
- exit 0
- ;;
-
- # libtool execute mode
- execute)
- modename="$modename: execute"
-
- # The first argument is the command name.
- cmd="$nonopt"
- if test -z "$cmd"; then
- $echo "$modename: you must specify a COMMAND" 1>&2
- $echo "$help"
- exit 1
- fi
-
- # Handle -dlopen flags immediately.
- for file in $execute_dlfiles; do
- if test -f "$file"; then :
- else
- $echo "$modename: \`$file' is not a file" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- dir=
- case "$file" in
- *.la)
- # Check to see that this really is a libtool archive.
- if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
- else
- $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- # Read the libtool library.
- dlname=
- library_names=
-
- # If there is no directory component, then add one.
- case "$file" in
- */* | *\\*) . $file ;;
- *) . ./$file ;;
- esac
-
- # Skip this library if it cannot be dlopened.
- if test -z "$dlname"; then
- # Warn if it was a shared library.
- test -n "$library_names" && $echo "$modename: warning: \`$file' was not linked with \`-export-dynamic'"
- continue
- fi
-
- dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
- test "X$dir" = "X$file" && dir=.
-
- if test -f "$dir/$objdir/$dlname"; then
- dir="$dir/$objdir"
- else
- $echo "$modename: cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'" 1>&2
- exit 1
- fi
- ;;
-
- *.lo)
- # Just add the directory containing the .lo file.
- dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
- test "X$dir" = "X$file" && dir=.
- ;;
-
- *)
- $echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2
- continue
- ;;
- esac
-
- # Get the absolute pathname.
- absdir=`cd "$dir" && pwd`
- test -n "$absdir" && dir="$absdir"
-
- # Now add the directory to shlibpath_var.
- if eval "test -z \"\$$shlibpath_var\""; then
- eval "$shlibpath_var=\"\$dir\""
- else
- eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
- fi
- done
-
- # This variable tells wrapper scripts just to set shlibpath_var
- # rather than running their programs.
- libtool_execute_magic="$magic"
-
- # Check if any of the arguments is a wrapper script.
- args=
- for file
- do
- case "$file" in
- -*) ;;
- *)
- # Do a test to see if this is really a libtool program.
- if (sed -e '4q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
- # If there is no directory component, then add one.
- case "$file" in
- */* | *\\*) . $file ;;
- *) . ./$file ;;
- esac
-
- # Transform arg to wrapped name.
- file="$progdir/$program"
- fi
- ;;
- esac
- # Quote arguments (to preserve shell metacharacters).
- file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"`
- args="$args \"$file\""
- done
-
- if test -z "$run"; then
- # Export the shlibpath_var.
- eval "export $shlibpath_var"
-
- # Now actually exec the command.
- eval "exec \$cmd$args"
-
- $echo "$modename: cannot exec \$cmd$args"
- exit 1
- else
- # Display what would be done.
- eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
- $echo "export $shlibpath_var"
- $echo "$cmd$args"
- exit 0
- fi
- ;;
-
- # libtool uninstall mode
- uninstall)
- modename="$modename: uninstall"
- rm="$nonopt"
- files=
-
- for arg
- do
- case "$arg" in
- -*) rm="$rm $arg" ;;
- *) files="$files $arg" ;;
- esac
- done
-
- if test -z "$rm"; then
- $echo "$modename: you must specify an RM program" 1>&2
- $echo "$help" 1>&2
- exit 1
- fi
-
- for file in $files; do
- dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
- test "X$dir" = "X$file" && dir=.
- name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-
- rmfiles="$file"
-
- case "$name" in
- *.la)
- # Possibly a libtool archive, so verify it.
- if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
- . $dir/$name
-
- # Delete the libtool libraries and symlinks.
- for n in $library_names; do
- rmfiles="$rmfiles $dir/$n"
- test "X$n" = "X$dlname" && dlname=
- done
- test -n "$dlname" && rmfiles="$rmfiles $dir/$dlname"
- test -n "$old_library" && rmfiles="$rmfiles $dir/$old_library"
-
- $show "$rm $rmfiles"
- $run $rm $rmfiles
-
- if test -n "$library_names"; then
- # Do each command in the postuninstall commands.
- eval cmds=\"$postuninstall_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd"
- done
- IFS="$save_ifs"
- fi
-
- if test -n "$old_library"; then
- # Do each command in the old_postuninstall commands.
- eval cmds=\"$old_postuninstall_cmds\"
- IFS="${IFS= }"; save_ifs="$IFS"; IFS=';'
- for cmd in $cmds; do
- IFS="$save_ifs"
- $show "$cmd"
- $run eval "$cmd"
- done
- IFS="$save_ifs"
- fi
-
- # FIXME: should reinstall the best remaining shared library.
- fi
- ;;
-
- *.lo)
- if test "$build_old_libs" = yes; then
- oldobj=`$echo "X$name" | $Xsed -e 's/\.lo$/\.o/'`
- rmfiles="$rmfiles $dir/$oldobj"
- fi
- $show "$rm $rmfiles"
- $run $rm $rmfiles
- ;;
-
- *)
- $show "$rm $rmfiles"
- $run $rm $rmfiles
- ;;
- esac
- done
- exit 0
- ;;
-
- "")
- $echo "$modename: you must specify a MODE" 1>&2
- $echo "$generic_help" 1>&2
- exit 1
- ;;
- esac
-
- $echo "$modename: invalid operation mode \`$mode'" 1>&2
- $echo "$generic_help" 1>&2
- exit 1
-fi # test -z "$show_help"
-
-# We need to display help for each of the modes.
-case "$mode" in
-"") $echo \
-"Usage: $modename [OPTION]... [MODE-ARG]...
-
-Provide generalized library-building support services.
-
--n, --dry-run display commands without modifying any files
- --features display configuration information and exit
- --finish same as \`--mode=finish'
- --help display this help message and exit
- --mode=MODE use operation mode MODE [default=inferred from MODE-ARGS]
- --quiet same as \`--silent'
- --silent don't print informational messages
- --version print version information
-
-MODE must be one of the following:
-
- compile compile a source file into a libtool object
- execute automatically set library path, then run a program
- finish complete the installation of libtool libraries
- install install libraries or executables
- link create a library or an executable
- uninstall remove libraries from an installed directory
-
-MODE-ARGS vary depending on the MODE. Try \`$modename --help --mode=MODE' for
-a more detailed description of MODE."
- exit 0
- ;;
-
-compile)
- $echo \
-"Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
-
-Compile a source file into a libtool library object.
-
-COMPILE-COMMAND is a command to be used in creating a \`standard' object file
-from the given SOURCEFILE.
-
-The output file name is determined by removing the directory component from
-SOURCEFILE, then substituting the C source code suffix \`.c' with the
-library object suffix, \`.lo'."
- ;;
-
-execute)
- $echo \
-"Usage: $modename [OPTION]... --mode=execute COMMAND [ARGS]...
-
-Automatically set library path, then run a program.
-
-This mode accepts the following additional options:
-
- -dlopen FILE add the directory containing FILE to the library path
-
-This mode sets the library path environment variable according to \`-dlopen'
-flags.
-
-If any of the ARGS are libtool executable wrappers, then they are translated
-into their corresponding uninstalled binary, and any of their required library
-directories are added to the library path.
-
-Then, COMMAND is executed, with ARGS as arguments."
- ;;
-
-finish)
- $echo \
-"Usage: $modename [OPTION]... --mode=finish [LIBDIR]...
-
-Complete the installation of libtool libraries.
-
-Each LIBDIR is a directory that contains libtool libraries.
-
-The commands that this mode executes may require superuser privileges. Use
-the \`--dry-run' option if you just want to see what would be executed."
- ;;
-
-install)
- $echo \
-"Usage: $modename [OPTION]... --mode=install INSTALL-COMMAND...
-
-Install executables or libraries.
-
-INSTALL-COMMAND is the installation command. The first component should be
-either the \`install' or \`cp' program.
-
-The rest of the components are interpreted as arguments to that command (only
-BSD-compatible install options are recognized)."
- ;;
-
-link)
- $echo \
-"Usage: $modename [OPTION]... --mode=link LINK-COMMAND...
-
-Link object files or libraries together to form another library, or to
-create an executable program.
-
-LINK-COMMAND is a command using the C compiler that you would use to create
-a program from several object files.
-
-The following components of LINK-COMMAND are treated specially:
-
- -all-static do not do any dynamic linking at all
- -dlopen FILE \`-dlpreopen' FILE if it cannot be dlopened at runtime
- -dlpreopen FILE link in FILE and add its symbols to dld_preloaded_symbols
- -export-dynamic allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
- -LLIBDIR search LIBDIR for required installed libraries
- -lNAME OUTPUT-FILE requires the installed library libNAME
- -no-undefined declare that a library does not refer to external symbols
- -o OUTPUT-FILE create OUTPUT-FILE from the specified objects
- -release RELEASE specify package release information
- -rpath LIBDIR the created library will eventually be installed in LIBDIR
- -static do not do any dynamic linking of libtool libraries
- -version-info CURRENT[:REVISION[:AGE]]
- specify library version info [each variable defaults to 0]
-
-All other options (arguments beginning with \`-') are ignored.
-
-Every other argument is treated as a filename. Files ending in \`.la' are
-treated as uninstalled libtool libraries, other files are standard or library
-object files.
-
-If the OUTPUT-FILE ends in \`.la', then a libtool library is created, only
-library objects (\`.lo' files) may be specified, and \`-rpath' is required.
-
-If OUTPUT-FILE ends in \`.a', then a standard library is created using \`ar'
-and \`ranlib'.
-
-If OUTPUT-FILE ends in \`.lo' or \`.o', then a reloadable object file is
-created, otherwise an executable program is created."
- ;;
-
-uninstall)
- $echo
-"Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
-
-Remove libraries from an installation directory.
-
-RM is the name of the program to use to delete files associated with each FILE
-(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed
-to RM.
-
-If FILE is a libtool library, all the files associated with it are deleted.
-Otherwise, only FILE itself is deleted using RM."
- ;;
-
-*)
- $echo "$modename: invalid operation mode \`$mode'" 1>&2
- $echo "$help" 1>&2
- exit 1
- ;;
-esac
-
-echo
-$echo "Try \`$modename --help' for more information about other modes."
-
-exit 0
-
-# Local Variables:
-# mode:shell-script
-# sh-indentation:2
-# End:
diff --git a/makcjpeg.st b/makcjpeg.st
deleted file mode 100644
index fc72c89..0000000
--- a/makcjpeg.st
+++ /dev/null
@@ -1,38 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to cjpeg.prj.
-; If you are using Turbo C, change filenames beginning with "pc..." to "tc..."
-; Read installation instructions before trying to make the program!
-;
-;
-; * * * Output file * * *
-cjpeg.ttp
-;
-; * * * COMPILER OPTIONS * * *
-.C[-P] ; absolute calls
-.C[-M] ; and no string merging, folks
-.C[-w-cln] ; no "constant is long" warnings
-.C[-w-par] ; no "parameter xxxx unused"
-.C[-w-rch] ; no "unreachable code"
-.C[-wsig] ; warn if significant digits may be lost
-=
-; * * * * List of modules * * * *
-pcstart.o
-cjpeg.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h,jversion.h)
-cdjpeg.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdswitch.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdppm.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdgif.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdtarga.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdbmp.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdrle.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-libjpeg.lib ; built by libjpeg.prj
-pcfltlib.lib ; floating point library
-; the float library can be omitted if you've turned off DCT_FLOAT_SUPPORTED
-pcstdlib.lib ; standard library
-pcextlib.lib ; extended library
diff --git a/makdjpeg.st b/makdjpeg.st
deleted file mode 100644
index 3226726..0000000
--- a/makdjpeg.st
+++ /dev/null
@@ -1,38 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to djpeg.prj.
-; If you are using Turbo C, change filenames beginning with "pc..." to "tc..."
-; Read installation instructions before trying to make the program!
-;
-;
-; * * * Output file * * *
-djpeg.ttp
-;
-; * * * COMPILER OPTIONS * * *
-.C[-P] ; absolute calls
-.C[-M] ; and no string merging, folks
-.C[-w-cln] ; no "constant is long" warnings
-.C[-w-par] ; no "parameter xxxx unused"
-.C[-w-rch] ; no "unreachable code"
-.C[-wsig] ; warn if significant digits may be lost
-=
-; * * * * List of modules * * * *
-pcstart.o
-djpeg.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h,jversion.h)
-cdjpeg.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdcolmap.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrppm.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrgif.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrtarga.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrbmp.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-wrrle.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-libjpeg.lib ; built by libjpeg.prj
-pcfltlib.lib ; floating point library
-; the float library can be omitted if you've turned off DCT_FLOAT_SUPPORTED
-pcstdlib.lib ; standard library
-pcextlib.lib ; extended library
diff --git a/makeapps.ds b/makeapps.ds
deleted file mode 100644
index bedd038..0000000
--- a/makeapps.ds
+++ /dev/null
@@ -1,828 +0,0 @@
-# Microsoft Developer Studio Generated NMAKE File, Format Version 4.20
-# ** DO NOT EDIT **
-
-# TARGTYPE "Win32 (x86) Console Application" 0x0103
-
-!IF "$(CFG)" == ""
-CFG=cjpeg - Win32
-!MESSAGE No configuration specified. Defaulting to cjpeg - Win32.
-!ENDIF
-
-!IF "$(CFG)" != "cjpeg - Win32" && "$(CFG)" != "djpeg - Win32" &&\
- "$(CFG)" != "jpegtran - Win32" && "$(CFG)" != "rdjpgcom - Win32" &&\
- "$(CFG)" != "wrjpgcom - Win32"
-!MESSAGE Invalid configuration "$(CFG)" specified.
-!MESSAGE You can specify a configuration when running NMAKE on this makefile
-!MESSAGE by defining the macro CFG on the command line. For example:
-!MESSAGE
-!MESSAGE NMAKE /f "apps.mak" CFG="cjpeg - Win32"
-!MESSAGE
-!MESSAGE Possible choices for configuration are:
-!MESSAGE
-!MESSAGE "cjpeg - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "djpeg - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "jpegtran - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "rdjpgcom - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE "wrjpgcom - Win32" (based on "Win32 (x86) Console Application")
-!MESSAGE
-!ERROR An invalid configuration is specified.
-!ENDIF
-
-!IF "$(OS)" == "Windows_NT"
-NULL=
-!ELSE
-NULL=nul
-!ENDIF
-################################################################################
-# Begin Project
-# PROP Target_Last_Scanned "cjpeg - Win32"
-CPP=cl.exe
-RSC=rc.exe
-
-!IF "$(CFG)" == "cjpeg - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "cjpeg\Release"
-# PROP BASE Intermediate_Dir "cjpeg\Release"
-# PROP BASE Target_Dir "cjpeg"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "cjpeg\Release"
-# PROP Intermediate_Dir "cjpeg\Release"
-# PROP Target_Dir "cjpeg"
-OUTDIR=.\cjpeg\Release
-INTDIR=.\cjpeg\Release
-
-ALL : "$(OUTDIR)\cjpeg.exe"
-
-CLEAN :
- -@erase "$(INTDIR)\cjpeg.obj"
- -@erase "$(INTDIR)\rdppm.obj"
- -@erase "$(INTDIR)\rdgif.obj"
- -@erase "$(INTDIR)\rdtarga.obj"
- -@erase "$(INTDIR)\rdrle.obj"
- -@erase "$(INTDIR)\rdbmp.obj"
- -@erase "$(INTDIR)\rdswitch.obj"
- -@erase "$(INTDIR)\cdjpeg.obj"
- -@erase "$(OUTDIR)\cjpeg.exe"
-
-"$(OUTDIR)" :
- if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/cjpeg.pch" /YX /Fo"$(INTDIR)/" /c
-CPP_OBJS=.\cjpeg\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/cjpeg.bsc"
-BSC32_SBRS= \
-
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/cjpeg.pdb" /machine:I386 /out:"$(OUTDIR)/cjpeg.exe"
-LINK32_OBJS= \
- "$(INTDIR)\cjpeg.obj" \
- "$(INTDIR)\rdppm.obj" \
- "$(INTDIR)\rdgif.obj" \
- "$(INTDIR)\rdtarga.obj" \
- "$(INTDIR)\rdrle.obj" \
- "$(INTDIR)\rdbmp.obj" \
- "$(INTDIR)\rdswitch.obj" \
- "$(INTDIR)\cdjpeg.obj" \
-
-
-"$(OUTDIR)\cjpeg.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
- $(LINK32) @<<
- $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF "$(CFG)" == "djpeg - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "djpeg\Release"
-# PROP BASE Intermediate_Dir "djpeg\Release"
-# PROP BASE Target_Dir "djpeg"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "djpeg\Release"
-# PROP Intermediate_Dir "djpeg\Release"
-# PROP Target_Dir "djpeg"
-OUTDIR=.\djpeg\Release
-INTDIR=.\djpeg\Release
-
-ALL : "$(OUTDIR)\djpeg.exe"
-
-CLEAN :
- -@erase "$(INTDIR)\djpeg.obj"
- -@erase "$(INTDIR)\wrppm.obj"
- -@erase "$(INTDIR)\wrgif.obj"
- -@erase "$(INTDIR)\wrtarga.obj"
- -@erase "$(INTDIR)\wrrle.obj"
- -@erase "$(INTDIR)\wrbmp.obj"
- -@erase "$(INTDIR)\rdcolmap.obj"
- -@erase "$(INTDIR)\cdjpeg.obj"
- -@erase "$(OUTDIR)\djpeg.exe"
-
-"$(OUTDIR)" :
- if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/djpeg.pch" /YX /Fo"$(INTDIR)/" /c
-CPP_OBJS=.\djpeg\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/djpeg.bsc"
-BSC32_SBRS= \
-
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/djpeg.pdb" /machine:I386 /out:"$(OUTDIR)/djpeg.exe"
-LINK32_OBJS= \
- "$(INTDIR)\djpeg.obj" \
- "$(INTDIR)\wrppm.obj" \
- "$(INTDIR)\wrgif.obj" \
- "$(INTDIR)\wrtarga.obj" \
- "$(INTDIR)\wrrle.obj" \
- "$(INTDIR)\wrbmp.obj" \
- "$(INTDIR)\rdcolmap.obj" \
- "$(INTDIR)\cdjpeg.obj" \
-
-
-"$(OUTDIR)\djpeg.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
- $(LINK32) @<<
- $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF "$(CFG)" == "jpegtran - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "jpegtran\Release"
-# PROP BASE Intermediate_Dir "jpegtran\Release"
-# PROP BASE Target_Dir "jpegtran"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "jpegtran\Release"
-# PROP Intermediate_Dir "jpegtran\Release"
-# PROP Target_Dir "jpegtran"
-OUTDIR=.\jpegtran\Release
-INTDIR=.\jpegtran\Release
-
-ALL : "$(OUTDIR)\jpegtran.exe"
-
-CLEAN :
- -@erase "$(INTDIR)\jpegtran.obj"
- -@erase "$(INTDIR)\rdswitch.obj"
- -@erase "$(INTDIR)\cdjpeg.obj"
- -@erase "$(INTDIR)\transupp.obj"
- -@erase "$(OUTDIR)\jpegtran.exe"
-
-"$(OUTDIR)" :
- if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/jpegtran.pch" /YX /Fo"$(INTDIR)/" /c
-CPP_OBJS=.\jpegtran\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/jpegtran.bsc"
-BSC32_SBRS= \
-
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/jpegtran.pdb" /machine:I386 /out:"$(OUTDIR)/jpegtran.exe"
-LINK32_OBJS= \
- "$(INTDIR)\jpegtran.obj" \
- "$(INTDIR)\rdswitch.obj" \
- "$(INTDIR)\cdjpeg.obj" \
- "$(INTDIR)\transupp.obj" \
-
-
-"$(OUTDIR)\jpegtran.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
- $(LINK32) @<<
- $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF "$(CFG)" == "rdjpgcom - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "rdjpgcom\Release"
-# PROP BASE Intermediate_Dir "rdjpgcom\Release"
-# PROP BASE Target_Dir "rdjpgcom"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "rdjpgcom\Release"
-# PROP Intermediate_Dir "rdjpgcom\Release"
-# PROP Target_Dir "rdjpgcom"
-OUTDIR=.\rdjpgcom\Release
-INTDIR=.\rdjpgcom\Release
-
-ALL : "$(OUTDIR)\rdjpgcom.exe"
-
-CLEAN :
- -@erase "$(INTDIR)\rdjpgcom.obj"
- -@erase "$(OUTDIR)\rdjpgcom.exe"
-
-"$(OUTDIR)" :
- if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/rdjpgcom.pch" /YX /Fo"$(INTDIR)/" /c
-CPP_OBJS=.\rdjpgcom\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/rdjpgcom.bsc"
-BSC32_SBRS= \
-
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/rdjpgcom.pdb" /machine:I386 /out:"$(OUTDIR)/rdjpgcom.exe"
-LINK32_OBJS= \
- "$(INTDIR)\rdjpgcom.obj"
-
-"$(OUTDIR)\rdjpgcom.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
- $(LINK32) @<<
- $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ELSEIF "$(CFG)" == "wrjpgcom - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "wrjpgcom\Release"
-# PROP BASE Intermediate_Dir "wrjpgcom\Release"
-# PROP BASE Target_Dir "wrjpgcom"
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "wrjpgcom\Release"
-# PROP Intermediate_Dir "wrjpgcom\Release"
-# PROP Target_Dir "wrjpgcom"
-OUTDIR=.\wrjpgcom\Release
-INTDIR=.\wrjpgcom\Release
-
-ALL : "$(OUTDIR)\wrjpgcom.exe"
-
-CLEAN :
- -@erase "$(INTDIR)\wrjpgcom.obj"
- -@erase "$(OUTDIR)\wrjpgcom.exe"
-
-"$(OUTDIR)" :
- if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE"\
- /Fp"$(INTDIR)/wrjpgcom.pch" /YX /Fo"$(INTDIR)/" /c
-CPP_OBJS=.\wrjpgcom\Release/
-CPP_SBRS=.\.
-# ADD BASE RSC /l 0x409 /d "NDEBUG"
-# ADD RSC /l 0x409 /d "NDEBUG"
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/wrjpgcom.bsc"
-BSC32_SBRS= \
-
-LINK32=link.exe
-# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-LINK32_FLAGS=Release\jpeg.lib kernel32.lib user32.lib gdi32.lib winspool.lib\
- comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib\
- odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no\
- /pdb:"$(OUTDIR)/wrjpgcom.pdb" /machine:I386 /out:"$(OUTDIR)/wrjpgcom.exe"
-LINK32_OBJS= \
- "$(INTDIR)\wrjpgcom.obj"
-
-"$(OUTDIR)\wrjpgcom.exe" : "$(OUTDIR)" $(DEF_FILE) $(LINK32_OBJS)
- $(LINK32) @<<
- $(LINK32_FLAGS) $(LINK32_OBJS)
-<<
-
-!ENDIF
-
-.c{$(CPP_OBJS)}.obj:
- $(CPP) $(CPP_PROJ) $<
-
-.cpp{$(CPP_OBJS)}.obj:
- $(CPP) $(CPP_PROJ) $<
-
-.cxx{$(CPP_OBJS)}.obj:
- $(CPP) $(CPP_PROJ) $<
-
-.c{$(CPP_SBRS)}.sbr:
- $(CPP) $(CPP_PROJ) $<
-
-.cpp{$(CPP_SBRS)}.sbr:
- $(CPP) $(CPP_PROJ) $<
-
-.cxx{$(CPP_SBRS)}.sbr:
- $(CPP) $(CPP_PROJ) $<
-
-################################################################################
-# Begin Target
-
-# Name "cjpeg - Win32"
-
-!IF "$(CFG)" == "cjpeg - Win32"
-
-!ENDIF
-
-################################################################################
-# Begin Source File
-
-SOURCE="cjpeg.c"
-DEP_CPP_CJPEG=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
- "jversion.h"\
-
-
-"$(INTDIR)\cjpeg.obj" : $(SOURCE) $(DEP_CPP_CJPEG) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="cdjpeg.c"
-DEP_CPP_CDJPE=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\cdjpeg.obj" : $(SOURCE) $(DEP_CPP_CDJPE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdswitch.c"
-DEP_CPP_RDSWI=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdswitch.obj" : $(SOURCE) $(DEP_CPP_RDSWI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdppm.c"
-DEP_CPP_RDPPM=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdppm.obj" : $(SOURCE) $(DEP_CPP_RDPPM) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdgif.c"
-DEP_CPP_RDGIF=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdgif.obj" : $(SOURCE) $(DEP_CPP_RDGIF) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdtarga.c"
-DEP_CPP_RDTAR=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdtarga.obj" : $(SOURCE) $(DEP_CPP_RDTAR) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdbmp.c"
-DEP_CPP_RDBMP=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdbmp.obj" : $(SOURCE) $(DEP_CPP_RDBMP) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdrle.c"
-DEP_CPP_RDRLE=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdrle.obj" : $(SOURCE) $(DEP_CPP_RDRLE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "djpeg - Win32"
-
-!IF "$(CFG)" == "djpeg - Win32"
-
-!ENDIF
-
-################################################################################
-# Begin Source File
-
-SOURCE="djpeg.c"
-DEP_CPP_DJPEG=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
- "jversion.h"\
-
-
-"$(INTDIR)\djpeg.obj" : $(SOURCE) $(DEP_CPP_DJPEG) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="cdjpeg.c"
-DEP_CPP_CDJPE=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\cdjpeg.obj" : $(SOURCE) $(DEP_CPP_CDJPE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdcolmap.c"
-DEP_CPP_RDCOL=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdcolmap.obj" : $(SOURCE) $(DEP_CPP_RDCOL) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrppm.c"
-DEP_CPP_WRPPM=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\wrppm.obj" : $(SOURCE) $(DEP_CPP_WRPPM) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrgif.c"
-DEP_CPP_WRGIF=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\wrgif.obj" : $(SOURCE) $(DEP_CPP_WRGIF) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrtarga.c"
-DEP_CPP_WRTAR=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\wrtarga.obj" : $(SOURCE) $(DEP_CPP_WRTAR) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrbmp.c"
-DEP_CPP_WRBMP=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\wrbmp.obj" : $(SOURCE) $(DEP_CPP_WRBMP) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="wrrle.c"
-DEP_CPP_WRRLE=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\wrrle.obj" : $(SOURCE) $(DEP_CPP_WRRLE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "jpegtran - Win32"
-
-!IF "$(CFG)" == "jpegtran - Win32"
-
-!ENDIF
-
-################################################################################
-# Begin Source File
-
-SOURCE="jpegtran.c"
-DEP_CPP_JPEGT=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
- "transupp.h"\
- "jversion.h"\
-
-
-"$(INTDIR)\jpegtran.obj" : $(SOURCE) $(DEP_CPP_JPEGT) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="cdjpeg.c"
-DEP_CPP_CDJPE=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\cdjpeg.obj" : $(SOURCE) $(DEP_CPP_CDJPE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="rdswitch.c"
-DEP_CPP_RDSWI=\
- "cdjpeg.h"\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
- "cderror.h"\
-
-
-"$(INTDIR)\rdswitch.obj" : $(SOURCE) $(DEP_CPP_RDSWI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="transupp.c"
-DEP_CPP_TRANS=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "transupp.h"\
-
-
-"$(INTDIR)\transupp.obj" : $(SOURCE) $(DEP_CPP_TRANS) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "rdjpgcom - Win32"
-
-!IF "$(CFG)" == "rdjpgcom - Win32"
-
-!ENDIF
-
-################################################################################
-# Begin Source File
-
-SOURCE="rdjpgcom.c"
-DEP_CPP_RDJPG=\
- "jinclude.h"\
- "jconfig.h"\
-
-
-"$(INTDIR)\rdjpgcom.obj" : $(SOURCE) $(DEP_CPP_RDJPG) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-################################################################################
-# Begin Target
-
-# Name "wrjpgcom - Win32"
-
-!IF "$(CFG)" == "wrjpgcom - Win32"
-
-!ENDIF
-
-################################################################################
-# Begin Source File
-
-SOURCE="wrjpgcom.c"
-DEP_CPP_WRJPG=\
- "jinclude.h"\
- "jconfig.h"\
-
-
-"$(INTDIR)\wrjpgcom.obj" : $(SOURCE) $(DEP_CPP_WRJPG) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-# End Project
-################################################################################
-
diff --git a/makefile.ansi b/makefile.ansi
deleted file mode 100644
index 8291913..0000000
--- a/makefile.ansi
+++ /dev/null
@@ -1,214 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Unix-like systems with ANSI-capable compilers.
-# If you have a non-ANSI compiler, makefile.unix is a better starting point.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-CFLAGS= -O
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-LDFLAGS=
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS=
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= $(CC)
-# file deletion command
-RM= rm -f
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
- jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
- jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
- jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
- jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
- jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
- jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
- cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
- cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.a cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-libjpeg.a: $(LIBOBJECTS)
- $(RM) libjpeg.a
- $(AR) libjpeg.a $(LIBOBJECTS)
- $(AR2) libjpeg.a
-
-cjpeg: $(COBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
- $(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
- $(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean:
- $(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
- $(RM) core testout*
-
-test: cjpeg djpeg jpegtran
- $(RM) testout*
- ./djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- ./djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- ./cjpeg -dct int -outfile testout.jpg testimg.ppm
- ./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- ./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- ./jpegtran -outfile testoutt.jpg testprog.jpg
- cmp testimg.ppm testout.ppm
- cmp testimg.bmp testout.bmp
- cmp testimg.jpg testout.jpg
- cmp testimg.ppm testoutp.ppm
- cmp testimgp.jpg testoutp.jpg
- cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.bcc b/makefile.bcc
deleted file mode 100644
index a1cfcde..0000000
--- a/makefile.bcc
+++ /dev/null
@@ -1,285 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Borland C on MS-DOS or OS/2.
-# It works with Borland C++ for DOS, revision 3.0 or later,
-# and has been tested with Borland C++ for OS/2.
-# Watch out for optimization bugs in the OS/2 compilers --- see notes below!
-# Thanks to Tom Wright and Ge' Weijers (original DOS) and
-# Ken Porter (OS/2) for this file.
-
-# Read installation instructions before saying "make" !!
-
-# Are we under DOS or OS/2?
-!if !$d(DOS) && !$d(OS2)
-!if $d(__OS2__)
-OS2=1
-!else
-DOS=1
-!endif
-!endif
-
-# The name of your C compiler:
-CC= bcc
-
-# You may need to adjust these cc options:
-!if $d(DOS)
-CFLAGS= -O2 -mm -w-par -w-stu -w-ccc -w-rch
-!else
-CFLAGS= -O1 -w-par -w-stu -w-ccc -w-rch
-!endif
-# -O2 enables full code optimization (for pre-3.0 Borland C++, use -O -G -Z).
-# -O2 is buggy in Borland OS/2 C++ revision 2.0, so use -O1 there for now.
-# If you have Borland OS/2 C++ revision 1.0, use -O or no optimization at all.
-# -mm selects medium memory model (near data, far code pointers; DOS only!)
-# -w-par suppresses warnings about unused function parameters
-# -w-stu suppresses warnings about incomplete structures
-# -w-ccc suppresses warnings about compile-time-constant conditions
-# -w-rch suppresses warnings about unreachable code
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-!if $d(DOS)
-LDFLAGS= -mm
-# memory model option here must match CFLAGS!
-!else
-LDFLAGS=
-# -lai full-screen app
-# -lc case-significant link
-!endif
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.
-# For DOS, we recommend jmemdos.c and jmemdosa.asm.
-# For OS/2, we recommend jmemnobs.c (flat memory!)
-# SYSDEPMEMLIB must list the same files with "+" signs for the librarian.
-!if $d(DOS)
-SYSDEPMEM= jmemdos.obj jmemdosa.obj
-SYSDEPMEMLIB= +jmemdos.obj +jmemdosa.obj
-!else
-SYSDEPMEM= jmemnobs.obj
-SYSDEPMEMLIB= +jmemnobs.obj
-!endif
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
- jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
- jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
- jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
- jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
- jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
- jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
- jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
- rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
- rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS)
- - del libjpeg.lib
- tlib libjpeg.lib /E /C @&&|
-+jcapimin.obj +jcapistd.obj +jctrans.obj +jcparam.obj +jdatadst.obj &
-+jcinit.obj +jcmaster.obj +jcmarker.obj +jcmainct.obj +jcprepct.obj &
-+jccoefct.obj +jccolor.obj +jcsample.obj +jchuff.obj +jcphuff.obj &
-+jcdctmgr.obj +jfdctfst.obj +jfdctflt.obj +jfdctint.obj +jdapimin.obj &
-+jdapistd.obj +jdtrans.obj +jdatasrc.obj +jdmaster.obj +jdinput.obj &
-+jdmarker.obj +jdhuff.obj +jdphuff.obj +jdmainct.obj +jdcoefct.obj &
-+jdpostct.obj +jddctmgr.obj +jidctfst.obj +jidctflt.obj +jidctint.obj &
-+jidctred.obj +jdsample.obj +jdcolor.obj +jquant1.obj +jquant2.obj &
-+jdmerge.obj +jcomapi.obj +jutils.obj +jerror.obj +jmemmgr.obj &
-$(SYSDEPMEMLIB)
-|
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
- $(CC) $(LDFLAGS) -ecjpeg.exe $(COBJECTS) libjpeg.lib
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
- $(CC) $(LDFLAGS) -edjpeg.exe $(DOBJECTS) libjpeg.lib
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
- $(CC) $(LDFLAGS) -ejpegtran.exe $(TROBJECTS) libjpeg.lib
-
-rdjpgcom.exe: rdjpgcom.c
-!if $d(DOS)
- $(CC) -ms -O rdjpgcom.c
-!else
- $(CC) $(CFLAGS) rdjpgcom.c
-!endif
-
-# On DOS, wrjpgcom needs large model so it can malloc a 64K chunk
-wrjpgcom.exe: wrjpgcom.c
-!if $d(DOS)
- $(CC) -ml -O wrjpgcom.c
-!else
- $(CC) $(CFLAGS) wrjpgcom.c
-!endif
-
-# This "{}" syntax allows Borland Make to "batch" source files.
-# In this way, each run of the compiler can build many modules.
-.c.obj:
- $(CC) $(CFLAGS) -c{ $<}
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean:
- - del *.obj
- - del libjpeg.lib
- - del cjpeg.exe
- - del djpeg.exe
- - del jpegtran.exe
- - del rdjpgcom.exe
- - del wrjpgcom.exe
- - del testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
- - del testout*.*
- djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- cjpeg -dct int -outfile testout.jpg testimg.ppm
- djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- jpegtran -outfile testoutt.jpg testprog.jpg
-!if $d(DOS)
- fc /b testimg.ppm testout.ppm
- fc /b testimg.bmp testout.bmp
- fc /b testimg.jpg testout.jpg
- fc /b testimg.ppm testoutp.ppm
- fc /b testimgp.jpg testoutp.jpg
- fc /b testorig.jpg testoutt.jpg
-!else
- echo n > n.tmp
- comp testimg.ppm testout.ppm < n.tmp
- comp testimg.bmp testout.bmp < n.tmp
- comp testimg.jpg testout.jpg < n.tmp
- comp testimg.ppm testoutp.ppm < n.tmp
- comp testimgp.jpg testoutp.jpg < n.tmp
- comp testorig.jpg testoutt.jpg < n.tmp
- del n.tmp
-!endif
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-jmemdosa.obj: jmemdosa.asm
- tasm /mx jmemdosa.asm
diff --git a/makefile.cfg b/makefile.cfg
deleted file mode 100644
index f25e42e..0000000
--- a/makefile.cfg
+++ /dev/null
@@ -1,319 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# makefile.cfg is edited by configure to produce a custom Makefile.
-
-# Read installation instructions before saying "make" !!
-
-# For compiling with source and object files in different directories.
-srcdir = @srcdir@
-VPATH = @srcdir@
-
-# Where to install the programs and man pages.
-prefix = @prefix@
-exec_prefix = @exec_prefix@
-bindir = $(exec_prefix)/bin
-libdir = $(exec_prefix)/lib
-includedir = $(prefix)/include
-binprefix =
-manprefix =
-manext = 1
-mandir = $(prefix)/man/man$(manext)
-
-# The name of your C compiler:
-CC= @CC@
-
-# You may need to adjust these cc options:
-CFLAGS= @CFLAGS@ @CPPFLAGS@ @INCLUDEFLAGS@
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-# However, any special defines for ansi2knr.c may be included here:
-ANSI2KNRFLAGS= @ANSI2KNRFLAGS@
-
-# Link-time cc options:
-LDFLAGS= @LDFLAGS@
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= @LIBS@
-
-# If using GNU libtool, LIBTOOL references it; if not, LIBTOOL is empty.
-LIBTOOL = @LIBTOOL@
-# $(O) expands to "lo" if using libtool, plain "o" if not.
-# Similarly, $(A) expands to "la" or "a".
-O = @O@
-A = @A@
-
-# Library version ID; libtool uses this for the shared library version number.
-# Note: we suggest this match the macro of the same name in jpeglib.h.
-JPEG_LIB_VERSION = @JPEG_LIB_VERSION@
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= @MEMORYMGR@
-
-# miscellaneous OS-dependent stuff
-SHELL= /bin/sh
-# linker
-LN= @LN@
-# file deletion command
-RM= rm -f
-# directory creation command
-MKDIR= mkdir
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= @RANLIB@
-# installation program
-INSTALL= @INSTALL@
-INSTALL_PROGRAM= @INSTALL_PROGRAM@
-INSTALL_LIB= @INSTALL_LIB@
-INSTALL_DATA= @INSTALL_DATA@
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.$(O) jcapistd.$(O) jctrans.$(O) jcparam.$(O) \
- jdatadst.$(O) jcinit.$(O) jcmaster.$(O) jcmarker.$(O) jcmainct.$(O) \
- jcprepct.$(O) jccoefct.$(O) jccolor.$(O) jcsample.$(O) jchuff.$(O) \
- jcphuff.$(O) jcdctmgr.$(O) jfdctfst.$(O) jfdctflt.$(O) \
- jfdctint.$(O)
-# decompression library object files
-DLIBOBJECTS= jdapimin.$(O) jdapistd.$(O) jdtrans.$(O) jdatasrc.$(O) \
- jdmaster.$(O) jdinput.$(O) jdmarker.$(O) jdhuff.$(O) jdphuff.$(O) \
- jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) \
- jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \
- jdsample.$(O) jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O)
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.$(O) rdppm.$(O) rdgif.$(O) rdtarga.$(O) rdrle.$(O) \
- rdbmp.$(O) rdswitch.$(O) cdjpeg.$(O)
-DOBJECTS= djpeg.$(O) wrppm.$(O) wrgif.$(O) wrtarga.$(O) wrrle.$(O) \
- wrbmp.$(O) rdcolmap.$(O) cdjpeg.$(O)
-TROBJECTS= jpegtran.$(O) rdswitch.$(O) cdjpeg.$(O) transupp.$(O)
-
-
-all: @A2K_DEPS@ libjpeg.$(A) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-# Special compilation rules to support ansi2knr and libtool.
-.SUFFIXES: .lo .la
-
-# How to compile with libtool.
-@COM_LT@.c.lo:
-@COM_LT@ $(LIBTOOL) --mode=compile $(CC) $(CFLAGS) -c $(srcdir)/$*.c
-
-# How to use ansi2knr, when not using libtool.
-@COM_A2K@.c.o:
-@COM_A2K@ ./ansi2knr $(srcdir)/$*.c knr/$*.c
-@COM_A2K@ $(CC) $(CFLAGS) -c knr/$*.c
-@COM_A2K@ $(RM) knr/$*.c
-
-# How to use ansi2knr AND libtool.
-@COM_A2K@.c.lo:
-@COM_A2K@ ./ansi2knr $(srcdir)/$*.c knr/$*.c
-@COM_A2K@ $(LIBTOOL) --mode=compile $(CC) $(CFLAGS) -c knr/$*.c
-@COM_A2K@ $(RM) knr/$*.c
-
-ansi2knr: ansi2knr.c
- $(CC) $(CFLAGS) $(ANSI2KNRFLAGS) -o ansi2knr $(srcdir)/ansi2knr.c
- $(MKDIR) knr
-
-# the library:
-
-# without libtool:
-libjpeg.a: @A2K_DEPS@ $(LIBOBJECTS)
- $(RM) libjpeg.a
- $(AR) libjpeg.a $(LIBOBJECTS)
- $(AR2) libjpeg.a
-
-# with libtool:
-libjpeg.la: @A2K_DEPS@ $(LIBOBJECTS)
- $(LIBTOOL) --mode=link $(CC) -o libjpeg.la $(LIBOBJECTS) \
- -rpath $(libdir) -version-info $(JPEG_LIB_VERSION)
-
-# sample programs:
-
-cjpeg: $(COBJECTS) libjpeg.$(A)
- $(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.$(A) $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.$(A)
- $(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.$(A) $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.$(A)
- $(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.$(A) $(LDLIBS)
-
-rdjpgcom: rdjpgcom.$(O)
- $(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.$(O) $(LDLIBS)
-
-wrjpgcom: wrjpgcom.$(O)
- $(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.$(O) $(LDLIBS)
-
-# Installation rules:
-
-install: cjpeg djpeg jpegtran rdjpgcom wrjpgcom @FORCE_INSTALL_LIB@
- $(INSTALL_PROGRAM) cjpeg $(bindir)/$(binprefix)cjpeg
- $(INSTALL_PROGRAM) djpeg $(bindir)/$(binprefix)djpeg
- $(INSTALL_PROGRAM) jpegtran $(bindir)/$(binprefix)jpegtran
- $(INSTALL_PROGRAM) rdjpgcom $(bindir)/$(binprefix)rdjpgcom
- $(INSTALL_PROGRAM) wrjpgcom $(bindir)/$(binprefix)wrjpgcom
- $(INSTALL_DATA) $(srcdir)/cjpeg.1 $(mandir)/$(manprefix)cjpeg.$(manext)
- $(INSTALL_DATA) $(srcdir)/djpeg.1 $(mandir)/$(manprefix)djpeg.$(manext)
- $(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
- $(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
- $(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
-
-install-lib: libjpeg.$(A) install-headers
- $(INSTALL_LIB) libjpeg.$(A) $(libdir)/$(binprefix)libjpeg.$(A)
-
-install-headers: jconfig.h
- $(INSTALL_DATA) jconfig.h $(includedir)/jconfig.h
- $(INSTALL_DATA) $(srcdir)/jpeglib.h $(includedir)/jpeglib.h
- $(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
- $(INSTALL_DATA) $(srcdir)/jerror.h $(includedir)/jerror.h
-
-clean:
- $(RM) *.o *.lo libjpeg.a libjpeg.la
- $(RM) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
- $(RM) ansi2knr core testout* config.log config.status
- $(RM) -r knr .libs _libs
-
-distclean: clean
- $(RM) Makefile jconfig.h libtool config.cache
-
-test: cjpeg djpeg jpegtran
- $(RM) testout*
- ./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)/testorig.jpg
- ./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)/testorig.jpg
- ./cjpeg -dct int -outfile testout.jpg $(srcdir)/testimg.ppm
- ./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
- ./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
- ./jpegtran -outfile testoutt.jpg $(srcdir)/testprog.jpg
- cmp $(srcdir)/testimg.ppm testout.ppm
- cmp $(srcdir)/testimg.bmp testout.bmp
- cmp $(srcdir)/testimg.jpg testout.jpg
- cmp $(srcdir)/testimg.ppm testoutp.ppm
- cmp $(srcdir)/testimgp.jpg testoutp.jpg
- cmp $(srcdir)/testorig.jpg testoutt.jpg
-
-check: test
-
-# Mistake catcher:
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-# GNU Make likes to know which target names are not really files to be made:
-.PHONY: all install install-lib install-headers clean distclean test check
-
-
-jcapimin.$(O): jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.$(O): jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.$(O): jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.$(O): jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.$(O): jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.$(O): jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.$(O): jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.$(O): jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.$(O): jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.$(O): jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.$(O): jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.$(O): jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.$(O): jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.$(O): jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.$(O): jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.$(O): jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.$(O): jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.$(O): jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.$(O): jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.$(O): jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.$(O): jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.$(O): jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.$(O): jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.$(O): jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.$(O): jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.$(O): jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.$(O): jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.$(O): jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.$(O): jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.$(O): jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.$(O): jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.$(O): jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.$(O): jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.$(O): jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.$(O): jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.$(O): jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.$(O): jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.$(O): jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.$(O): jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.$(O): jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.$(O): jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.$(O): jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.$(O): jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.$(O): jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.$(O): jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.$(O): cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.$(O): djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.$(O): jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.$(O): rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.$(O): wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.$(O): cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.$(O): rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.$(O): rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.$(O): transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.$(O): rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.$(O): wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.$(O): rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.$(O): wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.$(O): rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.$(O): wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.$(O): rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.$(O): wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.$(O): rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.$(O): wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.dj b/makefile.dj
deleted file mode 100644
index f766d25..0000000
--- a/makefile.dj
+++ /dev/null
@@ -1,220 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for DJGPP (Delorie's GNU C port on MS-DOS), v2.0 or later.
-# Thanks to Frank J. Donahoe for this version.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= gcc
-
-# You may need to adjust these cc options:
-CFLAGS= -O2 -Wall -I.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-LDFLAGS= -s
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS=
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For DJGPP this is usually jmemnobs.o, but you could
-# use jmemname.o if you want to use named temp files instead of swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= $(CC)
-# file deletion command
-RM= del
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
- jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
- jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
- jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
- jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
- jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
- jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
- cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
- cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.a cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.a: $(LIBOBJECTS)
- $(RM) libjpeg.a
- $(AR) libjpeg.a $(LIBOBJECTS)
- $(AR2) libjpeg.a
-
-cjpeg.exe: $(COBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o cjpeg.exe $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg.exe: $(DOBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o djpeg.exe $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran.exe: $(TROBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o jpegtran.exe $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom.exe: rdjpgcom.o
- $(LN) $(LDFLAGS) -o rdjpgcom.exe rdjpgcom.o $(LDLIBS)
-
-wrjpgcom.exe: wrjpgcom.o
- $(LN) $(LDFLAGS) -o wrjpgcom.exe wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean:
- $(RM) *.o
- $(RM) cjpeg.exe
- $(RM) djpeg.exe
- $(RM) jpegtran.exe
- $(RM) rdjpgcom.exe
- $(RM) wrjpgcom.exe
- $(RM) libjpeg.a
- $(RM) testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
- $(RM) testout*.*
- ./djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- ./djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- ./cjpeg -dct int -outfile testout.jpg testimg.ppm
- ./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- ./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- ./jpegtran -outfile testoutt.jpg testprog.jpg
- fc /b testimg.ppm testout.ppm
- fc /b testimg.bmp testout.bmp
- fc /b testimg.jpg testout.jpg
- fc /b testimg.ppm testoutp.ppm
- fc /b testimgp.jpg testoutp.jpg
- fc /b testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.manx b/makefile.manx
deleted file mode 100644
index 4cb42d1..0000000
--- a/makefile.manx
+++ /dev/null
@@ -1,214 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Amiga systems using Manx Aztec C ver 5.x.
-# Thanks to D.J. James (djjames@cup.portal.com) for this version.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-# Uncomment for generic 68000 code (will work on any Amiga)
-ARCHFLAGS= -sn
-
-# Uncomment for 68020/68030 code (faster, but won't run on 68000 CPU)
-#ARCHFLAGS= -c2
-
-CFLAGS= -MC -MD $(ARCHFLAGS) -spfam -r4
-
-# Link-time cc options:
-LDFLAGS= -g
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= -lml -lcl
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For Amiga we recommend jmemname.o.
-SYSDEPMEM= jmemname.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= ln
-# file deletion command
-RM= delete quiet
-# library (.lib) file creation command
-AR= lb
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
- jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
- jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
- jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
- jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
- jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
- jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
- cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
- cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.lib cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-libjpeg.lib: $(LIBOBJECTS)
- -$(RM) libjpeg.lib
- $(AR) libjpeg.lib $(LIBOBJECTS)
-
-cjpeg: $(COBJECTS) libjpeg.lib
- $(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.lib $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.lib
- $(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.lib $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.lib
- $(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.lib $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
- $(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
- $(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean:
- -$(RM) *.o cjpeg djpeg jpegtran libjpeg.lib rdjpgcom wrjpgcom
- -$(RM) core testout*.*
-
-test: cjpeg djpeg jpegtran
- -$(RM) testout*.*
- djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- cjpeg -dct int -outfile testout.jpg testimg.ppm
- djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- jpegtran -outfile testoutt.jpg testprog.jpg
- cmp testimg.ppm testout.ppm
- cmp testimg.bmp testout.bmp
- cmp testimg.jpg testout.jpg
- cmp testimg.ppm testoutp.ppm
- cmp testimgp.jpg testoutp.jpg
- cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.mc6 b/makefile.mc6
deleted file mode 100644
index 6aff054..0000000
--- a/makefile.mc6
+++ /dev/null
@@ -1,249 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Microsoft C for MS-DOS, version 6.00A and up.
-# Use NMAKE, not Microsoft's brain-damaged MAKE.
-# Thanks to Alan Wright and Chris Turner of Olivetti Research Ltd.
-
-# Read installation instructions before saying "nmake" !!
-
-# You may need to adjust these compiler options:
-CFLAGS = -AM -Oecigt -Gs -W3
-# -AM medium memory model (or use -AS for small model, if you remove features)
-# -Oecigt -Gs maximum safe optimisation (-Ol has bugs in MSC 6.00A)
-# -W3 warning level 3
-# You might also want to add -G2 if you have an 80286, etc.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Jan-Herman Buining suggests the following switches for MS C 8.0 and a 486:
-# CFLAGS = /AM /f- /FPi87 /G3 /Gs /Gy /Ob1 /Oc /Oe /Og /Oi /Ol /On /Oo /Ot \
-# /OV4 /W3
-# except for jquant1.c, which must be compiled with /Oo- to avoid a compiler
-# crash.
-
-# Ingar Steinsland suggests the following switches when building
-# a 16-bit Windows DLL:
-# CFLAGS = -ALw -Gsw -Zpe -W3 -O2 -Zi -Zd
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For DOS, we recommend jmemdos.c and jmemdosa.asm.
-# (But not for Windows; see install.doc if you use this makefile for Windows.)
-SYSDEPMEM= jmemdos.obj jmemdosa.obj
-# SYSDEPMEMLIB must list the same files with "+" signs for the librarian.
-SYSDEPMEMLIB= +jmemdos.obj +jmemdosa.obj
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
- jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
- jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
- jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
- jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
- jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
- jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
- jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
- rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
- rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-# need linker response file because file list > 128 chars
-RFILE = libjpeg.ans
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS) $(RFILE)
- del libjpeg.lib
- lib @$(RFILE)
-
-# linker response file for building libjpeg.lib
-$(RFILE) : makefile
- del $(RFILE)
- echo libjpeg.lib >$(RFILE)
-# silly want-to-create-it prompt:
- echo y >>$(RFILE)
- echo +jcapimin.obj +jcapistd.obj +jctrans.obj +jcparam.obj & >>$(RFILE)
- echo +jdatadst.obj +jcinit.obj +jcmaster.obj +jcmarker.obj & >>$(RFILE)
- echo +jcmainct.obj +jcprepct.obj +jccoefct.obj & >>$(RFILE)
- echo +jccolor.obj +jcsample.obj +jchuff.obj +jcphuff.obj & >>$(RFILE)
- echo +jcdctmgr.obj +jfdctfst.obj +jfdctflt.obj & >>$(RFILE)
- echo +jfdctint.obj +jdapimin.obj +jdapistd.obj & >>$(RFILE)
- echo +jdtrans.obj +jdatasrc.obj +jdmaster.obj +jdinput.obj & >>$(RFILE)
- echo +jdmarker.obj +jdhuff.obj +jdphuff.obj +jdmainct.obj & >>$(RFILE)
- echo +jdcoefct.obj +jdpostct.obj +jddctmgr.obj & >>$(RFILE)
- echo +jidctfst.obj +jidctflt.obj +jidctint.obj & >>$(RFILE)
- echo +jidctred.obj +jdsample.obj +jdcolor.obj +jquant1.obj & >>$(RFILE)
- echo +jquant2.obj +jdmerge.obj +jcomapi.obj +jutils.obj & >>$(RFILE)
- echo +jerror.obj +jmemmgr.obj & >>$(RFILE)
- echo $(SYSDEPMEMLIB) ; >>$(RFILE)
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
- echo $(COBJECTS) >cjpeg.lst
- link /STACK:4096 /EXEPACK @cjpeg.lst, cjpeg.exe, , libjpeg.lib, ;
- del cjpeg.lst
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
- echo $(DOBJECTS) >djpeg.lst
- link /STACK:4096 /EXEPACK @djpeg.lst, djpeg.exe, , libjpeg.lib, ;
- del djpeg.lst
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
- link /STACK:4096 /EXEPACK $(TROBJECTS), jpegtran.exe, , libjpeg.lib, ;
-
-rdjpgcom.exe: rdjpgcom.c
- $(CC) -AS -O -W3 rdjpgcom.c
-
-# wrjpgcom needs large model so it can malloc a 64K chunk
-wrjpgcom.exe: wrjpgcom.c
- $(CC) -AL -O -W3 wrjpgcom.c
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean:
- del *.obj
- del libjpeg.lib
- del cjpeg.exe
- del djpeg.exe
- del jpegtran.exe
- del rdjpgcom.exe
- del wrjpgcom.exe
- del testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
- del testout*.*
- djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- cjpeg -dct int -outfile testout.jpg testimg.ppm
- djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- jpegtran -outfile testoutt.jpg testprog.jpg
- fc /b testimg.ppm testout.ppm
- fc /b testimg.bmp testout.bmp
- fc /b testimg.jpg testout.jpg
- fc /b testimg.ppm testoutp.ppm
- fc /b testimgp.jpg testoutp.jpg
- fc /b testorig.jpg testoutt.jpg
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-jmemdosa.obj : jmemdosa.asm
- masm /mx $*;
diff --git a/makefile.mms b/makefile.mms
deleted file mode 100644
index cf130e5..0000000
--- a/makefile.mms
+++ /dev/null
@@ -1,218 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for use with MMS on Digital VMS systems.
-# Thanks to Rick Dyson (dyson@iowasp.physics.uiowa.edu)
-# and Tim Bell (tbell@netcom.com) for their help.
-
-# Read installation instructions before saying "MMS" !!
-
-# You may need to adjust these cc options:
-CFLAGS= $(CFLAGS) /NoDebug /Optimize
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via /Define switches here.
-.ifdef ALPHA
-OPT=
-.else
-OPT= ,Sys$Disk:[]MAKVMS.OPT/Option
-.endif
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.obj
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
- jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
- jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
- jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
- jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
- jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
- jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
- jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.olb
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
- rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
- rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-# objectfile lists with commas --- what a crock
-COBJLIST= cjpeg.obj,rdppm.obj,rdgif.obj,rdtarga.obj,rdrle.obj,rdbmp.obj,\
- rdswitch.obj,cdjpeg.obj
-DOBJLIST= djpeg.obj,wrppm.obj,wrgif.obj,wrtarga.obj,wrrle.obj,wrbmp.obj,\
- rdcolmap.obj,cdjpeg.obj
-TROBJLIST= jpegtran.obj,rdswitch.obj,cdjpeg.obj,transupp.obj
-LIBOBJLIST= jcapimin.obj,jcapistd.obj,jctrans.obj,jcparam.obj,jdatadst.obj,\
- jcinit.obj,jcmaster.obj,jcmarker.obj,jcmainct.obj,jcprepct.obj,\
- jccoefct.obj,jccolor.obj,jcsample.obj,jchuff.obj,jcphuff.obj,\
- jcdctmgr.obj,jfdctfst.obj,jfdctflt.obj,jfdctint.obj,jdapimin.obj,\
- jdapistd.obj,jdtrans.obj,jdatasrc.obj,jdmaster.obj,jdinput.obj,\
- jdmarker.obj,jdhuff.obj,jdphuff.obj,jdmainct.obj,jdcoefct.obj,\
- jdpostct.obj,jddctmgr.obj,jidctfst.obj,jidctflt.obj,jidctint.obj,\
- jidctred.obj,jdsample.obj,jdcolor.obj,jquant1.obj,jquant2.obj,\
- jdmerge.obj,jcomapi.obj,jutils.obj,jerror.obj,jmemmgr.obj,$(SYSDEPMEM)
-
-
-.first
- @- Define /NoLog Sys Sys$Library
-
-ALL : libjpeg.olb cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
- @ Continue
-
-libjpeg.olb : $(LIBOBJECTS)
- Library /Create libjpeg.olb $(LIBOBJLIST)
-
-cjpeg.exe : $(COBJECTS) libjpeg.olb
- $(LINK) $(LFLAGS) /Executable = cjpeg.exe $(COBJLIST),libjpeg.olb/Library$(OPT)
-
-djpeg.exe : $(DOBJECTS) libjpeg.olb
- $(LINK) $(LFLAGS) /Executable = djpeg.exe $(DOBJLIST),libjpeg.olb/Library$(OPT)
-
-jpegtran.exe : $(TROBJECTS) libjpeg.olb
- $(LINK) $(LFLAGS) /Executable = jpegtran.exe $(TROBJLIST),libjpeg.olb/Library$(OPT)
-
-rdjpgcom.exe : rdjpgcom.obj
- $(LINK) $(LFLAGS) /Executable = rdjpgcom.exe rdjpgcom.obj$(OPT)
-
-wrjpgcom.exe : wrjpgcom.obj
- $(LINK) $(LFLAGS) /Executable = wrjpgcom.exe wrjpgcom.obj$(OPT)
-
-jconfig.h : jconfig.vms
- @- Copy jconfig.vms jconfig.h
-
-clean :
- @- Set Protection = Owner:RWED *.*;-1
- @- Set Protection = Owner:RWED *.OBJ
- - Purge /NoLog /NoConfirm *.*
- - Delete /NoLog /NoConfirm *.OBJ;
-
-test : cjpeg.exe djpeg.exe jpegtran.exe
- mcr sys$disk:[]djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- mcr sys$disk:[]djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- mcr sys$disk:[]cjpeg -dct int -outfile testout.jpg testimg.ppm
- mcr sys$disk:[]djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- mcr sys$disk:[]cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- mcr sys$disk:[]jpegtran -outfile testoutt.jpg testprog.jpg
- - Backup /Compare/Log testimg.ppm testout.ppm
- - Backup /Compare/Log testimg.bmp testout.bmp
- - Backup /Compare/Log testimg.jpg testout.jpg
- - Backup /Compare/Log testimg.ppm testoutp.ppm
- - Backup /Compare/Log testimgp.jpg testoutp.jpg
- - Backup /Compare/Log testorig.jpg testoutt.jpg
-
-
-jcapimin.obj : jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj : jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj : jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj : jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj : jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj : jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj : jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj : jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj : jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj : jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj : jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj : jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj : jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj : jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj : jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj : jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj : jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj : jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj : jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj : jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj : jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj : jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj : jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj : jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj : jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj : jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj : jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj : jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj : jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj : jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj : jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj : jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj : jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj : jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj : jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj : jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj : jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj : jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj : jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj : jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj : jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj : jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj : jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj : jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj : jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj : jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj : jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj : jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj : jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj : jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj : cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj : djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj : jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj : rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj : wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj : cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj : rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj : rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj : transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj : rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj : wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj : rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj : wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj : rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj : wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj : rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj : wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj : rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj : wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.sas b/makefile.sas
deleted file mode 100644
index f296faf..0000000
--- a/makefile.sas
+++ /dev/null
@@ -1,252 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Amiga systems using SAS C 6.0 and up.
-# Thanks to Ed Hanway, Mark Rinfret, and Jim Zepeda.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= sc
-
-# You may need to adjust these cc options:
-# Uncomment the following lines for generic 680x0 version
-ARCHFLAGS= cpu=any
-SUFFIX=
-
-# Uncomment the following lines for 68030-only version
-#ARCHFLAGS= cpu=68030
-#SUFFIX=.030
-
-CFLAGS= nostackcheck data=near parms=register optimize $(ARCHFLAGS) \
- ignore=104 ignore=304 ignore=306
-# ignore=104 disables warnings for mismatched const qualifiers
-# ignore=304 disables warnings for variables being optimized out
-# ignore=306 disables warnings for the inlining of functions
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via define switches here.
-
-# Link-time cc options:
-LDFLAGS= SC SD ND BATCH
-
-# To link any special libraries, add the necessary commands here.
-LDLIBS= LIB:scm.lib LIB:sc.lib
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For Amiga we recommend jmemname.o.
-SYSDEPMEM= jmemname.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= slink
-# file deletion command
-RM= delete quiet
-# library (.lib) file creation command
-AR= oml
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
- jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
- jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
- jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
- jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
- jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
- jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
- cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
- cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: libjpeg.lib cjpeg$(SUFFIX) djpeg$(SUFFIX) jpegtran$(SUFFIX) rdjpgcom$(SUFFIX) wrjpgcom$(SUFFIX)
-
-# note: do several AR steps to avoid command line length limitations
-
-libjpeg.lib: $(LIBOBJECTS)
- -$(RM) libjpeg.lib
- $(AR) libjpeg.lib r $(CLIBOBJECTS)
- $(AR) libjpeg.lib r $(DLIBOBJECTS)
- $(AR) libjpeg.lib r $(COMOBJECTS)
-
-cjpeg$(SUFFIX): $(COBJECTS) libjpeg.lib
- $(LN) <WITH <
-$(LDFLAGS)
-TO cjpeg$(SUFFIX)
-FROM LIB:c.o $(COBJECTS)
-LIB libjpeg.lib $(LDLIBS)
-<
-
-djpeg$(SUFFIX): $(DOBJECTS) libjpeg.lib
- $(LN) <WITH <
-$(LDFLAGS)
-TO djpeg$(SUFFIX)
-FROM LIB:c.o $(DOBJECTS)
-LIB libjpeg.lib $(LDLIBS)
-<
-
-jpegtran$(SUFFIX): $(TROBJECTS) libjpeg.lib
- $(LN) <WITH <
-$(LDFLAGS)
-TO jpegtran$(SUFFIX)
-FROM LIB:c.o $(TROBJECTS)
-LIB libjpeg.lib $(LDLIBS)
-<
-
-rdjpgcom$(SUFFIX): rdjpgcom.o
- $(LN) <WITH <
-$(LDFLAGS)
-TO rdjpgcom$(SUFFIX)
-FROM LIB:c.o rdjpgcom.o
-LIB $(LDLIBS)
-<
-
-wrjpgcom$(SUFFIX): wrjpgcom.o
- $(LN) <WITH <
-$(LDFLAGS)
-TO wrjpgcom$(SUFFIX)
-FROM LIB:c.o wrjpgcom.o
-LIB $(LDLIBS)
-<
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean:
- -$(RM) *.o cjpeg djpeg jpegtran cjpeg.030 djpeg.030 jpegtran.030
- -$(RM) rdjpgcom wrjpgcom rdjpgcom.030 wrjpgcom.030
- -$(RM) libjpeg.lib core testout*.*
-
-test: cjpeg djpeg jpegtran
- -$(RM) testout*.*
- djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- cjpeg -dct int -outfile testout.jpg testimg.ppm
- djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- jpegtran -outfile testoutt.jpg testprog.jpg
- cmp testimg.ppm testout.ppm
- cmp testimg.bmp testout.bmp
- cmp testimg.jpg testout.jpg
- cmp testimg.ppm testoutp.ppm
- cmp testimgp.jpg testoutp.jpg
- cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.unix b/makefile.unix
deleted file mode 100644
index 00455ab..0000000
--- a/makefile.unix
+++ /dev/null
@@ -1,228 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Unix-like systems with non-ANSI compilers.
-# If you have an ANSI compiler, makefile.ansi is a better starting point.
-
-# Read installation instructions before saying "make" !!
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-CFLAGS= -O
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-# However, any special defines for ansi2knr.c may be included here:
-ANSI2KNRFLAGS=
-
-# Link-time cc options:
-LDFLAGS=
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS=
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-# linker
-LN= $(CC)
-# file deletion command
-RM= rm -f
-# file rename command
-MV= mv
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
- jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
- jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
- jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
- jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
- jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
- jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
- cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
- cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
-
-
-all: ansi2knr libjpeg.a cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-# This rule causes ansi2knr to be invoked.
-.c.o:
- ./ansi2knr $*.c T$*.c
- $(CC) $(CFLAGS) -c T$*.c
- $(RM) T$*.c $*.o
- $(MV) T$*.o $*.o
-
-ansi2knr: ansi2knr.c
- $(CC) $(CFLAGS) $(ANSI2KNRFLAGS) -o ansi2knr ansi2knr.c
-
-libjpeg.a: ansi2knr $(LIBOBJECTS)
- $(RM) libjpeg.a
- $(AR) libjpeg.a $(LIBOBJECTS)
- $(AR2) libjpeg.a
-
-cjpeg: ansi2knr $(COBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg: ansi2knr $(DOBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran: ansi2knr $(TROBJECTS) libjpeg.a
- $(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
- $(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
- $(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean:
- $(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
- $(RM) ansi2knr core testout*
-
-test: cjpeg djpeg jpegtran
- $(RM) testout*
- ./djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- ./djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- ./cjpeg -dct int -outfile testout.jpg testimg.ppm
- ./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- ./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- ./jpegtran -outfile testoutt.jpg testprog.jpg
- cmp testimg.ppm testout.ppm
- cmp testimg.bmp testout.bmp
- cmp testimg.jpg testout.jpg
- cmp testimg.ppm testoutp.ppm
- cmp testimgp.jpg testoutp.jpg
- cmp testorig.jpg testoutt.jpg
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.vc b/makefile.vc
deleted file mode 100644
index 2acf069..0000000
--- a/makefile.vc
+++ /dev/null
@@ -1,211 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is for Microsoft Visual C++ on Windows NT (and 95?).
-# It builds the IJG library as a statically linkable library (.LIB),
-# and builds the sample applications as console-mode apps.
-# Thanks to Xingong Chang, Raymond Everly and others.
-
-# Read installation instructions before saying "nmake" !!
-# To build an optimized library without debug info, say "nmake nodebug=1".
-
-# Pull in standard variable definitions
-!include <win32.mak>
-
-# You may want to adjust these compiler options:
-CFLAGS= $(cflags) $(cdebug) $(cvars) -I.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time options:
-LDFLAGS= $(ldebug) $(conlflags)
-
-# To link any special libraries, add the necessary commands here.
-LDLIBS= $(conlibs)
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. For NT we suggest jmemnobs.obj, which expects the OS to
-# provide adequate virtual memory.
-SYSDEPMEM= jmemnobs.obj
-
-# miscellaneous OS-dependent stuff
-# file deletion command
-RM= del
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
- jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
- jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
- jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
- jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
- jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
- jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
- jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
- rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
- rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-# Template command for compiling .c to .obj
-.c.obj:
- $(cc) $(CFLAGS) $*.c
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS)
- $(RM) libjpeg.lib
- lib -out:libjpeg.lib $(LIBOBJECTS)
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
- $(link) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
- $(link) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
- $(link) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)
-
-rdjpgcom.exe: rdjpgcom.obj
- $(link) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)
-
-wrjpgcom.exe: wrjpgcom.obj
- $(link) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)
-
-
-clean:
- $(RM) *.obj *.exe libjpeg.lib
- $(RM) testout*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe
- $(RM) testout*
- .\djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- .\djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- .\cjpeg -dct int -outfile testout.jpg testimg.ppm
- .\djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- .\cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- .\jpegtran -outfile testoutt.jpg testprog.jpg
- fc /b testimg.ppm testout.ppm
- fc /b testimg.bmp testout.bmp
- fc /b testimg.jpg testout.jpg
- fc /b testimg.ppm testoutp.ppm
- fc /b testimgp.jpg testoutp.jpg
- fc /b testorig.jpg testoutt.jpg
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.vms b/makefile.vms
deleted file mode 100644
index a42358d..0000000
--- a/makefile.vms
+++ /dev/null
@@ -1,142 +0,0 @@
-$! Makefile for Independent JPEG Group's software
-$!
-$! This is a command procedure for Digital VMS systems that do not have MMS.
-$! It builds the JPEG software by brute force, recompiling everything whether
-$! or not it is necessary. It then runs the basic self-test.
-$! Thanks to Rick Dyson (dyson@iowasp.physics.uiowa.edu)
-$! and Tim Bell (tbell@netcom.com) for their help.
-$!
-$! Read installation instructions before running this!!
-$!
-$ If F$Mode () .eqs. "INTERACTIVE"
-$ Then
-$ VERIFY = F$Verify (0)
-$ Else
-$ VERIFY = F$Verify (1)
-$ EndIf
-$ On Control_Y Then GoTo End
-$ On Error Then GoTo End
-$
-$ If F$GetSyi ("HW_MODEL") .gt. 1023
-$ Then
-$ OPT = ""
-$ Else
-$ OPT = ",Sys$Disk:[]makvms.opt/Option"
-$ EndIf
-$
-$ DoCompile := CC /NoDebug /Optimize /NoList
-$!
-$ DoCompile jcapimin.c
-$ DoCompile jcapistd.c
-$ DoCompile jctrans.c
-$ DoCompile jcparam.c
-$ DoCompile jdatadst.c
-$ DoCompile jcinit.c
-$ DoCompile jcmaster.c
-$ DoCompile jcmarker.c
-$ DoCompile jcmainct.c
-$ DoCompile jcprepct.c
-$ DoCompile jccoefct.c
-$ DoCompile jccolor.c
-$ DoCompile jcsample.c
-$ DoCompile jchuff.c
-$ DoCompile jcphuff.c
-$ DoCompile jcdctmgr.c
-$ DoCompile jfdctfst.c
-$ DoCompile jfdctflt.c
-$ DoCompile jfdctint.c
-$ DoCompile jdapimin.c
-$ DoCompile jdapistd.c
-$ DoCompile jdtrans.c
-$ DoCompile jdatasrc.c
-$ DoCompile jdmaster.c
-$ DoCompile jdinput.c
-$ DoCompile jdmarker.c
-$ DoCompile jdhuff.c
-$ DoCompile jdphuff.c
-$ DoCompile jdmainct.c
-$ DoCompile jdcoefct.c
-$ DoCompile jdpostct.c
-$ DoCompile jddctmgr.c
-$ DoCompile jidctfst.c
-$ DoCompile jidctflt.c
-$ DoCompile jidctint.c
-$ DoCompile jidctred.c
-$ DoCompile jdsample.c
-$ DoCompile jdcolor.c
-$ DoCompile jquant1.c
-$ DoCompile jquant2.c
-$ DoCompile jdmerge.c
-$ DoCompile jcomapi.c
-$ DoCompile jutils.c
-$ DoCompile jerror.c
-$ DoCompile jmemmgr.c
-$ DoCompile jmemnobs.c
-$!
-$ Library /Create libjpeg.olb jcapimin.obj,jcapistd.obj,jctrans.obj, -
- jcparam.obj,jdatadst.obj,jcinit.obj,jcmaster.obj,jcmarker.obj, -
- jcmainct.obj,jcprepct.obj,jccoefct.obj,jccolor.obj,jcsample.obj, -
- jchuff.obj,jcphuff.obj,jcdctmgr.obj,jfdctfst.obj,jfdctflt.obj, -
- jfdctint.obj,jdapimin.obj,jdapistd.obj,jdtrans.obj,jdatasrc.obj, -
- jdmaster.obj,jdinput.obj,jdmarker.obj,jdhuff.obj,jdphuff.obj, -
- jdmainct.obj,jdcoefct.obj,jdpostct.obj,jddctmgr.obj,jidctfst.obj, -
- jidctflt.obj,jidctint.obj,jidctred.obj,jdsample.obj,jdcolor.obj, -
- jquant1.obj,jquant2.obj,jdmerge.obj,jcomapi.obj,jutils.obj, -
- jerror.obj,jmemmgr.obj,jmemnobs.obj
-$!
-$ DoCompile cjpeg.c
-$ DoCompile rdppm.c
-$ DoCompile rdgif.c
-$ DoCompile rdtarga.c
-$ DoCompile rdrle.c
-$ DoCompile rdbmp.c
-$ DoCompile rdswitch.c
-$ DoCompile cdjpeg.c
-$!
-$ Link /NoMap /Executable = cjpeg.exe cjpeg.obj,rdppm.obj,rdgif.obj, -
- rdtarga.obj,rdrle.obj,rdbmp.obj,rdswitch.obj,cdjpeg.obj,libjpeg.olb/Library'OPT'
-$!
-$ DoCompile djpeg.c
-$ DoCompile wrppm.c
-$ DoCompile wrgif.c
-$ DoCompile wrtarga.c
-$ DoCompile wrrle.c
-$ DoCompile wrbmp.c
-$ DoCompile rdcolmap.c
-$ DoCompile cdjpeg.c
-$!
-$ Link /NoMap /Executable = djpeg.exe djpeg.obj,wrppm.obj,wrgif.obj, -
- wrtarga.obj,wrrle.obj,wrbmp.obj,rdcolmap.obj,cdjpeg.obj,libjpeg.olb/Library'OPT'
-$!
-$ DoCompile jpegtran.c
-$ DoCompile rdswitch.c
-$ DoCompile cdjpeg.c
-$ DoCompile transupp.c
-$!
-$ Link /NoMap /Executable = jpegtran.exe jpegtran.obj,rdswitch.obj, -
- cdjpeg.obj,transupp.obj,libjpeg.olb/Library'OPT'
-$!
-$ DoCompile rdjpgcom.c
-$ Link /NoMap /Executable = rdjpgcom.exe rdjpgcom.obj'OPT'
-$!
-$ DoCompile wrjpgcom.c
-$ Link /NoMap /Executable = wrjpgcom.exe wrjpgcom.obj'OPT'
-$!
-$! Run the self-test
-$!
-$ mcr sys$disk:[]djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
-$ mcr sys$disk:[]djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
-$ mcr sys$disk:[]cjpeg -dct int -outfile testout.jpg testimg.ppm
-$ mcr sys$disk:[]djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-$ mcr sys$disk:[]cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-$ mcr sys$disk:[]jpegtran -outfile testoutt.jpg testprog.jpg
-$ Backup /Compare/Log testimg.ppm testout.ppm
-$ Backup /Compare/Log testimg.bmp testout.bmp
-$ Backup /Compare/Log testimg.jpg testout.jpg
-$ Backup /Compare/Log testimg.ppm testoutp.ppm
-$ Backup /Compare/Log testimgp.jpg testoutp.jpg
-$ Backup /Compare/Log testorig.jpg testoutt.jpg
-$!
-$End:
-$ If Verify Then Set Verify
-$ Exit
diff --git a/makefile.wat b/makefile.wat
deleted file mode 100644
index d953e46..0000000
--- a/makefile.wat
+++ /dev/null
@@ -1,233 +0,0 @@
-# Makefile for Independent JPEG Group's software
-
-# This makefile is suitable for Watcom C/C++ 10.0 on MS-DOS (using
-# dos4g extender), OS/2, and Windows NT console mode.
-# Thanks to Janos Haide, jhaide@btrvtech.com.
-
-# Read installation instructions before saying "wmake" !!
-
-# Uncomment line for desired system
-SYSTEM=DOS
-#SYSTEM=OS2
-#SYSTEM=NT
-
-# The name of your C compiler:
-CC= wcl386
-
-# You may need to adjust these cc options:
-CFLAGS= -4r -ort -wx -zq -bt=$(SYSTEM)
-# Caution: avoid -ol or -ox; these generate bad code with 10.0 or 10.0a.
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-
-# Link-time cc options:
-!ifeq SYSTEM DOS
-LDFLAGS= -zq -l=dos4g
-!else ifeq SYSTEM OS2
-LDFLAGS= -zq -l=os2v2
-!else ifeq SYSTEM NT
-LDFLAGS= -zq -l=nt
-!endif
-
-# Put here the object file name for the correct system-dependent memory
-# manager file. jmemnobs should work fine for dos4g or OS/2 environment.
-SYSDEPMEM= jmemnobs.obj
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c &
- jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c &
- jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c &
- jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c &
- jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c &
- jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c &
- jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c &
- jquant2.c jutils.c jmemmgr.c
-# memmgr back ends: compile only one of these into a working library
-SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c &
- rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c &
- rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h &
- jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 &
- wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc &
- coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc &
- makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds &
- makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st &
- maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms &
- makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat &
- jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas &
- jconfig.vms
-CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg &
- testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) &
- $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj &
- jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj &
- jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj &
- jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
-# decompression library object files
-DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj &
- jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj &
- jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj &
- jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj &
- jquant1.obj jquant2.obj jdmerge.obj
-# These objectfiles are included in libjpeg.lib
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj &
- rdswitch.obj cdjpeg.obj
-DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj &
- rdcolmap.obj cdjpeg.obj
-TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
-
-
-all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
-
-libjpeg.lib: $(LIBOBJECTS)
- - del libjpeg.lib
- * wlib -n libjpeg.lib $(LIBOBJECTS)
-
-cjpeg.exe: $(COBJECTS) libjpeg.lib
- $(CC) $(LDFLAGS) $(COBJECTS) libjpeg.lib
-
-djpeg.exe: $(DOBJECTS) libjpeg.lib
- $(CC) $(LDFLAGS) $(DOBJECTS) libjpeg.lib
-
-jpegtran.exe: $(TROBJECTS) libjpeg.lib
- $(CC) $(LDFLAGS) $(TROBJECTS) libjpeg.lib
-
-rdjpgcom.exe: rdjpgcom.c
- $(CC) $(CFLAGS) $(LDFLAGS) rdjpgcom.c
-
-wrjpgcom.exe: wrjpgcom.c
- $(CC) $(CFLAGS) $(LDFLAGS) wrjpgcom.c
-
-.c.obj:
- $(CC) $(CFLAGS) -c $<
-
-jconfig.h: jconfig.doc
- echo You must prepare a system-dependent jconfig.h file.
- echo Please read the installation directions in install.doc.
- exit 1
-
-clean: .SYMBOLIC
- - del *.obj
- - del libjpeg.lib
- - del cjpeg.exe
- - del djpeg.exe
- - del jpegtran.exe
- - del rdjpgcom.exe
- - del wrjpgcom.exe
- - del testout*.*
-
-test: cjpeg.exe djpeg.exe jpegtran.exe .SYMBOLIC
- - del testout*.*
- djpeg -dct int -ppm -outfile testout.ppm testorig.jpg
- djpeg -dct int -bmp -colors 256 -outfile testout.bmp testorig.jpg
- cjpeg -dct int -outfile testout.jpg testimg.ppm
- djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
- cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
- jpegtran -outfile testoutt.jpg testprog.jpg
-!ifeq SYSTEM DOS
- fc /b testimg.ppm testout.ppm
- fc /b testimg.bmp testout.bmp
- fc /b testimg.jpg testout.jpg
- fc /b testimg.ppm testoutp.ppm
- fc /b testimgp.jpg testoutp.jpg
- fc /b testorig.jpg testoutt.jpg
-!else
- echo n > n.tmp
- comp testimg.ppm testout.ppm < n.tmp
- comp testimg.bmp testout.bmp < n.tmp
- comp testimg.jpg testout.jpg < n.tmp
- comp testimg.ppm testoutp.ppm < n.tmp
- comp testimgp.jpg testoutp.jpg < n.tmp
- comp testorig.jpg testoutt.jpg < n.tmp
- del n.tmp
-!endif
-
-
-jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
-rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
-cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
-rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makelib.ds b/makelib.ds
deleted file mode 100644
index c7ad36d..0000000
--- a/makelib.ds
+++ /dev/null
@@ -1,1046 +0,0 @@
-# Microsoft Developer Studio Generated NMAKE File, Format Version 4.20
-# ** DO NOT EDIT **
-
-# TARGTYPE "Win32 (x86) Static Library" 0x0104
-
-!IF "$(CFG)" == ""
-CFG=jpeg - Win32
-!MESSAGE No configuration specified. Defaulting to jpeg - Win32.
-!ENDIF
-
-!IF "$(CFG)" != "jpeg - Win32"
-!MESSAGE Invalid configuration "$(CFG)" specified.
-!MESSAGE You can specify a configuration when running NMAKE on this makefile
-!MESSAGE by defining the macro CFG on the command line. For example:
-!MESSAGE
-!MESSAGE NMAKE /f "jpeg.mak" CFG="jpeg - Win32"
-!MESSAGE
-!MESSAGE Possible choices for configuration are:
-!MESSAGE
-!MESSAGE "jpeg - Win32" (based on "Win32 (x86) Static Library")
-!MESSAGE
-!ERROR An invalid configuration is specified.
-!ENDIF
-
-!IF "$(OS)" == "Windows_NT"
-NULL=
-!ELSE
-NULL=nul
-!ENDIF
-################################################################################
-# Begin Project
-# PROP Target_Last_Scanned "jpeg - Win32"
-CPP=cl.exe
-
-!IF "$(CFG)" == "jpeg - Win32"
-
-# PROP BASE Use_MFC 0
-# PROP BASE Use_Debug_Libraries 0
-# PROP BASE Output_Dir "Release"
-# PROP BASE Intermediate_Dir "Release"
-# PROP BASE Target_Dir ""
-# PROP Use_MFC 0
-# PROP Use_Debug_Libraries 0
-# PROP Output_Dir "Release"
-# PROP Intermediate_Dir "Release"
-# PROP Target_Dir ""
-OUTDIR=.\Release
-INTDIR=.\Release
-
-ALL : "$(OUTDIR)\jpeg.lib"
-
-CLEAN :
- -@erase "$(INTDIR)\jcapimin.obj"
- -@erase "$(INTDIR)\jcapistd.obj"
- -@erase "$(INTDIR)\jctrans.obj"
- -@erase "$(INTDIR)\jcparam.obj"
- -@erase "$(INTDIR)\jdatadst.obj"
- -@erase "$(INTDIR)\jcinit.obj"
- -@erase "$(INTDIR)\jcmaster.obj"
- -@erase "$(INTDIR)\jcmarker.obj"
- -@erase "$(INTDIR)\jcmainct.obj"
- -@erase "$(INTDIR)\jcprepct.obj"
- -@erase "$(INTDIR)\jccoefct.obj"
- -@erase "$(INTDIR)\jccolor.obj"
- -@erase "$(INTDIR)\jcsample.obj"
- -@erase "$(INTDIR)\jchuff.obj"
- -@erase "$(INTDIR)\jcphuff.obj"
- -@erase "$(INTDIR)\jcdctmgr.obj"
- -@erase "$(INTDIR)\jfdctfst.obj"
- -@erase "$(INTDIR)\jfdctflt.obj"
- -@erase "$(INTDIR)\jfdctint.obj"
- -@erase "$(INTDIR)\jdapimin.obj"
- -@erase "$(INTDIR)\jdapistd.obj"
- -@erase "$(INTDIR)\jdtrans.obj"
- -@erase "$(INTDIR)\jdatasrc.obj"
- -@erase "$(INTDIR)\jdmaster.obj"
- -@erase "$(INTDIR)\jdinput.obj"
- -@erase "$(INTDIR)\jdmarker.obj"
- -@erase "$(INTDIR)\jdhuff.obj"
- -@erase "$(INTDIR)\jdphuff.obj"
- -@erase "$(INTDIR)\jdmainct.obj"
- -@erase "$(INTDIR)\jdcoefct.obj"
- -@erase "$(INTDIR)\jdpostct.obj"
- -@erase "$(INTDIR)\jddctmgr.obj"
- -@erase "$(INTDIR)\jidctfst.obj"
- -@erase "$(INTDIR)\jidctflt.obj"
- -@erase "$(INTDIR)\jidctint.obj"
- -@erase "$(INTDIR)\jidctred.obj"
- -@erase "$(INTDIR)\jdsample.obj"
- -@erase "$(INTDIR)\jdcolor.obj"
- -@erase "$(INTDIR)\jquant1.obj"
- -@erase "$(INTDIR)\jquant2.obj"
- -@erase "$(INTDIR)\jdmerge.obj"
- -@erase "$(INTDIR)\jcomapi.obj"
- -@erase "$(INTDIR)\jutils.obj"
- -@erase "$(INTDIR)\jerror.obj"
- -@erase "$(INTDIR)\jmemmgr.obj"
- -@erase "$(INTDIR)\jmemnobs.obj"
- -@erase "$(OUTDIR)\jpeg.lib"
-
-"$(OUTDIR)" :
- if not exist "$(OUTDIR)/$(NULL)" mkdir "$(OUTDIR)"
-
-# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /YX /c
-# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /YX /c
-CPP_PROJ=/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS"\
- /Fp"$(INTDIR)/jpeg.pch" /YX /Fo"$(INTDIR)/" /c
-CPP_OBJS=.\Release/
-CPP_SBRS=.\.
-BSC32=bscmake.exe
-# ADD BASE BSC32 /nologo
-# ADD BSC32 /nologo
-BSC32_FLAGS=/nologo /o"$(OUTDIR)/jpeg.bsc"
-BSC32_SBRS= \
-
-LIB32=link.exe -lib
-# ADD BASE LIB32 /nologo
-# ADD LIB32 /nologo
-LIB32_FLAGS=/nologo /out:"$(OUTDIR)/jpeg.lib"
-LIB32_OBJS= \
- "$(INTDIR)\jcapimin.obj" \
- "$(INTDIR)\jcapistd.obj" \
- "$(INTDIR)\jctrans.obj" \
- "$(INTDIR)\jcparam.obj" \
- "$(INTDIR)\jdatadst.obj" \
- "$(INTDIR)\jcinit.obj" \
- "$(INTDIR)\jcmaster.obj" \
- "$(INTDIR)\jcmarker.obj" \
- "$(INTDIR)\jcmainct.obj" \
- "$(INTDIR)\jcprepct.obj" \
- "$(INTDIR)\jccoefct.obj" \
- "$(INTDIR)\jccolor.obj" \
- "$(INTDIR)\jcsample.obj" \
- "$(INTDIR)\jchuff.obj" \
- "$(INTDIR)\jcphuff.obj" \
- "$(INTDIR)\jcdctmgr.obj" \
- "$(INTDIR)\jfdctfst.obj" \
- "$(INTDIR)\jfdctflt.obj" \
- "$(INTDIR)\jfdctint.obj" \
- "$(INTDIR)\jdapimin.obj" \
- "$(INTDIR)\jdapistd.obj" \
- "$(INTDIR)\jdtrans.obj" \
- "$(INTDIR)\jdatasrc.obj" \
- "$(INTDIR)\jdmaster.obj" \
- "$(INTDIR)\jdinput.obj" \
- "$(INTDIR)\jdmarker.obj" \
- "$(INTDIR)\jdhuff.obj" \
- "$(INTDIR)\jdphuff.obj" \
- "$(INTDIR)\jdmainct.obj" \
- "$(INTDIR)\jdcoefct.obj" \
- "$(INTDIR)\jdpostct.obj" \
- "$(INTDIR)\jddctmgr.obj" \
- "$(INTDIR)\jidctfst.obj" \
- "$(INTDIR)\jidctflt.obj" \
- "$(INTDIR)\jidctint.obj" \
- "$(INTDIR)\jidctred.obj" \
- "$(INTDIR)\jdsample.obj" \
- "$(INTDIR)\jdcolor.obj" \
- "$(INTDIR)\jquant1.obj" \
- "$(INTDIR)\jquant2.obj" \
- "$(INTDIR)\jdmerge.obj" \
- "$(INTDIR)\jcomapi.obj" \
- "$(INTDIR)\jutils.obj" \
- "$(INTDIR)\jerror.obj" \
- "$(INTDIR)\jmemmgr.obj" \
- "$(INTDIR)\jmemnobs.obj"
-
-"$(OUTDIR)\jpeg.lib" : "$(OUTDIR)" $(DEF_FILE) $(LIB32_OBJS)
- $(LIB32) @<<
- $(LIB32_FLAGS) $(DEF_FLAGS) $(LIB32_OBJS)
-<<
-
-!ENDIF
-
-.c{$(CPP_OBJS)}.obj:
- $(CPP) $(CPP_PROJ) $<
-
-.cpp{$(CPP_OBJS)}.obj:
- $(CPP) $(CPP_PROJ) $<
-
-.cxx{$(CPP_OBJS)}.obj:
- $(CPP) $(CPP_PROJ) $<
-
-.c{$(CPP_SBRS)}.sbr:
- $(CPP) $(CPP_PROJ) $<
-
-.cpp{$(CPP_SBRS)}.sbr:
- $(CPP) $(CPP_PROJ) $<
-
-.cxx{$(CPP_SBRS)}.sbr:
- $(CPP) $(CPP_PROJ) $<
-
-################################################################################
-# Begin Target
-
-# Name "jpeg - Win32"
-
-!IF "$(CFG)" == "jpeg - Win32"
-
-!ENDIF
-
-################################################################################
-# Begin Source File
-
-SOURCE="jcapimin.c"
-DEP_CPP_JCAPI=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcapimin.obj" : $(SOURCE) $(DEP_CPP_JCAPI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcapistd.c"
-DEP_CPP_JCAPIS=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcapistd.obj" : $(SOURCE) $(DEP_CPP_JCAPIS) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jccoefct.c"
-DEP_CPP_JCCOE=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jccoefct.obj" : $(SOURCE) $(DEP_CPP_JCCOE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jccolor.c"
-DEP_CPP_JCCOL=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jccolor.obj" : $(SOURCE) $(DEP_CPP_JCCOL) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcdctmgr.c"
-DEP_CPP_JCDCT=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jcdctmgr.obj" : $(SOURCE) $(DEP_CPP_JCDCT) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jchuff.c"
-DEP_CPP_JCHUF=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jchuff.h"\
-
-
-"$(INTDIR)\jchuff.obj" : $(SOURCE) $(DEP_CPP_JCHUF) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcinit.c"
-DEP_CPP_JCINI=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcinit.obj" : $(SOURCE) $(DEP_CPP_JCINI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcmainct.c"
-DEP_CPP_JCMAI=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcmainct.obj" : $(SOURCE) $(DEP_CPP_JCMAI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcmarker.c"
-DEP_CPP_JCMAR=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcmarker.obj" : $(SOURCE) $(DEP_CPP_JCMAR) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcmaster.c"
-DEP_CPP_JCMAS=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcmaster.obj" : $(SOURCE) $(DEP_CPP_JCMAS) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcomapi.c"
-DEP_CPP_JCOMA=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcomapi.obj" : $(SOURCE) $(DEP_CPP_JCOMA) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcparam.c"
-DEP_CPP_JCPAR=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcparam.obj" : $(SOURCE) $(DEP_CPP_JCPAR) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcphuff.c"
-DEP_CPP_JCPHU=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jchuff.h"\
-
-
-"$(INTDIR)\jcphuff.obj" : $(SOURCE) $(DEP_CPP_JCPHU) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcprepct.c"
-DEP_CPP_JCPRE=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcprepct.obj" : $(SOURCE) $(DEP_CPP_JCPRE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jcsample.c"
-DEP_CPP_JCSAM=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jcsample.obj" : $(SOURCE) $(DEP_CPP_JCSAM) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jctrans.c"
-DEP_CPP_JCTRA=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jctrans.obj" : $(SOURCE) $(DEP_CPP_JCTRA) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdapimin.c"
-DEP_CPP_JDAPI=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdapimin.obj" : $(SOURCE) $(DEP_CPP_JDAPI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdapistd.c"
-DEP_CPP_JDAPIS=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdapistd.obj" : $(SOURCE) $(DEP_CPP_JDAPIS) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdatadst.c"
-DEP_CPP_JDATA=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdatadst.obj" : $(SOURCE) $(DEP_CPP_JDATA) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdatasrc.c"
-DEP_CPP_JDATAS=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdatasrc.obj" : $(SOURCE) $(DEP_CPP_JDATAS) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdcoefct.c"
-DEP_CPP_JDCOE=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdcoefct.obj" : $(SOURCE) $(DEP_CPP_JDCOE) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdcolor.c"
-DEP_CPP_JDCOL=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdcolor.obj" : $(SOURCE) $(DEP_CPP_JDCOL) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jddctmgr.c"
-DEP_CPP_JDDCT=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jddctmgr.obj" : $(SOURCE) $(DEP_CPP_JDDCT) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdhuff.c"
-DEP_CPP_JDHUF=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdhuff.h"\
-
-
-"$(INTDIR)\jdhuff.obj" : $(SOURCE) $(DEP_CPP_JDHUF) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdinput.c"
-DEP_CPP_JDINP=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdinput.obj" : $(SOURCE) $(DEP_CPP_JDINP) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmainct.c"
-DEP_CPP_JDMAI=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdmainct.obj" : $(SOURCE) $(DEP_CPP_JDMAI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmarker.c"
-DEP_CPP_JDMAR=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdmarker.obj" : $(SOURCE) $(DEP_CPP_JDMAR) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmaster.c"
-DEP_CPP_JDMAS=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdmaster.obj" : $(SOURCE) $(DEP_CPP_JDMAS) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdmerge.c"
-DEP_CPP_JDMER=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdmerge.obj" : $(SOURCE) $(DEP_CPP_JDMER) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdphuff.c"
-DEP_CPP_JDPHU=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdhuff.h"\
-
-
-"$(INTDIR)\jdphuff.obj" : $(SOURCE) $(DEP_CPP_JDPHU) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdpostct.c"
-DEP_CPP_JDPOS=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdpostct.obj" : $(SOURCE) $(DEP_CPP_JDPOS) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdsample.c"
-DEP_CPP_JDSAM=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdsample.obj" : $(SOURCE) $(DEP_CPP_JDSAM) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jdtrans.c"
-DEP_CPP_JDTRA=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jdtrans.obj" : $(SOURCE) $(DEP_CPP_JDTRA) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jerror.c"
-DEP_CPP_JERRO=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jversion.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jerror.obj" : $(SOURCE) $(DEP_CPP_JERRO) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jfdctflt.c"
-DEP_CPP_JFDCT=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jfdctflt.obj" : $(SOURCE) $(DEP_CPP_JFDCT) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jfdctfst.c"
-DEP_CPP_JFDCTF=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jfdctfst.obj" : $(SOURCE) $(DEP_CPP_JFDCTF) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jfdctint.c"
-DEP_CPP_JFDCTI=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jfdctint.obj" : $(SOURCE) $(DEP_CPP_JFDCTI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctflt.c"
-DEP_CPP_JIDCT=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jidctflt.obj" : $(SOURCE) $(DEP_CPP_JIDCT) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctfst.c"
-DEP_CPP_JIDCTF=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jidctfst.obj" : $(SOURCE) $(DEP_CPP_JIDCTF) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctint.c"
-DEP_CPP_JIDCTI=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jidctint.obj" : $(SOURCE) $(DEP_CPP_JIDCTI) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jidctred.c"
-DEP_CPP_JIDCTR=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jdct.h"\
-
-
-"$(INTDIR)\jidctred.obj" : $(SOURCE) $(DEP_CPP_JIDCTR) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jquant1.c"
-DEP_CPP_JQUAN=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jquant1.obj" : $(SOURCE) $(DEP_CPP_JQUAN) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jquant2.c"
-DEP_CPP_JQUANT=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jquant2.obj" : $(SOURCE) $(DEP_CPP_JQUANT) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jutils.c"
-DEP_CPP_JUTIL=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
-
-
-"$(INTDIR)\jutils.obj" : $(SOURCE) $(DEP_CPP_JUTIL) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jmemmgr.c"
-DEP_CPP_JMEMM=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jmemsys.h"\
-
-
-"$(INTDIR)\jmemmgr.obj" : $(SOURCE) $(DEP_CPP_JMEMM) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-################################################################################
-# Begin Source File
-
-SOURCE="jmemnobs.c"
-DEP_CPP_JMEMN=\
- "jinclude.h"\
- "jconfig.h"\
- "jpeglib.h"\
- "jmorecfg.h"\
- "jpegint.h"\
- "jerror.h"\
- "jmemsys.h"\
-
-
-"$(INTDIR)\jmemnobs.obj" : $(SOURCE) $(DEP_CPP_JMEMN) "$(INTDIR)"
- $(CPP) $(CPP_PROJ) $(SOURCE)
-
-
-# End Source File
-# End Target
-# End Project
-################################################################################
-
diff --git a/makeproj.mac b/makeproj.mac
deleted file mode 100644
index ed277c8..0000000
--- a/makeproj.mac
+++ /dev/null
@@ -1,213 +0,0 @@
---
--- makeproj.mac
---
--- This AppleScript builds Code Warrior PRO Release 2 project files for the
--- libjpeg library as well as the test programs 'cjpeg', 'djpeg', 'jpegtran'.
--- (We'd distribute real project files, except they're not text
--- and would create maintenance headaches.)
---
--- The script then compiles and links the library and the test programs.
--- NOTE: if you haven't already created a 'jconfig.h' file, the script
--- automatically copies 'jconfig.mac' to 'jconfig.h'.
---
--- To use this script, you must have AppleScript 1.1 or later installed
--- and a suitable AppleScript editor like Script Editor or Script Debugger
--- (http://www.latenightsw.com). Open this file with your AppleScript
--- editor and execute the "run" command to build the projects.
---
--- Thanks to Dan Sears and Don Agro for this script.
--- Questions about this script can be addressed to dogpark@interlog.com
---
-
-on run
-
- choose folder with prompt ">>> Select IJG source folder <<<"
- set ijg_folder to result
-
- choose folder with prompt ">>> Select MetroWerks folder <<<"
- set cw_folder to result
-
- -- if jconfig.h doesn't already exist, copy jconfig.mac
-
- tell application "Finder"
- if not (exists file "jconfig.h" of ijg_folder) then
- duplicate {file "jconfig.mac" of folder ijg_folder}
- select file "jconfig.mac copy" of folder ijg_folder
- set name of selection to "jconfig.h"
- end if
- end tell
-
- tell application "CodeWarrior IDE 2.1"
- with timeout of 10000 seconds
-
- -- create libjpeg project
-
- activate
- Create Project (ijg_folder as string) & "libjpeg.proj"
- Set Preferences of panel "Target Settings" to {Target Name:"libjpeg"}
- Set Preferences of panel "PPC Project" to {File Name:"libjpeg"}
- Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
- Set Preferences of panel "PPC Project" to {Project Type:library}
- Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
- Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
- Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
- Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
- Add Files (ijg_folder as string) & "jcapimin.c" To Segment 1
- Add Files (ijg_folder as string) & "jcapistd.c" To Segment 1
- Add Files (ijg_folder as string) & "jctrans.c" To Segment 1
- Add Files (ijg_folder as string) & "jcparam.c" To Segment 1
- Add Files (ijg_folder as string) & "jdatadst.c" To Segment 1
- Add Files (ijg_folder as string) & "jcinit.c" To Segment 1
- Add Files (ijg_folder as string) & "jcmaster.c" To Segment 1
- Add Files (ijg_folder as string) & "jcmarker.c" To Segment 1
- Add Files (ijg_folder as string) & "jcmainct.c" To Segment 1
- Add Files (ijg_folder as string) & "jcprepct.c" To Segment 1
- Add Files (ijg_folder as string) & "jccoefct.c" To Segment 1
- Add Files (ijg_folder as string) & "jccolor.c" To Segment 1
- Add Files (ijg_folder as string) & "jcsample.c" To Segment 1
- Add Files (ijg_folder as string) & "jchuff.c" To Segment 1
- Add Files (ijg_folder as string) & "jcphuff.c" To Segment 1
- Add Files (ijg_folder as string) & "jcdctmgr.c" To Segment 1
- Add Files (ijg_folder as string) & "jfdctfst.c" To Segment 1
- Add Files (ijg_folder as string) & "jfdctflt.c" To Segment 1
- Add Files (ijg_folder as string) & "jfdctint.c" To Segment 1
- Add Files (ijg_folder as string) & "jdapimin.c" To Segment 1
- Add Files (ijg_folder as string) & "jdapistd.c" To Segment 1
- Add Files (ijg_folder as string) & "jdtrans.c" To Segment 1
- Add Files (ijg_folder as string) & "jdatasrc.c" To Segment 1
- Add Files (ijg_folder as string) & "jdmaster.c" To Segment 1
- Add Files (ijg_folder as string) & "jdinput.c" To Segment 1
- Add Files (ijg_folder as string) & "jdmarker.c" To Segment 1
- Add Files (ijg_folder as string) & "jdhuff.c" To Segment 1
- Add Files (ijg_folder as string) & "jdphuff.c" To Segment 1
- Add Files (ijg_folder as string) & "jdmainct.c" To Segment 1
- Add Files (ijg_folder as string) & "jdcoefct.c" To Segment 1
- Add Files (ijg_folder as string) & "jdpostct.c" To Segment 1
- Add Files (ijg_folder as string) & "jddctmgr.c" To Segment 1
- Add Files (ijg_folder as string) & "jidctfst.c" To Segment 1
- Add Files (ijg_folder as string) & "jidctflt.c" To Segment 1
- Add Files (ijg_folder as string) & "jidctint.c" To Segment 1
- Add Files (ijg_folder as string) & "jidctred.c" To Segment 1
- Add Files (ijg_folder as string) & "jdsample.c" To Segment 1
- Add Files (ijg_folder as string) & "jdcolor.c" To Segment 1
- Add Files (ijg_folder as string) & "jquant1.c" To Segment 1
- Add Files (ijg_folder as string) & "jquant2.c" To Segment 1
- Add Files (ijg_folder as string) & "jdmerge.c" To Segment 1
- Add Files (ijg_folder as string) & "jcomapi.c" To Segment 1
- Add Files (ijg_folder as string) & "jutils.c" To Segment 1
- Add Files (ijg_folder as string) & "jerror.c" To Segment 1
- Add Files (ijg_folder as string) & "jmemmgr.c" To Segment 1
- Add Files (ijg_folder as string) & "jmemmac.c" To Segment 1
-
- -- compile and link the library
-
- Make Project
- Close Project
-
- -- create cjpeg project
-
- activate
- Create Project (ijg_folder as string) & "cjpeg.proj"
- Set Preferences of panel "Target Settings" to {Target Name:"cjpeg"}
- Set Preferences of panel "PPC Project" to {File Name:"cjpeg"}
- Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
- Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
- Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
- Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
- Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
- Add Files (ijg_folder as string) & "cjpeg.c" To Segment 1
- Add Files (ijg_folder as string) & "rdppm.c" To Segment 1
- Add Files (ijg_folder as string) & "rdgif.c" To Segment 1
- Add Files (ijg_folder as string) & "rdtarga.c" To Segment 1
- Add Files (ijg_folder as string) & "rdrle.c" To Segment 1
- Add Files (ijg_folder as string) & "rdbmp.c" To Segment 1
- Add Files (ijg_folder as string) & "rdswitch.c" To Segment 1
- Add Files (ijg_folder as string) & "cdjpeg.c" To Segment 1
-
- Add Files (ijg_folder as string) & "libjpeg" To Segment 2
-
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL C.PPC.Lib" To Segment 3
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL SIOUX.PPC.Lib" To Segment 3
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:Runtime:Runtime PPC:MSL RuntimePPC.Lib" To Segment 3
-
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:InterfaceLib" To Segment 4
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:MathLib" To Segment 4
-
- -- compile and link cjpeg
-
- Make Project
- Close Project
-
- -- create djpeg project
-
- activate
- Create Project (ijg_folder as string) & "djpeg.proj"
- Set Preferences of panel "Target Settings" to {Target Name:"djpeg"}
- Set Preferences of panel "PPC Project" to {File Name:"djpeg"}
- Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
- Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
- Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
- Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
- Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
- Add Files (ijg_folder as string) & "djpeg.c" To Segment 1
- Add Files (ijg_folder as string) & "wrppm.c" To Segment 1
- Add Files (ijg_folder as string) & "wrgif.c" To Segment 1
- Add Files (ijg_folder as string) & "wrtarga.c" To Segment 1
- Add Files (ijg_folder as string) & "wrrle.c" To Segment 1
- Add Files (ijg_folder as string) & "wrbmp.c" To Segment 1
- Add Files (ijg_folder as string) & "rdcolmap.c" To Segment 1
- Add Files (ijg_folder as string) & "cdjpeg.c" To Segment 1
-
- Add Files (ijg_folder as string) & "libjpeg" To Segment 2
-
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL C.PPC.Lib" To Segment 3
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL SIOUX.PPC.Lib" To Segment 3
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:Runtime:Runtime PPC:MSL RuntimePPC.Lib" To Segment 3
-
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:InterfaceLib" To Segment 4
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:MathLib" To Segment 4
-
- -- compile and link djpeg
-
- Make Project
- Close Project
-
- -- create jpegtran project
-
- activate
- Create Project (ijg_folder as string) & "jpegtran.proj"
- Set Preferences of panel "Target Settings" to {Target Name:"jpegtran"}
- Set Preferences of panel "PPC Project" to {File Name:"jpegtran"}
- Set Preferences of panel "Target Settings" to {Linker:"MacOS PPC Linker"}
- Set Preferences of panel "C/C++ Compiler" to {ANSI Strict:true}
- Set Preferences of panel "C/C++ Compiler" to {Enums Always Ints:true}
- Set Preferences of panel "PPC Codegen" to {Struct Alignment:PowerPC}
- Set Preferences of panel "PPC Linker" to {Generate SYM File:false}
-
- Add Files (ijg_folder as string) & "jpegtran.c" To Segment 1
- Add Files (ijg_folder as string) & "rdswitch.c" To Segment 1
- Add Files (ijg_folder as string) & "cdjpeg.c" To Segment 1
- Add Files (ijg_folder as string) & "transupp.c" To Segment 1
-
- Add Files (ijg_folder as string) & "libjpeg" To Segment 2
-
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL C.PPC.Lib" To Segment 3
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:Metrowerks Standard Library:MSL C:Bin:MSL SIOUX.PPC.Lib" To Segment 3
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:Runtime:Runtime PPC:MSL RuntimePPC.Lib" To Segment 3
-
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:InterfaceLib" To Segment 4
- Add Files (cw_folder as string) & "Metrowerks CodeWarrior:MacOS Support:Libraries:MacOS Common:MathLib" To Segment 4
-
- -- compile and link jpegtran
-
- Make Project
- Close Project
-
- quit
-
- end timeout
- end tell
-end run
diff --git a/makljpeg.st b/makljpeg.st
deleted file mode 100644
index 813493e..0000000
--- a/makljpeg.st
+++ /dev/null
@@ -1,70 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to libjpeg.prj.
-; Read installation instructions before trying to make the program!
-;
-;
-; * * * Output file * * *
-libjpeg.lib
-;
-; * * * COMPILER OPTIONS * * *
-.C[-P] ; absolute calls
-.C[-M] ; and no string merging, folks
-.C[-w-cln] ; no "constant is long" warnings
-.C[-w-par] ; no "parameter xxxx unused"
-.C[-w-rch] ; no "unreachable code"
-.C[-wsig] ; warn if significant digits may be lost
-.L[-J] ; link new Obj-format (so we get a library)
-=
-; * * * * List of modules * * * *
-jcapimin.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcapistd.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jccoefct.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jccolor.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcdctmgr.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jchuff.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jchuff.h)
-jcinit.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcmainct.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcmarker.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcmaster.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcomapi.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcparam.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcphuff.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jchuff.h)
-jcprepct.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jcsample.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jctrans.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdapimin.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdapistd.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdatadst.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h)
-jdatasrc.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h)
-jdcoefct.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdcolor.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jddctmgr.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jdhuff.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdhuff.h)
-jdinput.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmainct.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmarker.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmaster.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdmerge.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdphuff.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdhuff.h)
-jdpostct.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdsample.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jdtrans.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jerror.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jversion.h,jerror.h)
-jfdctflt.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jfdctfst.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jfdctint.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctflt.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctfst.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctint.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jidctred.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jdct.h)
-jquant1.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jquant2.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jutils.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h)
-jmemmgr.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jmemsys.h)
-jmemansi.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,jmemsys.h)
diff --git a/maktjpeg.st b/maktjpeg.st
deleted file mode 100644
index 31f4d16..0000000
--- a/maktjpeg.st
+++ /dev/null
@@ -1,32 +0,0 @@
-; Project file for Independent JPEG Group's software
-;
-; This project file is for Atari ST/STE/TT systems using Pure C or Turbo C.
-; Thanks to Frank Moehle (Frank.Moehle@arbi.informatik.uni-oldenburg.de),
-; Dr. B. Setzepfandt (bernd@gina.uni-muenster.de),
-; and Guido Vollbeding (guivol@esc.de).
-;
-; To use this file, rename it to jpegtran.prj.
-; If you are using Turbo C, change filenames beginning with "pc..." to "tc..."
-; Read installation instructions before trying to make the program!
-;
-;
-; * * * Output file * * *
-jpegtran.ttp
-;
-; * * * COMPILER OPTIONS * * *
-.C[-P] ; absolute calls
-.C[-M] ; and no string merging, folks
-.C[-w-cln] ; no "constant is long" warnings
-.C[-w-par] ; no "parameter xxxx unused"
-.C[-w-rch] ; no "unreachable code"
-.C[-wsig] ; warn if significant digits may be lost
-=
-; * * * * List of modules * * * *
-pcstart.o
-jpegtran.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h,transupp.h,jversion.h)
-cdjpeg.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-rdswitch.c (cdjpeg.h,jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jerror.h,cderror.h)
-transupp.c (jinclude.h,jconfig.h,jpeglib.h,jmorecfg.h,jpegint.h,jerror.h,transupp.h)
-libjpeg.lib ; built by libjpeg.prj
-pcstdlib.lib ; standard library
-pcextlib.lib ; extended library
diff --git a/makvms.opt b/makvms.opt
deleted file mode 100644
index 675e8fe..0000000
--- a/makvms.opt
+++ /dev/null
@@ -1,4 +0,0 @@
-! A pointer to the VAX/VMS C Run-Time Shareable Library.
-! This file is needed by makefile.mms and makefile.vms,
-! but only for the older VAX C compiler. DEC C does not need it.
-Sys$Library:VAXCRTL.EXE /Share
diff --git a/rrtimer.h b/rrtimer.h
new file mode 100644
index 0000000..4db5e37
--- /dev/null
+++ b/rrtimer.h
@@ -0,0 +1,114 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
+#ifndef __RRTIMER_H__
+#define __RRTIMER_H__
+
+#ifdef __cplusplus
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+class rrtimer
+{
+	public:
+		// Create a timer whose reference time is 0; on Win32 also pick the clock source.
+		rrtimer(void) : t1(0.0)
+		{
+			#ifdef _WIN32
+			LARGE_INTEGER freq;
+			// Start with the scale factor used for GetTickCount(), then switch
+			// to the performance counter if a frequency is reported.
+			highres=false;  tick=0.001;
+			if(QueryPerformanceFrequency(&freq))
+			{
+				highres=true;  tick=1.0/(double)freq.QuadPart;
+			}
+			#endif
+		}
+		// Remember the current time as the reference point for elapsed().
+		void start(void)
+		{
+			t1=time();
+		}
+		// Current time in seconds, read from the clock source for this platform.
+		double time(void)
+		{
+			#ifdef _WIN32
+			if(!highres)
+				return tick*(double)GetTickCount();
+			LARGE_INTEGER now;
+			QueryPerformanceCounter(&now);
+			return tick*(double)now.QuadPart;
+			#else
+			struct timeval tv;
+			gettimeofday(&tv, NULL);
+			return (double)tv.tv_sec+(double)tv.tv_usec*0.000001;
+			#endif
+		}
+		// Seconds between the last start() (t1 is 0 before any start()) and now.
+		double elapsed(void)
+		{
+			const double now=time();
+			return now-t1;
+		}
+
+	private:
+
+		#ifdef _WIN32
+		bool highres;   // true when QueryPerformanceCounter() is the clock source
+		double tick;    // seconds per unit of the selected clock
+		#endif
+		double t1;      // reference time recorded by start()
+};
+
+#endif // __cplusplus
+
+#ifdef _WIN32
+
+#include <windows.h>
+
+static __inline double rrtime(void)  /* current time in seconds; static so each including file gets a private copy */
+{
+	LARGE_INTEGER Frequency, Time;
+	if(QueryPerformanceFrequency(&Frequency)!=0)  /* nonzero: a performance counter is available */
+	{
+		QueryPerformanceCounter(&Time);
+		return (double)Time.QuadPart/(double)Frequency.QuadPart;  /* counter ticks / ticks per second */
+	}
+	else return (double)GetTickCount()*0.001;  /* fallback: tick count scaled by 0.001 */
+}
+
+#else
+
+#include <sys/time.h>
+
+#ifdef sun
+#define __inline inline
+#endif
+
+static __inline double rrtime(void)  /* current time of day, in seconds */
+{
+	struct timeval now;
+	gettimeofday(&now, NULL);
+	return (double)now.tv_sec+(double)now.tv_usec*0.000001;  /* whole seconds + microseconds */
+}
+
+#endif
+
+#endif
+
diff --git a/rrutil.h b/rrutil.h
new file mode 100644
index 0000000..4918120
--- /dev/null
+++ b/rrutil.h
@@ -0,0 +1,81 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
+#ifndef __RRUTIL_H__
+#define __RRUTIL_H__
+
+#ifdef _WIN32
+	#include <windows.h>
+	#define sleep(t) Sleep((t)*1000)  /* argument is multiplied by 1000 before being passed to Sleep() */
+	#define usleep(t) Sleep((t)/1000)  /* argument is divided by 1000; with integer operands the remainder is dropped */
+#else
+	#include <unistd.h>
+	#define stricmp strcasecmp  /* outside Win32, map the stricmp/strnicmp spellings onto strcasecmp/strncasecmp */
+	#define strnicmp strncasecmp
+#endif
+
+#ifndef min
+	#define min(a,b) ((a)<(b)?(a):(b))  /* smaller of a and b; the selected argument is evaluated twice */
+#endif
+
+#ifndef max
+	#define max(a,b) ((a)>(b)?(a):(b))  /* larger of a and b; the selected argument is evaluated twice */
+#endif
+/* pow2(i): 1 shifted left by i.  isPow2(x): nonzero when x has at most one bit set (so 0 also passes). */
+#define pow2(i) (1<<(i))
+#define isPow2(x) (((x)&((x)-1))==0)
+
+#ifdef sgi
+#define _SC_NPROCESSORS_CONF _SC_NPROC_CONF
+#endif
+
+#ifdef sun
+#define __inline inline
+#endif
+
+static __inline int numprocs(void)  /* processor count: affinity-mask bits on Win32, sysconf() elsewhere; 1 when it cannot be determined */
+{
+	#ifdef _WIN32
+	DWORD_PTR ProcAff, SysAff;  unsigned int i;  int count=0;  /* the API writes pointer-sized masks (PDWORD_PTR) */
+	if(!GetProcessAffinityMask(GetCurrentProcess(), &ProcAff, &SysAff)) return(1);
+	for(i=0; i<sizeof(ProcAff)*8; i++) if(ProcAff&((DWORD_PTR)1<<i)) count++;  /* unsigned shift over the whole mask width */
+	return(count);
+	#elif defined (__APPLE__)
+	return(1);  /* no query is attempted on this platform */
+	#else
+	long count=sysconf(_SC_NPROCESSORS_CONF);
+	if(count<1) return(1);  /* -1 signals failure; never report fewer than one processor */
+	return((int)count);
+	#endif
+}
+
+#define byteswap(i) ( /* reverse the order of the four low bytes of i */ \
+	(((i) & 0xff000000) >> 24) | \
+	(((i) & 0x00ff0000) >>  8) | \
+	(((i) & 0x0000ff00) <<  8) | \
+	(((i) & 0x000000ffU) << 24) ) /* U suffix: the shift into bit 31 is done on an unsigned value */
+
+#define byteswap16(i) ( /* exchange the two low bytes of i */ \
+	(((i) & 0xff00) >> 8) | \
+	(((i) & 0x00ff) << 8) )
+
+static __inline int littleendian(void)  /* 1 if an unsigned int keeps its low-order byte first in memory, else 0 */
+{
+	const unsigned int probe=1;
+	const unsigned char *bytes=(const unsigned char *)&probe;
+	/* little-endian layout: byte 0 holds the 1 and byte 3 is clear */
+	return (bytes[0]==1 && bytes[3]==0) ? 1 : 0;
+}
+
+#endif
diff --git a/simd/Makefile.am b/simd/Makefile.am
new file mode 100644
index 0000000..1df6143
--- /dev/null
+++ b/simd/Makefile.am
@@ -0,0 +1,57 @@
+noinst_LTLIBRARIES = libsimd.la
+# jsimdcfg.inc is produced by the jsimdcfg.inc rule at the bottom of this file.
+BUILT_SOURCES = jsimdcfg.inc
+# Files the rules below use (nasm_lt.sh wrapper, .asm prerequisites) that are not in libsimd_la_SOURCES.
+EXTRA_DIST = nasm_lt.sh jcclrmmx.asm jcclrss2.asm jdclrmmx.asm jdclrss2.asm \
+	jdmrgmmx.asm jdmrgss2.asm jcclrss2-64.asm jdclrss2-64.asm \
+	jdmrgss2-64.asm
+
+if SIMD_X86_64
+# SIMD_X86_64 build: jsimd_x86_64.c, the shared headers/includes, and the *-64.asm modules.
+libsimd_la_SOURCES = jsimd_x86_64.c \
+	jsimd.h jsimdcfg.inc.h \
+	jsimdext.inc jcolsamp.inc jdct.inc \
+	jfsseflt-64.asm \
+	jccolss2-64.asm jdcolss2-64.asm \
+	jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
+	jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
+	jiss2red-64.asm jiss2int-64.asm jiss2fst-64.asm \
+	jcqnts2f-64.asm jiss2flt-64.asm
+# Extra prerequisites from EXTRA_DIST (presumably %included by the matching .asm source -- confirm).
+jccolss2-64.lo: jcclrss2-64.asm
+jdcolss2-64.lo: jdclrss2-64.asm
+jdmerss2-64.lo: jdmrgss2-64.asm
+endif
+
+if SIMD_I386
+# SIMD_I386 build: jsimd_i386.c, the shared headers/includes, jsimdcpu.asm, and the mmx/3dn/sse/ss2 .asm modules.
+libsimd_la_SOURCES = jsimd_i386.c \
+	jsimd.h jsimdcfg.inc.h \
+	jsimdext.inc jcolsamp.inc jdct.inc \
+	jsimdcpu.asm \
+	jccolmmx.asm jdcolmmx.asm \
+	jcsammmx.asm jdsammmx.asm jdmermmx.asm \
+	jcqntmmx.asm jfmmxfst.asm jfmmxint.asm \
+	jimmxred.asm jimmxint.asm jimmxfst.asm \
+	jcqnt3dn.asm jf3dnflt.asm ji3dnflt.asm \
+	jcqntsse.asm jfsseflt.asm jisseflt.asm \
+	jccolss2.asm jdcolss2.asm \
+	jcsamss2.asm jdsamss2.asm jdmerss2.asm \
+	jcqnts2i.asm jfss2fst.asm jfss2int.asm \
+	jiss2red.asm jiss2int.asm jiss2fst.asm \
+	jcqnts2f.asm jiss2flt.asm
+# Extra prerequisites from EXTRA_DIST (presumably %included by the matching .asm source -- confirm).
+jccolmmx.lo: jcclrmmx.asm
+jccolss2.lo: jcclrss2.asm
+jdcolmmx.lo: jdclrmmx.asm
+jdcolss2.lo: jdclrss2.asm
+jdmermmx.lo: jdmrgmmx.asm
+jdmerss2.lo: jdmrgss2.asm
+endif
+# Assemble each .asm source into a libtool object by running $(NASM) through the nasm_lt.sh wrapper.
+.asm.lo:
+	$(LIBTOOL) --mode=compile --tag NASM ./nasm_lt.sh $(NASM) $(NAFLAGS) $< -o $@
+# Preprocess jsimdcfg.inc.h and keep only lines starting with ';' or '%'; the pattern is quoted so the shell passes it through untouched.
+jsimdcfg.inc: jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h
+	$(CPP) jsimdcfg.inc.h | $(EGREP) '^[;%]' | sed 's%_cpp_protection_%%' > $@
+
diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm
new file mode 100644
index 0000000..b6b8912
--- /dev/null
+++ b/simd/jcclrmmx.asm
@@ -0,0 +1,479 @@
+;
+; jcclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
+; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+; JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION img_width
+%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b)+20 ; JDIMENSION output_row
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx,ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax,eax
+ jle near .return
+ alignx 16,7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop
+ alignx 16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax,eax
+ mov al, BYTE [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx,edx
+ mov dx, WORD [esi+ecx]
+ shl eax, WORD_BIT
+ or eax,edx
+.column_ld4:
+ movd mmA,eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, DWORD [esi+ecx]
+ psllq mmA, DWORD_BIT
+ por mmA,mmG
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ movq mmG,mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ mov ecx, SIZEOF_MMWORD
+ jmp short .rgb_ycc_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_ycc_cnv
+ movq mmF,mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+
+ movq mmD,mmA
+ psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE,mmA
+ psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH,mmH
+
+ movq mmC,mmA
+ punpcklbw mmA,mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC,mmH ; mmC=(10 12 14 16)
+
+ movq mmB,mmE
+ punpcklbw mmE,mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB,mmH ; mmB=(01 03 05 07)
+
+ movq mmF,mmD
+ punpcklbw mmD,mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF,mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_MMWORD/8
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF,mmA
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_ycc_cnv
+ movq mmD,mmA
+ movq mmC,mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+
+ movq mmB,mmA
+ punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG,mmD
+ punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE,mmA
+ punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH,mmB
+ punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF,mmF
+
+ movq mmC,mmA
+ punpcklbw mmA,mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC,mmF ; mmC=(10 12 14 16)
+
+ movq mmD,mmB
+ punpcklbw mmB,mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD,mmF ; mmD=(11 13 15 17)
+
+ movq mmG,mmE
+ punpcklbw mmE,mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG,mmF ; mmG=(30 32 34 36)
+
+ punpcklbw mmF,mmH
+ punpckhbw mmH,mmH
+ psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movq MMWORD [wk(0)], mm0 ; wk(0)=RE
+ movq MMWORD [wk(1)], mm1 ; wk(1)=RO
+ movq MMWORD [wk(2)], mm4 ; wk(2)=BE
+ movq MMWORD [wk(3)], mm5 ; wk(3)=BO
+
+ movq mm6,mm1
+ punpcklwd mm1,mm3
+ punpckhwd mm6,mm3
+ movq mm7,mm1
+ movq mm4,mm6
+ pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor mm1,mm1
+ pxor mm6,mm6
+ punpcklwd mm1,mm5 ; mm1=BOL
+ punpckhwd mm6,mm5 ; mm6=BOH
+ psrld mm1,1 ; mm1=BOL*FIX(0.500)
+ psrld mm6,1 ; mm6=BOH*FIX(0.500)
+
+ movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+ paddd mm7,mm1
+ paddd mm4,mm6
+ paddd mm7,mm5
+ paddd mm4,mm5
+ psrld mm7,SCALEBITS ; mm7=CbOL
+ psrld mm4,SCALEBITS ; mm4=CbOH
+ packssdw mm7,mm4 ; mm7=CbO
+
+ movq mm1, MMWORD [wk(2)] ; mm1=BE
+
+ movq mm6,mm0
+ punpcklwd mm0,mm2
+ punpckhwd mm6,mm2
+ movq mm5,mm0
+ movq mm4,mm6
+ pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor mm0,mm0
+ pxor mm6,mm6
+ punpcklwd mm0,mm1 ; mm0=BEL
+ punpckhwd mm6,mm1 ; mm6=BEH
+ psrld mm0,1 ; mm0=BEL*FIX(0.500)
+ psrld mm6,1 ; mm6=BEH*FIX(0.500)
+
+ movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm5,mm0
+ paddd mm4,mm6
+ paddd mm5,mm1
+ paddd mm4,mm1
+ psrld mm5,SCALEBITS ; mm5=CbEL
+ psrld mm4,SCALEBITS ; mm4=CbEH
+ packssdw mm5,mm4 ; mm5=CbE
+
+ psllw mm7,BYTE_BIT
+ por mm5,mm7 ; mm5=Cb
+ movq MMWORD [ebx], mm5 ; Save Cb
+
+ movq mm0, MMWORD [wk(3)] ; mm0=BO
+ movq mm6, MMWORD [wk(2)] ; mm6=BE
+ movq mm1, MMWORD [wk(1)] ; mm1=RO
+
+ movq mm4,mm0
+ punpcklwd mm0,mm3
+ punpckhwd mm4,mm3
+ movq mm7,mm0
+ movq mm5,mm4
+ pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, MMWORD [wk(4)]
+ paddd mm4, MMWORD [wk(5)]
+ paddd mm0,mm3
+ paddd mm4,mm3
+ psrld mm0,SCALEBITS ; mm0=YOL
+ psrld mm4,SCALEBITS ; mm4=YOH
+ packssdw mm0,mm4 ; mm0=YO
+
+ pxor mm3,mm3
+ pxor mm4,mm4
+ punpcklwd mm3,mm1 ; mm3=ROL
+ punpckhwd mm4,mm1 ; mm4=ROH
+ psrld mm3,1 ; mm3=ROL*FIX(0.500)
+ psrld mm4,1 ; mm4=ROH*FIX(0.500)
+
+ movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm7,mm3
+ paddd mm5,mm4
+ paddd mm7,mm1
+ paddd mm5,mm1
+ psrld mm7,SCALEBITS ; mm7=CrOL
+ psrld mm5,SCALEBITS ; mm5=CrOH
+ packssdw mm7,mm5 ; mm7=CrO
+
+ movq mm3, MMWORD [wk(0)] ; mm3=RE
+
+ movq mm4,mm6
+ punpcklwd mm6,mm2
+ punpckhwd mm4,mm2
+ movq mm1,mm6
+ movq mm5,mm4
+ pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(6)]
+ paddd mm4, MMWORD [wk(7)]
+ paddd mm6,mm2
+ paddd mm4,mm2
+ psrld mm6,SCALEBITS ; mm6=YEL
+ psrld mm4,SCALEBITS ; mm4=YEH
+ packssdw mm6,mm4 ; mm6=YE
+
+ psllw mm0,BYTE_BIT
+ por mm6,mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y
+
+ pxor mm2,mm2
+ pxor mm4,mm4
+ punpcklwd mm2,mm3 ; mm2=REL
+ punpckhwd mm4,mm3 ; mm4=REH
+ psrld mm2,1 ; mm2=REL*FIX(0.500)
+ psrld mm4,1 ; mm4=REH*FIX(0.500)
+
+ movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+ paddd mm1,mm2
+ paddd mm5,mm4
+ paddd mm1,mm0
+ paddd mm5,mm0
+ psrld mm1,SCALEBITS ; mm1=CrEL
+ psrld mm5,SCALEBITS ; mm5=CrEH
+ packssdw mm1,mm5 ; mm1=CrE
+
+ psllw mm7,BYTE_BIT
+ por mm1,mm7 ; mm1=Cr
+ movq MMWORD [edx], mm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_MMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ add ebx, byte SIZEOF_MMWORD ; outptr1
+ add edx, byte SIZEOF_MMWORD ; outptr2
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop
+ test ecx,ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcclrss2-64.asm b/simd/jcclrss2-64.asm
new file mode 100644
index 0000000..31c5be6
--- /dev/null
+++ b/simd/jcclrss2-64.asm
@@ -0,0 +1,487 @@
+;
+; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+; JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+
+ align 16
+
+ global EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push rbp
+ mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; stash the unaligned rsp; .return pops it back
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; rsp = bottom of the wk[] scratch area
+ push rbx
+ collect_args ; args -> r10..r14 (macro defined outside this file)
+ ; img_width/output_row/num_rows are 32-bit C values; bits 63:32 of their registers are undefined.
+ mov ecx, r10d ; rcx = img_width, zero-extended
+ test rcx,rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d ; rcx = output_row, zero-extended
+ mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; rdi = &output_buf[0][output_row]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] ; rbx = &output_buf[1][output_row]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] ; rdx = &output_buf[2][output_row]
+
+ pop rcx ; rcx = img_width again
+
+ mov rsi, r11 ; rsi = input_buf
+ mov eax, r14d ; rax = num_rows (low dword only, zero-extended)
+ test eax,eax ; signed 32-bit test: zero or negative num_rows returns at once
+ jle near .return
+.rowloop:
+ push rdx ; save the row cursors and the column count for this row;
+ push rbx ; they are popped again once the row is finished
+ push rdi
+ push rsi
+ push rcx ; col
+ ; Dereference each row cursor to get this row's sample pointer.
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rdi, JSAMPROW [rdi] ; outptr0
+ mov rbx, JSAMPROW [rbx] ; outptr1
+ mov rdx, JSAMPROW [rdx] ; outptr2
+ ; At least SIZEOF_XMMWORD columns: main loop. Fewer: fall through to .column_ld1.
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+ ; Partial load (fewer than SIZEOF_XMMWORD columns left): each set bit of the byte count triggers one tail load.
+.column_ld1:
+ push rax ; rax (row counter) and rdx (outptr2) are borrowed as scratch
+ push rdx
+ lea rcx,[rcx+rcx*2] ; rcx = columns*3 = remaining byte count (RGB_PIXELSIZE == 3)
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, BYTE [rsi+rcx] ; last odd byte
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, WORD [rsi+rcx]
+ shl rax, WORD_BIT ; earlier-loaded (later in memory) data moves up
+ or rax,rdx
+.column_ld4:
+ movd xmmA,eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA,xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA,xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF,xmmA ; tail becomes the second vector
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD ; so the loop-tail "sub rcx" leaves 0 and the row ends
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD ; mov does not touch the flags set by test
+ jz short .rgb_ycc_cnv
+ movdqa xmmB,xmmA ; tail becomes the third vector
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ ; Full load: three unaligned vectors = SIZEOF_XMMWORD pixels of 3 bytes each.
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ ; Notation below: each byte is <component index><column index in hex>, e.g. 2A = component 2 of column 0xA.
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH,xmmH ; xmmH = 0, interleaved below to zero-extend bytes to words
+
+ movdqa xmmC,xmmA
+ punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB,xmmE
+ punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF,xmmD
+ punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+ ; Partial load: rcx stays a column count here; each tested bit of it adds one tail load of that many pixels.
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA,xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE,xmmA ; tail becomes the second vector
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD ; mov keeps the flags; the loop-tail "sub rcx" then leaves 0
+ jz short .rgb_ycc_cnv
+ movdqa xmmF,xmmA ; tails become the third and fourth vectors
+ movdqa xmmH,xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ ; Full load: four unaligned vectors = SIZEOF_XMMWORD pixels of 4 bytes each.
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+ ; Notation below: each byte is <component index><column index in hex>.
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC,xmmF
+ punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB,xmmA
+ punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG,xmmD
+ punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE,xmmA
+ punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH,xmmB
+ punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF,xmmF ; xmmF = 0, interleaved below to zero-extend bytes to words
+
+ movdqa xmmC,xmmA
+ punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD,xmmB
+ punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG,xmmE
+ punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF,xmmH ; low 8 bytes of xmmH land in the high byte of each word of xmmF (still 0 here)
+ punpckhbw xmmH,xmmH ; high 8 bytes of xmmH duplicated into both bytes of each word
+ psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ; Spill the inputs that are needed again after their registers are reused.
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+ ; --- Odd columns: R/G part of Y (kept in wk(4)/wk(5)) and Cb ---
+ movdqa xmm6,xmm1
+ punpcklwd xmm1,xmm3 ; interleave RO,GO words so pmaddwd sums one R*c1+G*c2 per dword
+ punpckhwd xmm6,xmm3
+ movdqa xmm7,xmm1
+ movdqa xmm4,xmm6
+ pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1,xmm1
+ pxor xmm6,xmm6
+ punpcklwd xmm1,xmm5 ; xmm1=BOL (in the high word of each dword)
+ punpckhwd xmm6,xmm5 ; xmm6=BOH (in the high word of each dword)
+ psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7,xmm1
+ paddd xmm4,xmm6
+ paddd xmm7,xmm5
+ paddd xmm4,xmm5
+ psrld xmm7,SCALEBITS ; xmm7=CbOL
+ psrld xmm4,SCALEBITS ; xmm4=CbOH
+ packssdw xmm7,xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+ ; --- Even columns: R/G part of Y (kept in wk(6)/wk(7)) and Cb ---
+ movdqa xmm6,xmm0
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm6,xmm2
+ movdqa xmm5,xmm0
+ movdqa xmm4,xmm6
+ pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0,xmm0
+ pxor xmm6,xmm6
+ punpcklwd xmm0,xmm1 ; xmm0=BEL (in the high word of each dword)
+ punpckhwd xmm6,xmm1 ; xmm6=BEH (in the high word of each dword)
+ psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5,xmm0
+ paddd xmm4,xmm6
+ paddd xmm5,xmm1
+ paddd xmm4,xmm1
+ psrld xmm5,SCALEBITS ; xmm5=CbEL
+ psrld xmm4,SCALEBITS ; xmm4=CbEH
+ packssdw xmm5,xmm4 ; xmm5=CbE
+ ; Re-interleave: odd-column results go to the high byte of each word, even-column to the low byte.
+ psllw xmm7,BYTE_BIT
+ por xmm5,xmm7 ; xmm5=Cb
+ movdqa XMMWORD [rbx], xmm5 ; Save Cb (movdqa is an aligned store)
+ ; --- Odd columns: finish Y (B/G part + wk(4)/wk(5)) and compute Cr ---
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4,xmm0
+ punpcklwd xmm0,xmm3
+ punpckhwd xmm4,xmm3
+ movdqa xmm7,xmm0
+ movdqa xmm5,xmm4
+ pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0,xmm3
+ paddd xmm4,xmm3
+ psrld xmm0,SCALEBITS ; xmm0=YOL
+ psrld xmm4,SCALEBITS ; xmm4=YOH
+ packssdw xmm0,xmm4 ; xmm0=YO
+
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ punpcklwd xmm3,xmm1 ; xmm3=ROL (in the high word of each dword)
+ punpckhwd xmm4,xmm1 ; xmm4=ROH (in the high word of each dword)
+ psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7,xmm3
+ paddd xmm5,xmm4
+ paddd xmm7,xmm1
+ paddd xmm5,xmm1
+ psrld xmm7,SCALEBITS ; xmm7=CrOL
+ psrld xmm5,SCALEBITS ; xmm5=CrOH
+ packssdw xmm7,xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+ ; --- Even columns: finish Y (B/G part + wk(6)/wk(7)) and compute Cr ---
+ movdqa xmm4,xmm6
+ punpcklwd xmm6,xmm2
+ punpckhwd xmm4,xmm2
+ movdqa xmm1,xmm6
+ movdqa xmm5,xmm4
+ pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6,xmm2
+ paddd xmm4,xmm2
+ psrld xmm6,SCALEBITS ; xmm6=YEL
+ psrld xmm4,SCALEBITS ; xmm4=YEH
+ packssdw xmm6,xmm4 ; xmm6=YE
+
+ psllw xmm0,BYTE_BIT
+ por xmm6,xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y (movdqa is an aligned store)
+
+ pxor xmm2,xmm2
+ pxor xmm4,xmm4
+ punpcklwd xmm2,xmm3 ; xmm2=REL (in the high word of each dword)
+ punpckhwd xmm4,xmm3 ; xmm4=REH (in the high word of each dword)
+ psrld xmm2,1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4,1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1,xmm2
+ paddd xmm5,xmm4
+ paddd xmm1,xmm0
+ paddd xmm5,xmm0
+ psrld xmm1,SCALEBITS ; xmm1=CrEL
+ psrld xmm5,SCALEBITS ; xmm5=CrEH
+ packssdw xmm1,xmm5 ; xmm1=CrE
+
+ psllw xmm7,BYTE_BIT
+ por xmm1,xmm7 ; xmm1=Cr
+ movdqa XMMWORD [rdx], xmm1 ; Save Cr (movdqa is an aligned store)
+
+ sub rcx, byte SIZEOF_XMMWORD ; one vector of columns done
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ add rbx, byte SIZEOF_XMMWORD ; outptr1
+ add rdx, byte SIZEOF_XMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; another full vector of columns remains
+ test rcx,rcx
+ jnz near .column_ld1 ; 1..SIZEOF_XMMWORD-1 columns remain: partial load
+ ; Row finished: restore the values pushed at .rowloop.
+ pop rcx ; col
+ pop rsi
+ pop rdi
+ pop rbx
+ pop rdx
+ ; Advance every row cursor by one row pointer.
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ uncollect_args ; counterpart of collect_args (macro defined outside this file)
+ pop rbx
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value stored by the prologue (rsp right after "push rbp")
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm
new file mode 100644
index 0000000..8def718
--- /dev/null
+++ b/simd/jcclrss2.asm
@@ -0,0 +1,505 @@
+;
+; jcclrss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+; JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION img_width
+%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b)+20 ; JDIMENSION output_row
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+
+ global EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push ebp
+ mov eax,esp ; eax = esp after the push; the argument macros (img_width(b) etc.) index from it
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; stash the unaligned esp; .return pops it back
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; esp = bottom of the wk[] scratch area
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx,ecx
+ jz near .return ; zero-width image: nothing to convert
+
+ push ecx ; keep img_width while ecx holds output_row
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW] ; edi = &output_buf[0][output_row]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &output_buf[1][output_row]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &output_buf[2][output_row]
+
+ pop ecx ; ecx = img_width again
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; last use of eax as argument base; it becomes the row counter
+ test eax,eax
+ jle near .return ; zero or negative row count: nothing to do
+ alignx 16,7
+.rowloop:
+ pushpic eax ; save the row counter; eax is reloaded with the GOT address below
+ push edx ; save the row cursors and the column count for this row;
+ push ebx ; they are popped again once the row is finished
+ push edi
+ push esi
+ push ecx ; col
+ ; Dereference each row cursor to get this row's sample pointer.
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ ; At least SIZEOF_XMMWORD columns: main loop. Fewer: fall through to .column_ld1.
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ alignx 16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+ ; Partial load (fewer than SIZEOF_XMMWORD columns left): each set bit of the byte count triggers one tail load.
+.column_ld1:
+ push eax ; eax and edx (outptr2) are borrowed as scratch
+ push edx
+ lea ecx,[ecx+ecx*2] ; ecx = columns*3 = remaining byte count (RGB_PIXELSIZE == 3)
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, BYTE [esi+ecx] ; last odd byte
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, WORD [esi+ecx]
+ shl eax, WORD_BIT ; earlier-loaded (later in memory) data moves up
+ or eax,edx
+.column_ld4:
+ movd xmmA,eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA,xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA,xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF,xmmA ; tail becomes the second vector
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD ; so the loop-tail "sub ecx" leaves 0 and the row ends
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD ; mov does not touch the flags set by test
+ jz short .rgb_ycc_cnv
+ movdqa xmmB,xmmA ; tail becomes the third vector
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+ ; Full load: three unaligned vectors = SIZEOF_XMMWORD pixels of 3 bytes each.
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ ; Notation below: each byte is <component index><column index in hex>, e.g. 2A = component 2 of column 0xA.
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE,xmmA
+ pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH,xmmH ; xmmH = 0, interleaved below to zero-extend bytes to words
+
+ movdqa xmmC,xmmA
+ punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB,xmmE
+ punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF,xmmD
+ punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+ ; Partial load: ecx stays a column count here; each tested bit of it adds one tail load of that many pixels.
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA,xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE,xmmA ; tail becomes the second vector
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD ; mov keeps the flags; the loop-tail "sub ecx" then leaves 0
+ jz short .rgb_ycc_cnv
+ movdqa xmmF,xmmA ; tails become the third and fourth vectors
+ movdqa xmmH,xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16,7
+ ; Full load: four unaligned vectors = SIZEOF_XMMWORD pixels of 4 bytes each.
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+ ; Notation below: each byte is <component index><column index in hex>.
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC,xmmF
+ punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB,xmmA
+ punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG,xmmD
+ punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE,xmmA
+ punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH,xmmB
+ punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF,xmmF ; xmmF = 0, interleaved below to zero-extend bytes to words
+
+ movdqa xmmC,xmmA
+ punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD,xmmB
+ punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG,xmmE
+ punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF,xmmH ; low 8 bytes of xmmH land in the high byte of each word of xmmF (still 0 here)
+ punpckhbw xmmH,xmmH ; high 8 bytes of xmmH duplicated into both bytes of each word
+ psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ; Spill the inputs that are needed again after their registers are reused.
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+ ; --- Odd columns: R/G part of Y (kept in wk(4)/wk(5)) and Cb ---
+ movdqa xmm6,xmm1
+ punpcklwd xmm1,xmm3 ; interleave RO,GO words so pmaddwd sums one R*c1+G*c2 per dword
+ punpckhwd xmm6,xmm3
+ movdqa xmm7,xmm1
+ movdqa xmm4,xmm6
+ pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1,xmm1
+ pxor xmm6,xmm6
+ punpcklwd xmm1,xmm5 ; xmm1=BOL (in the high word of each dword)
+ punpckhwd xmm6,xmm5 ; xmm6=BOH (in the high word of each dword)
+ psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7,xmm1
+ paddd xmm4,xmm6
+ paddd xmm7,xmm5
+ paddd xmm4,xmm5
+ psrld xmm7,SCALEBITS ; xmm7=CbOL
+ psrld xmm4,SCALEBITS ; xmm4=CbOH
+ packssdw xmm7,xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+ ; --- Even columns: R/G part of Y (kept in wk(6)/wk(7)) and Cb ---
+ movdqa xmm6,xmm0
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm6,xmm2
+ movdqa xmm5,xmm0
+ movdqa xmm4,xmm6
+ pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0,xmm0
+ pxor xmm6,xmm6
+ punpcklwd xmm0,xmm1 ; xmm0=BEL (in the high word of each dword)
+ punpckhwd xmm6,xmm1 ; xmm6=BEH (in the high word of each dword)
+ psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5,xmm0
+ paddd xmm4,xmm6
+ paddd xmm5,xmm1
+ paddd xmm4,xmm1
+ psrld xmm5,SCALEBITS ; xmm5=CbEL
+ psrld xmm4,SCALEBITS ; xmm4=CbEH
+ packssdw xmm5,xmm4 ; xmm5=CbE
+ ; Re-interleave: odd-column results go to the high byte of each word, even-column to the low byte.
+ psllw xmm7,BYTE_BIT
+ por xmm5,xmm7 ; xmm5=Cb
+ movdqa XMMWORD [ebx], xmm5 ; Save Cb (movdqa is an aligned store)
+ ; --- Odd columns: finish Y (B/G part + wk(4)/wk(5)) and compute Cr ---
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4,xmm0
+ punpcklwd xmm0,xmm3
+ punpckhwd xmm4,xmm3
+ movdqa xmm7,xmm0
+ movdqa xmm5,xmm4
+ pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0,xmm3
+ paddd xmm4,xmm3
+ psrld xmm0,SCALEBITS ; xmm0=YOL
+ psrld xmm4,SCALEBITS ; xmm4=YOH
+ packssdw xmm0,xmm4 ; xmm0=YO
+
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ punpcklwd xmm3,xmm1 ; xmm3=ROL (in the high word of each dword)
+ punpckhwd xmm4,xmm1 ; xmm4=ROH (in the high word of each dword)
+ psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7,xmm3
+ paddd xmm5,xmm4
+ paddd xmm7,xmm1
+ paddd xmm5,xmm1
+ psrld xmm7,SCALEBITS ; xmm7=CrOL
+ psrld xmm5,SCALEBITS ; xmm5=CrOH
+ packssdw xmm7,xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+ ; --- Even columns: finish Y (B/G part + wk(6)/wk(7)) and compute Cr ---
+ movdqa xmm4,xmm6
+ punpcklwd xmm6,xmm2
+ punpckhwd xmm4,xmm2
+ movdqa xmm1,xmm6
+ movdqa xmm5,xmm4
+ pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6,xmm2
+ paddd xmm4,xmm2
+ psrld xmm6,SCALEBITS ; xmm6=YEL
+ psrld xmm4,SCALEBITS ; xmm4=YEH
+ packssdw xmm6,xmm4 ; xmm6=YE
+
+ psllw xmm0,BYTE_BIT
+ por xmm6,xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y (movdqa is an aligned store)
+
+ pxor xmm2,xmm2
+ pxor xmm4,xmm4
+ punpcklwd xmm2,xmm3 ; xmm2=REL (in the high word of each dword)
+ punpckhwd xmm4,xmm3 ; xmm4=REH (in the high word of each dword)
+ psrld xmm2,1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4,1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1,xmm2
+ paddd xmm5,xmm4
+ paddd xmm1,xmm0
+ paddd xmm5,xmm0
+ psrld xmm1,SCALEBITS ; xmm1=CrEL
+ psrld xmm5,SCALEBITS ; xmm5=CrEH
+ packssdw xmm1,xmm5 ; xmm1=CrE
+
+ psllw xmm7,BYTE_BIT
+ por xmm1,xmm7 ; xmm1=Cr
+ movdqa XMMWORD [edx], xmm1 ; Save Cr (movdqa is an aligned store)
+
+ sub ecx, byte SIZEOF_XMMWORD ; one vector of columns done
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ add ebx, byte SIZEOF_XMMWORD ; outptr1
+ add edx, byte SIZEOF_XMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop ; another full vector of columns remains
+ test ecx,ecx
+ jnz near .column_ld1 ; 1..SIZEOF_XMMWORD-1 columns remain: partial load
+ ; Row finished: restore the values pushed at .rowloop.
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax ; row counter back in eax (pairs with pushpic at .rowloop)
+ ; Advance every row cursor by one row pointer.
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- value stored by the prologue (esp right after "push ebp")
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm
new file mode 100644
index 0000000..5e7f3be
--- /dev/null
+++ b/simd/jccolmmx.asm
@@ -0,0 +1,120 @@
+;
+; jccolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+; FIX(x) = round(x * 2^SCALEBITS); every F_* constant below uses this fixed-point scale
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700) -- larger than 32767, i.e. does not fit a signed 16-bit word
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000): 0.587 expressed as 0.337 + 0.250, both within signed-word range
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337 ; 4 words: { 0.299,  0.337} repeated twice
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250 ; 4 words: { 0.114,  0.250} repeated twice
+PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331 ; 4 words: {-0.168, -0.331} repeated twice
+PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418 ; 4 words: {-0.081, -0.418} repeated twice
+PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) ; 2 dwords: (0.5 - 2^-SCALEBITS) + CENTERJSAMPLE, in fixed point
+PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) ; 2 dwords: 0.5 in fixed point
+
+ alignz 16
+
+; -- jcclrmmx.asm is included once per pixel layout; this first time with the RGB_* values already in effect --
+%include "jcclrmmx.asm"
+; extrgb: R=0 G=1 B=2, RGB_PIXELSIZE=3; the %define of jsimd_rgb_ycc_convert_mmx presumably renames the routine the include emits
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"
+; extrgbx: R=0 G=1 B=2, RGB_PIXELSIZE=4 (index 3 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+; extbgr: R=2 G=1 B=0, RGB_PIXELSIZE=3
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+; extbgrx: R=2 G=1 B=0, RGB_PIXELSIZE=4 (index 3 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+; extxbgr: R=3 G=2 B=1, RGB_PIXELSIZE=4 (index 0 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+; extxrgb: R=1 G=2 B=3, RGB_PIXELSIZE=4 (index 0 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"
diff --git a/simd/jccolss2-64.asm b/simd/jccolss2-64.asm
new file mode 100644
index 0000000..a419d1b
--- /dev/null
+++ b/simd/jccolss2-64.asm
@@ -0,0 +1,117 @@
+;
+; jccolss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+; FIX(x) = round(x * 2^SCALEBITS); every F_* constant below uses this fixed-point scale
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700) -- larger than 32767, i.e. does not fit a signed 16-bit word
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000): 0.587 expressed as 0.337 + 0.250, both within signed-word range
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; 8 words: { 0.299,  0.337} repeated four times
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; 8 words: { 0.114,  0.250} repeated four times
+PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 ; 8 words: {-0.168, -0.331} repeated four times
+PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 ; 8 words: {-0.081, -0.418} repeated four times
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) ; 4 dwords: (0.5 - 2^-SCALEBITS) + CENTERJSAMPLE, in fixed point
+PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) ; 4 dwords: 0.5 in fixed point
+
+ alignz 16
+
+; -- jcclrss2-64.asm is included once per pixel layout; this first time with the RGB_* values already in effect --
+%include "jcclrss2-64.asm"
+; extrgb: R=0 G=1 B=2, RGB_PIXELSIZE=3; the %define of jsimd_rgb_ycc_convert_sse2 presumably renames the routine the include emits
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+; extrgbx: R=0 G=1 B=2, RGB_PIXELSIZE=4 (index 3 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+; extbgr: R=2 G=1 B=0, RGB_PIXELSIZE=3
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+; extbgrx: R=2 G=1 B=0, RGB_PIXELSIZE=4 (index 3 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+; extxbgr: R=3 G=2 B=1, RGB_PIXELSIZE=4 (index 0 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+; extxrgb: R=1 G=2 B=3, RGB_PIXELSIZE=4 (index 0 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
diff --git a/simd/jccolss2.asm b/simd/jccolss2.asm
new file mode 100644
index 0000000..8d1f734
--- /dev/null
+++ b/simd/jccolss2.asm
@@ -0,0 +1,117 @@
+;
+; jccolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+; FIX(x) = round(x * 2^SCALEBITS); every F_* constant below uses this fixed-point scale
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700) -- larger than 32767, i.e. does not fit a signed 16-bit word
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000): 0.587 expressed as 0.337 + 0.250, both within signed-word range
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337 ; 8 words: { 0.299,  0.337} repeated four times
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250 ; 8 words: { 0.114,  0.250} repeated four times
+PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 ; 8 words: {-0.168, -0.331} repeated four times
+PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 ; 8 words: {-0.081, -0.418} repeated four times
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) ; 4 dwords: (0.5 - 2^-SCALEBITS) + CENTERJSAMPLE, in fixed point
+PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) ; 4 dwords: 0.5 in fixed point
+
+ alignz 16
+
+; -- jcclrss2.asm is included once per pixel layout; this first time with the RGB_* values already in effect --
+%include "jcclrss2.asm"
+; extrgb: R=0 G=1 B=2, RGB_PIXELSIZE=3; the %define of jsimd_rgb_ycc_convert_sse2 presumably renames the routine the include emits
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2.asm"
+; extrgbx: R=0 G=1 B=2, RGB_PIXELSIZE=4 (index 3 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2.asm"
+; extbgr: R=2 G=1 B=0, RGB_PIXELSIZE=3
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+; extbgrx: R=2 G=1 B=0, RGB_PIXELSIZE=4 (index 3 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2.asm"
+; extxbgr: R=3 G=2 B=1, RGB_PIXELSIZE=4 (index 0 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+; extxrgb: R=1 G=2 B=3, RGB_PIXELSIZE=4 (index 0 is not assigned to a colour)
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2.asm"
diff --git a/simd/jcolsamp.inc b/simd/jcolsamp.inc
new file mode 100644
index 0000000..79751b7
--- /dev/null
+++ b/simd/jcolsamp.inc
@@ -0,0 +1,105 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+
+; pseudo-registers to make ordering of RGB configurable
+; mmA/mmB (xmmA/xmmB): register pair of the component whose RGB_* index is 0 (red=0/1, green=2/3, blue=4/5, none=6/7)
+%if RGB_RED == 0
+%define mmA mm0
+%define mmB mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%elif RGB_GREEN == 0
+%define mmA mm2
+%define mmB mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%elif RGB_BLUE == 0
+%define mmA mm4
+%define mmB mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%else
+%define mmA mm6
+%define mmB mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%endif
+; mmC/mmD (xmmC/xmmD): register pair of the component whose RGB_* index is 1
+%if RGB_RED == 1
+%define mmC mm0
+%define mmD mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%elif RGB_GREEN == 1
+%define mmC mm2
+%define mmD mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%elif RGB_BLUE == 1
+%define mmC mm4
+%define mmD mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%else
+%define mmC mm6
+%define mmD mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%endif
+; mmE/mmF (xmmE/xmmF): register pair of the component whose RGB_* index is 2
+%if RGB_RED == 2
+%define mmE mm0
+%define mmF mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%elif RGB_GREEN == 2
+%define mmE mm2
+%define mmF mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%elif RGB_BLUE == 2
+%define mmE mm4
+%define mmF mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%else
+%define mmE mm6
+%define mmF mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%endif
+; mmG/mmH (xmmG/xmmH): register pair of the component whose RGB_* index is 3 (mm6/mm7 when no colour has index 3)
+%if RGB_RED == 3
+%define mmG mm0
+%define mmH mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%elif RGB_GREEN == 3
+%define mmG mm2
+%define mmH mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%elif RGB_BLUE == 3
+%define mmG mm4
+%define mmH mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%else
+%define mmG mm6
+%define mmH mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/simd/jcqnt3dn.asm b/simd/jcqnt3dn.asm
new file mode 100644
index 0000000..182c869
--- /dev/null
+++ b/simd/jcqnt3dn.asm
@@ -0,0 +1,233 @@
+;
+; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; (reads DCTSIZE sample rows, 8 samples each starting at start_col, and stores them as floats)
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT * workspace);
+;
+
+%define sample_data ebp+8 ; JSAMPARRAY sample_data
+%define start_col ebp+12 ; JDIMENSION start_col
+%define workspace ebp+16 ; FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7,mm7 ; mm7 = all 1's
+ psllw mm7,7 ; mm7 = {0xFF80 0xFF80 0xFF80 0xFF80}
+ packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; two sample rows are converted per iteration
+ alignx 16,7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 samples of the first row
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 samples of the second row
+
+ psubb mm0,mm7 ; mm0=(01234567)
+ psubb mm1,mm7 ; mm1=(89ABCDEF)
+ ; '*' below = don't-care byte: each sample is placed in the top byte of its word, then dword
+ punpcklbw mm2,mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3,mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4,mm2 ; mm4=(***0***1)
+ punpckhwd mm2,mm2 ; mm2=(***2***3)
+ punpcklwd mm5,mm0 ; mm5=(***4***5)
+ punpckhwd mm0,mm0 ; mm0=(***6***7)
+ ; arithmetic shift discards the don't-care bytes and sign-extends; pi2fd converts int32 -> float
+ psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01)
+ psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ pi2fd mm4,mm4
+ pi2fd mm2,mm2
+ psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ pi2fd mm5,mm5
+ pi2fd mm0,mm0
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ ; same widening for the second row (mm4 is free again after the store above)
+ punpcklwd mm6,mm3 ; mm6=(***8***9)
+ punpckhwd mm3,mm3 ; mm3=(***A***B)
+ punpcklwd mm4,mm1 ; mm4=(***C***D)
+ punpckhwd mm1,mm1 ; mm1=(***E***F)
+
+ psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ pi2fd mm6,mm6
+ pi2fd mm3,mm3
+ psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ pi2fd mm4,mm4
+ pi2fd mm1,mm1
+
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; next pair of row pointers
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; next two workspace rows
+ dec ecx
+ jnz near .convloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (each workspace float is multiplied by the matching divisors entry and rounded to a 16-bit integer)
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+; FAST_FLOAT * workspace);
+;
+
+%define coef_block ebp+8 ; JCOEFPTR coef_block
+%define divisors ebp+12 ; FAST_FLOAT * divisors
+%define workspace ebp+16 ; FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+ ; 0x4B400000 is 1.5*2^23 as a float; after adding it, the low word of each sum holds the product as an integer
+ mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
+ movd mm7,eax
+ punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F}
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16 ; 16 coefficients are processed per iteration
+ alignx 16,7
+.quantloop:
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+ ; '**' below = high word of each float, which is discarded by the word unpacks
+ pfadd mm0,mm7 ; mm0=(00 ** 01 **)
+ pfadd mm1,mm7 ; mm1=(02 ** 03 **)
+ pfadd mm2,mm7 ; mm2=(04 ** 05 **)
+ pfadd mm3,mm7 ; mm3=(06 ** 07 **)
+
+ movq mm4,mm0
+ punpcklwd mm0,mm1 ; mm0=(00 02 ** **)
+ punpckhwd mm4,mm1 ; mm4=(01 03 ** **)
+ movq mm5,mm2
+ punpcklwd mm2,mm3 ; mm2=(04 06 ** **)
+ punpckhwd mm5,mm3 ; mm5=(05 07 ** **)
+
+ punpcklwd mm0,mm4 ; mm0=(00 01 02 03)
+ punpcklwd mm2,mm5 ; mm2=(04 05 06 07)
+ ; second row of this iteration: same multiply / bias / word-gather sequence
+ movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ pfadd mm6,mm7 ; mm6=(10 ** 11 **)
+ pfadd mm1,mm7 ; mm1=(12 ** 13 **)
+ pfadd mm3,mm7 ; mm3=(14 ** 15 **)
+ pfadd mm4,mm7 ; mm4=(16 ** 17 **)
+
+ movq mm5,mm6
+ punpcklwd mm6,mm1 ; mm6=(10 12 ** **)
+ punpckhwd mm5,mm1 ; mm5=(11 13 ** **)
+ movq mm1,mm3
+ punpcklwd mm3,mm4 ; mm3=(14 16 ** **)
+ punpckhwd mm1,mm4 ; mm1=(15 17 ** **)
+
+ punpcklwd mm6,mm5 ; mm6=(10 11 12 13)
+ punpcklwd mm3,mm1 ; mm3=(14 15 16 17)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz near .quantloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcqntmmx.asm b/simd/jcqntmmx.asm
new file mode 100644
index 0000000..08b08b7
--- /dev/null
+++ b/simd/jcqntmmx.asm
@@ -0,0 +1,274 @@
+;
+; jcqntmmx.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; (reads DCTSIZE sample rows, 8 samples each starting at start_col, and stores them as 16-bit words)
+; GLOBAL(void)
+; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM * workspace);
+;
+
+%define sample_data ebp+8 ; JSAMPARRAY sample_data
+%define start_col ebp+12 ; JDIMENSION start_col
+%define workspace ebp+16 ; DCTELEM * workspace
+
+ align 16
+ global EXTN(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor mm6,mm6 ; mm6=(all 0's)
+ pcmpeqw mm7,mm7 ; mm7=(all 1's)
+ psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4 ; four sample rows are converted per iteration
+ alignx 16,7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
+ movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
+ ; zero-extend bytes to words by interleaving with mm6 (all 0's)
+ movq mm4,mm0
+ punpcklbw mm0,mm6 ; mm0=(0123)
+ punpckhbw mm4,mm6 ; mm4=(4567)
+ movq mm5,mm1
+ punpcklbw mm1,mm6 ; mm1=(89AB)
+ punpckhbw mm5,mm6 ; mm5=(CDEF)
+ ; adding 0xFF80 (-128 as a signed word) turns 0..255 into -128..127
+ paddw mm0,mm7
+ paddw mm4,mm7
+ paddw mm1,mm7
+ paddw mm5,mm7
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+ ; rows three and four: same widening and offset, reusing mm0/mm4 as scratch
+ movq mm0,mm2
+ punpcklbw mm2,mm6 ; mm2=(GHIJ)
+ punpckhbw mm0,mm6 ; mm0=(KLMN)
+ movq mm4,mm3
+ punpcklbw mm3,mm6 ; mm3=(OPQR)
+ punpckhbw mm4,mm6 ; mm4=(STUV)
+
+ paddw mm2,mm7
+ paddw mm0,mm7
+ paddw mm3,mm7
+ paddw mm4,mm7
+
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+ add esi, byte 4*SIZEOF_JSAMPROW ; next four row pointers
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; next four workspace rows
+ dec ecx
+ jnz short .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
+; DCTELEM * workspace);
+;
+; divisors is addressed as four tables at MMBLOCK row offsets DCTSIZE*0..3; SHIFT is defined but not used by this routine
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block ebp+8 ; JCOEFPTR coef_block
+%define divisors ebp+12 ; DCTELEM * divisors
+%define workspace ebp+16 ; DCTELEM * workspace
+
+ align 16
+ global EXTN(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov ah, 2 ; outer loop count (2 x inner loop)
+ alignx 16,7
+.quantloop1:
+ mov al, DCTSIZE2/8/2 ; inner loop count; 8 coefficients per inner iteration
+ alignx 16,7
+.quantloop2:
+ movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+ movq mm0,mm2
+ movq mm1,mm3
+
+ psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise
+ psraw mm3,(WORD_BIT-1)
+
+ pxor mm0,mm2 ; val = abs(val): (val ^ sign) - sign, with mm2/mm3 kept as sign masks
+ pxor mm1,mm3
+ psubw mm0,mm2
+ psubw mm1,mm3
+
+ ;
+ ; MMX is an annoyingly crappy instruction set. It has two
+ ; misfeatures that are causing problems here:
+ ;
+ ; - All multiplications are signed.
+ ;
+ ; - The second operand for the shifts is not treated as packed.
+ ;
+ ;
+ ; We work around the first problem by implementing this algorithm:
+ ;
+ ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+ ; {
+ ; enum { SHORT_BIT = 16 };
+ ; signed short sx = (signed short) x;
+ ; signed short sy = (signed short) y;
+ ; signed long sz;
+ ;
+ ; sz = (long) sx * (long) sy; /* signed multiply */
+ ;
+ ; if (sx < 0) sz += (long) sy << SHORT_BIT;
+ ; if (sy < 0) sz += (long) sx << SHORT_BIT;
+ ;
+ ; return (unsigned long) sz;
+ ; }
+ ;
+ ; (note that a negative sx adds _sy_ and vice versa)
+ ;
+ ; For the second problem, we replace the shift by a multiplication.
+ ; Unfortunately that means we have to deal with the signed issue again.
+ ;
+
+ paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw mm1, MMWORD [CORRECTION(0,1,edx)]
+
+ movq mm4,mm0 ; store current value for later
+ movq mm5,mm1
+ pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
+ paddw mm0,mm4 ; reciprocal is always negative (MSB=1),
+ paddw mm1,mm5 ; so we always need to add the initial value
+ ; (input value is never negative as we
+ ; inverted it at the start of this routine)
+
+ ; here it gets a bit tricky as both scale
+ ; and mm0/mm1 can be negative
+ movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
+ movq mm7, MMWORD [SCALE(0,1,edx)]
+ movq mm4,mm0
+ movq mm5,mm1
+ pmulhw mm0,mm6 ; signed high-word product; corrected to unsigned below
+ pmulhw mm1,mm7
+
+ psraw mm6,(WORD_BIT-1) ; determine if scale is negative
+ psraw mm7,(WORD_BIT-1)
+
+ pand mm6,mm4 ; and add input if it is
+ pand mm7,mm5
+ paddw mm0,mm6
+ paddw mm1,mm7
+
+ psraw mm4,(WORD_BIT-1) ; then check if negative input
+ psraw mm5,(WORD_BIT-1)
+
+ pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
+ pand mm5, MMWORD [SCALE(0,1,edx)]
+ paddw mm0,mm4
+ paddw mm1,mm5
+
+ pxor mm0,mm2 ; restore the sign removed at the top: negate where the mask in mm2/mm3 is -1
+ pxor mm1,mm3
+ psubw mm0,mm2
+ psubw mm1,mm3
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 ; NOTE(review): coef_block store uses SIZEOF_DCTELEM while edi is stepped by SIZEOF_JCOEF -- confirm the two sizes match
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+ add esi, byte 8*SIZEOF_DCTELEM
+ add edx, byte 8*SIZEOF_DCTELEM
+ add edi, byte 8*SIZEOF_JCOEF
+ dec al
+ jnz near .quantloop2
+ dec ah
+ jnz near .quantloop1 ; to avoid branch misprediction
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcqnts2f-64.asm b/simd/jcqnts2f-64.asm
new file mode 100644
index 0000000..e09387c
--- /dev/null
+++ b/simd/jcqnts2f-64.asm
@@ -0,0 +1,156 @@
+;
+; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+; (reads DCTSIZE sample rows, 8 samples each starting at start_col, and stores them as floats)
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT * workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+ push rbp
+ mov rbp,rsp
+ push rbx
+ collect_args
+
+ pcmpeqw xmm7,xmm7 ; xmm7 = all 1's
+ psllw xmm7,7 ; xmm7 = {0xFF80 x8}
+ packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov rsi, r10
+ mov eax, r11d ; start_col is a 32-bit value: zero-extend it, the upper half of r11 is not guaranteed to be clean
+ mov rdi, r12
+ mov rcx, DCTSIZE/2 ; two sample rows are converted per iteration
+.convloop:
+ mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; 8 samples of the first row
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; 8 samples of the second row
+
+ psubb xmm0,xmm7 ; xmm0=(01234567)
+ psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
+ ; '*' below = don't-care byte: each sample is placed in the top byte of its word, then dword
+ punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
+ ; arithmetic shift discards the don't-care bytes and sign-extends; cvtdq2ps converts int32 -> float
+ psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
+ psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
+ ; movaps requires 16-byte-aligned addresses, so workspace must be 16-byte aligned
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; next pair of row pointers
+ add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; next two workspace rows
+ dec rcx
+ jnz short .convloop
+
+ uncollect_args
+ pop rbx
+ pop rbp
+ ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (each workspace float is multiplied by the matching divisors entry and converted to a 16-bit integer)
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+; FAST_FLOAT * workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT * divisors
+; r12 = FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+ push rbp
+ mov rbp,rsp
+ collect_args
+
+ mov rsi, r12 ; rsi = workspace
+ mov rdx, r11 ; rdx = divisors
+ mov rdi, r10 ; rdi = coef_block
+ mov rax, DCTSIZE2/16 ; 16 coefficients are processed per iteration
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+ ; float -> int32, rounded according to the current MXCSR rounding mode
+ cvtps2dq xmm0,xmm0
+ cvtps2dq xmm1,xmm1
+ cvtps2dq xmm2,xmm2
+ cvtps2dq xmm3,xmm3
+ ; int32 -> int16 with signed saturation; each result register holds one row of 8 coefficients
+ packssdw xmm0,xmm1
+ packssdw xmm2,xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+ add rsi, byte 16*SIZEOF_FAST_FLOAT
+ add rdx, byte 16*SIZEOF_FAST_FLOAT
+ add rdi, byte 16*SIZEOF_JCOEF
+ dec rax
+ jnz short .quantloop
+
+ uncollect_args
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcqnts2f.asm b/simd/jcqnts2f.asm
new file mode 100644
index 0000000..d80ae5d
--- /dev/null
+++ b/simd/jcqnts2f.asm
@@ -0,0 +1,171 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT * workspace);
+;
+
+%define sample_data ebp+8 ; JSAMPARRAY sample_data
+%define start_col ebp+12 ; JDIMENSION start_col
+%define workspace ebp+16 ; FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2): ; load samples, subtract the centre value, store as floats in workspace
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw xmm7,xmm7 ; all bits set
+ psllw xmm7,7 ; every word = 0xFF80
+ packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; loop count: two sample rows per pass
+ alignx 16,7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 samples of the first row, from start_col
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 samples of the second row
+
+ psubb xmm0,xmm7 ; xmm0=(01234567), now signed bytes (sample - 0x80)
+ psubb xmm1,xmm7 ; xmm1=(89ABCDEF), now signed bytes (sample - 0x80)
+
+ punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
+
+ psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) sign-extended; the '*' filler bytes are shifted out
+ psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2,xmm2 ; xmm2=(0123) int32 -> float
+ cvtdq2ps xmm0,xmm0 ; xmm0=(4567) int32 -> float
+ psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) int32 -> float
+ cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) int32 -> float
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 ; first row, samples 0-3
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 ; first row, samples 4-7
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 ; second row, samples 0-3
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 ; second row, samples 4-7
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; next pair of row pointers
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; next two workspace rows
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+; FAST_FLOAT * workspace);
+;
+
+%define coef_block ebp+8 ; JCOEFPTR coef_block
+%define divisors ebp+12 ; FAST_FLOAT * divisors
+%define workspace ebp+16 ; FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2): ; coef_block[] = int16(workspace[] * divisors[]) over DCTSIZE2 elements
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace] ; esi = source floats
+ mov edx, POINTER [divisors] ; edx = per-coefficient multipliers
+ mov edi, JCOEFPTR [coef_block] ; edi = destination
+ mov eax, DCTSIZE2/16 ; loop count: 16 coefficients per pass
+ alignx 16,7
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; load 4 floats
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; workspace * divisors
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0,xmm0 ; float -> int32, rounded per the current MXCSR rounding mode
+ cvtps2dq xmm1,xmm1
+ cvtps2dq xmm2,xmm2
+ cvtps2dq xmm3,xmm3
+
+ packssdw xmm0,xmm1 ; 8 x int32 -> 8 x int16 with signed saturation
+ packssdw xmm2,xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 ; store 8 coefficients
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 ; store the next 8
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT ; advance all three pointers by 16 elements
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcqnts2i-64.asm b/simd/jcqnts2i-64.asm
new file mode 100644
index 0000000..4568dfc
--- /dev/null
+++ b/simd/jcqnts2i-64.asm
@@ -0,0 +1,185 @@
+;
+; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM * workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = DCTELEM * workspace
+
+ align 16
+ global EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2): ; load samples, subtract the centre value, store as 16-bit words in workspace
+ push rbp
+ mov rbp,rsp
+ push rbx
+ collect_args
+
+ pxor xmm6,xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7,xmm7
+ psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov rsi, r10 ; rsi = sample_data (JSAMPROW *)
+ mov eax, r11d ; rax = start_col; 32-bit move zero-extends (upper half of r11 is undefined for a 32-bit arg)
+ mov rdi, r12 ; rdi = workspace (DCTELEM *)
+ mov rcx, DCTSIZE/4 ; loop count: four sample rows per pass
+.convloop:
+ mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0,xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0,xmm7 ; add 0xFF80 (-128 as a signed word) to centre the samples
+ paddw xmm1,xmm7
+ punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2,xmm7
+ paddw xmm3,xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 ; one 8-word row per store
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 4*SIZEOF_JSAMPROW ; next four row pointers
+ add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; next four workspace rows
+ dec rcx
+ jnz short .convloop
+
+ uncollect_args
+ pop rbx
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+; DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM * divisors
+; r12 = DCTELEM * workspace
+
+ align 16
+ global EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2): ; integer quantization: |x| is corrected, multiplied by reciprocal and scale, then re-signed
+ push rbp
+ mov rbp,rsp
+ collect_args
+
+ mov rsi, r12 ; rsi = workspace (source coefficients)
+ mov rdx, r11 ; rdx = divisors (RECIPROCAL/CORRECTION/SCALE tables, see the defines above)
+ mov rdi, r10 ; rdi = coef_block (destination)
+ mov rax, DCTSIZE2/32 ; loop count: 32 coefficients per pass
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] ; load four rows of 8 words
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm0,xmm4 ; xmm0-3 = working copies of the values
+ movdqa xmm1,xmm5
+ movdqa xmm2,xmm6
+ movdqa xmm3,xmm7
+ psraw xmm4,(WORD_BIT-1) ; xmm4-7 = per-word sign masks (all ones where the value is negative)
+ psraw xmm5,(WORD_BIT-1)
+ psraw xmm6,(WORD_BIT-1)
+ psraw xmm7,(WORD_BIT-1)
+ pxor xmm0,xmm4 ; (x ^ mask) - mask negates exactly the negative words
+ pxor xmm1,xmm5
+ pxor xmm2,xmm6
+ pxor xmm3,xmm7
+ psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal (keeps the high 16 bits of the unsigned product)
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+ pxor xmm0,xmm4 ; restore the original sign using the saved masks
+ pxor xmm1,xmm5
+ pxor xmm2,xmm6
+ pxor xmm3,xmm7
+ psubw xmm0,xmm4
+ psubw xmm1,xmm5
+ psubw xmm2,xmm6
+ psubw xmm3,xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 ; NOTE(review): indexed with SIZEOF_DCTELEM but advanced with SIZEOF_JCOEF below; presumably equal - confirm
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 32*SIZEOF_DCTELEM ; advance all three pointers by 32 elements
+ add rdx, byte 32*SIZEOF_DCTELEM
+ add rdi, byte 32*SIZEOF_JCOEF
+ dec rax
+ jnz near .quantloop
+
+ uncollect_args
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcqnts2i.asm b/simd/jcqnts2i.asm
new file mode 100644
index 0000000..0864d6e
--- /dev/null
+++ b/simd/jcqnts2i.asm
@@ -0,0 +1,200 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM * workspace);
+;
+
+%define sample_data ebp+8 ; JSAMPARRAY sample_data
+%define start_col ebp+12 ; JDIMENSION start_col
+%define workspace ebp+16 ; DCTELEM * workspace
+
+ align 16
+ global EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2): ; load samples, subtract the centre value, store as 16-bit words in workspace
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor xmm6,xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7,xmm7
+ psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4 ; loop count: four sample rows per pass
+ alignx 16,7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0,xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0,xmm7 ; add 0xFF80 (-128 as a signed word) to centre the samples
+ paddw xmm1,xmm7
+ punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2,xmm7
+ paddw xmm3,xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 ; one 8-word row per store
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 4*SIZEOF_JSAMPROW ; next four row pointers
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; next four workspace rows
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+; DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block ebp+8 ; JCOEFPTR coef_block
+%define divisors ebp+12 ; DCTELEM * divisors
+%define workspace ebp+16 ; DCTELEM * workspace
+
+ align 16
+ global EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2): ; integer quantization: |x| is corrected, multiplied by reciprocal and scale, then re-signed
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace] ; esi = source coefficients
+ mov edx, POINTER [divisors] ; edx = RECIPROCAL/CORRECTION/SCALE tables (see the defines above)
+ mov edi, JCOEFPTR [coef_block] ; edi = destination
+ mov eax, DCTSIZE2/32 ; loop count: 32 coefficients per pass
+ alignx 16,7
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] ; load four rows of 8 words
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm0,xmm4 ; xmm0-3 = working copies of the values
+ movdqa xmm1,xmm5
+ movdqa xmm2,xmm6
+ movdqa xmm3,xmm7
+ psraw xmm4,(WORD_BIT-1) ; xmm4-7 = per-word sign masks (all ones where the value is negative)
+ psraw xmm5,(WORD_BIT-1)
+ psraw xmm6,(WORD_BIT-1)
+ psraw xmm7,(WORD_BIT-1)
+ pxor xmm0,xmm4 ; (x ^ mask) - mask negates exactly the negative words
+ pxor xmm1,xmm5
+ pxor xmm2,xmm6
+ pxor xmm3,xmm7
+ psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal (keeps the high 16 bits of the unsigned product)
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
+
+ pxor xmm0,xmm4 ; restore the original sign using the saved masks
+ pxor xmm1,xmm5
+ pxor xmm2,xmm6
+ pxor xmm3,xmm7
+ psubw xmm0,xmm4
+ psubw xmm1,xmm5
+ psubw xmm2,xmm6
+ psubw xmm3,xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 ; NOTE(review): indexed with SIZEOF_DCTELEM but advanced with SIZEOF_JCOEF below; presumably equal - confirm
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 32*SIZEOF_DCTELEM ; advance all three pointers by 32 elements
+ add edx, byte 32*SIZEOF_DCTELEM
+ add edi, byte 32*SIZEOF_JCOEF
+ dec eax
+ jnz near .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcqntsse.asm b/simd/jcqntsse.asm
new file mode 100644
index 0000000..3065eca
--- /dev/null
+++ b/simd/jcqntsse.asm
@@ -0,0 +1,211 @@
+;
+; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT * workspace);
+;
+
+%define sample_data ebp+8 ; JSAMPARRAY sample_data
+%define start_col ebp+12 ; JDIMENSION start_col
+%define workspace ebp+16 ; FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse): ; load samples, subtract the centre value, store as floats in workspace (MMX + SSE)
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7,mm7 ; all bits set
+ psllw mm7,7 ; every word = 0xFF80
+ packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; loop count: two sample rows per pass
+ alignx 16,7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 samples of the first row, from start_col
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 samples of the second row
+
+ psubb mm0,mm7 ; mm0=(01234567), now signed bytes (sample - 0x80)
+ psubb mm1,mm7 ; mm1=(89ABCDEF), now signed bytes (sample - 0x80)
+
+ punpcklbw mm2,mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3,mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4,mm2 ; mm4=(***0***1)
+ punpckhwd mm2,mm2 ; mm2=(***2***3)
+ punpcklwd mm5,mm0 ; mm5=(***4***5)
+ punpckhwd mm0,mm0 ; mm0=(***6***7)
+
+ psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) sign-extended; the '*' filler bytes are shifted out
+ psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ cvtpi2ps xmm0,mm4 ; xmm0=(01**) two int32 -> floats in the low half
+ cvtpi2ps xmm1,mm2 ; xmm1=(23**)
+ psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ cvtpi2ps xmm2,mm5 ; xmm2=(45**)
+ cvtpi2ps xmm3,mm0 ; xmm3=(67**)
+
+ punpcklwd mm6,mm3 ; mm6=(***8***9)
+ punpckhwd mm3,mm3 ; mm3=(***A***B)
+ punpcklwd mm4,mm1 ; mm4=(***C***D) (mm4 is reused; its earlier value was already converted)
+ punpckhwd mm1,mm1 ; mm1=(***E***F)
+
+ psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ cvtpi2ps xmm4,mm6 ; xmm4=(89**)
+ cvtpi2ps xmm5,mm3 ; xmm5=(AB**)
+ psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ cvtpi2ps xmm6,mm4 ; xmm6=(CD**)
+ cvtpi2ps xmm7,mm1 ; xmm7=(EF**)
+
+ movlhps xmm0,xmm1 ; xmm0=(0123)
+ movlhps xmm2,xmm3 ; xmm2=(4567)
+ movlhps xmm4,xmm5 ; xmm4=(89AB)
+ movlhps xmm6,xmm7 ; xmm6=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; first row, samples 0-3
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 ; first row, samples 4-7
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 ; second row, samples 0-3
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 ; second row, samples 4-7
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; next pair of row pointers
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; next two workspace rows
+ dec ecx
+ jnz near .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+; FAST_FLOAT * workspace);
+;
+
+%define coef_block ebp+8 ; JCOEFPTR coef_block
+%define divisors ebp+12 ; FAST_FLOAT * divisors
+%define workspace ebp+16 ; FAST_FLOAT * workspace
+
+ align 16
+ global EXTN(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse): ; coef_block[] = int16(workspace[] * divisors[]) over DCTSIZE2 elements (SSE + MMX)
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace] ; esi = source floats
+ mov edx, POINTER [divisors] ; edx = per-coefficient multipliers
+ mov edi, JCOEFPTR [coef_block] ; edi = destination
+ mov eax, DCTSIZE2/16 ; loop count: 16 coefficients per pass
+ alignx 16,7
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; load 4 floats
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; workspace * divisors
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ movhlps xmm4,xmm0 ; low half of xmm4 = high two floats of xmm0
+ movhlps xmm5,xmm1
+
+ cvtps2pi mm0,xmm0 ; low two floats -> two int32 (current MXCSR rounding mode)
+ cvtps2pi mm1,xmm1
+ cvtps2pi mm4,xmm4 ; the two floats moved down by movhlps
+ cvtps2pi mm5,xmm5
+
+ movhlps xmm6,xmm2
+ movhlps xmm7,xmm3
+
+ cvtps2pi mm2,xmm2
+ cvtps2pi mm3,xmm3
+ cvtps2pi mm6,xmm6
+ cvtps2pi mm7,xmm7
+
+ packssdw mm0,mm4 ; 4 x int32 -> 4 x int16 with signed saturation
+ packssdw mm1,mm5
+ packssdw mm2,mm6
+ packssdw mm3,mm7
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 ; store 4 coefficients per movq
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT ; advance all three pointers by 16 elements
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcsammmx.asm b/simd/jcsammmx.asm
new file mode 100644
index 0000000..c7126a0
--- /dev/null
+++ b/simd/jcsammmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsammmx.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+; JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION image_width
+%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
+%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
+%define input_data(b) (b)+24 ; JSAMPARRAY input_data
+%define output_data(b) (b)+28 ; JSAMPARRAY output_data
+
+ align 16
+ global EXTN(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx): ; pad each input row on the right, then average horizontal sample pairs
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; nothing to do when output_cols == 0
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx ; save output_cols for the downsample stage
+ shl ecx,1 ; output_cols * 2
+ sub ecx,edx ; ecx = output_cols*2 - image_width = columns to pad
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax,eax
+ jle short .expand_end
+
+ cld ; rep stosb below must advance edi upward
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16,7
+.expandloop:
+ push eax ; save the row counter (al is overwritten below)
+ push ecx ; save the pad count (rep stosb consumes ecx)
+
+ mov edi, JSAMPROW [esi]
+ add edi,edx ; edi = first byte past the real samples of this row
+ mov al, JSAMPLE [edi-1] ; al = last real sample
+
+ rep stosb ; replicate it into the padding columns
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; next row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax,eax
+ jle short .return
+
+ mov edx, 0x00010000 ; bias pattern
+ movd mm7,edx
+ pcmpeqw mm6,mm6
+ punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
+ psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16,7
+.rowloop:
+ push ecx ; save output_cols, output_data and input_data across the column loop
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16,7
+.columnloop:
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; first 8 input samples
+ movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] ; next 8 input samples
+ movq mm2,mm0
+ movq mm3,mm1
+
+ pand mm0,mm6 ; low byte of each word (even-numbered samples)
+ psrlw mm2,BYTE_BIT ; high byte of each word (odd-numbered samples)
+ pand mm1,mm6
+ psrlw mm3,BYTE_BIT
+
+ paddw mm0,mm2 ; sum of each horizontal pair
+ paddw mm1,mm3
+ paddw mm0,mm7 ; add the alternating 0/1 bias
+ paddw mm1,mm7
+ psrlw mm0,1 ; divide by 2
+ psrlw mm1,1
+
+ packuswb mm0,mm1 ; 8 words -> 8 output bytes
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz short .columnloop
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+; JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION image_width
+%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
+%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
+%define input_data(b) (b)+24 ; JSAMPARRAY input_data
+%define output_data(b) (b)+28 ; JSAMPARRAY output_data
+
+ align 16
+ global EXTN(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx): ; pad each input row on the right, then average 2x2 sample squares
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; nothing to do when output_cols == 0
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx ; save output_cols for the downsample stage
+ shl ecx,1 ; output_cols * 2
+ sub ecx,edx ; ecx = output_cols*2 - image_width = columns to pad
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
+ test eax,eax
+ jle short .expand_end
+
+ cld ; rep stosb below must advance edi upward
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16,7
+.expandloop:
+ push eax ; save the row counter (al is overwritten below)
+ push ecx ; save the pad count (rep stosb consumes ecx)
+
+ mov edi, JSAMPROW [esi]
+ add edi,edx ; edi = first byte past the real samples of this row
+ mov al, JSAMPLE [edi-1] ; al = last real sample
+
+ rep stosb ; replicate it into the padding columns
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; next row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax,eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ movd mm7,edx
+ pcmpeqw mm6,mm6
+ punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
+ psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16,7
+.rowloop:
+ push ecx ; save output_cols, output_data and input_data across the column loop
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16,7
+.columnloop:
+
+ movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] ; upper row, first 8 samples
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] ; lower row, first 8 samples
+ movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] ; upper row, next 8 samples
+ movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] ; lower row, next 8 samples
+
+ movq mm4,mm0
+ movq mm5,mm1
+ pand mm0,mm6 ; low byte of each word (even-numbered samples)
+ psrlw mm4,BYTE_BIT ; high byte of each word (odd-numbered samples)
+ pand mm1,mm6
+ psrlw mm5,BYTE_BIT
+ paddw mm0,mm4 ; horizontal pair sums, upper row
+ paddw mm1,mm5 ; horizontal pair sums, lower row
+
+ movq mm4,mm2
+ movq mm5,mm3
+ pand mm2,mm6
+ psrlw mm4,BYTE_BIT
+ pand mm3,mm6
+ psrlw mm5,BYTE_BIT
+ paddw mm2,mm4
+ paddw mm3,mm5
+
+ paddw mm0,mm1 ; sum of each 2x2 square
+ paddw mm2,mm3
+ paddw mm0,mm7 ; add the alternating 1/2 bias
+ paddw mm2,mm7
+ psrlw mm0,2 ; divide by 4
+ psrlw mm2,2
+
+ packuswb mm0,mm2 ; 8 words -> 8 output bytes
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add edx, byte 2*SIZEOF_MMWORD ; inptr0
+ add esi, byte 2*SIZEOF_MMWORD ; inptr1
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz near .columnloop
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcsamss2-64.asm b/simd/jcsamss2-64.asm
new file mode 100644
index 0000000..29c3f4f
--- /dev/null
+++ b/simd/jcsamss2-64.asm
@@ -0,0 +1,328 @@
+;
+; jcsamss2-64.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+; JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 16
+ global EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push rbp
+ mov rbp,rsp
+ collect_args ; arguments are referenced as r10-r15 below (see register table above)
+
+ mov ecx, r13d ; width_blocks is 32-bit: load via r13d so rcx is zero-extended
+ shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, r10d ; rdx = image_width (32-bit, zero-extended; used in pointer math below)
+
+ ; -- expand_right_edge
+
+ push rcx ; save output_cols
+ shl rcx,1 ; output_cols * 2
+ sub rcx,rdx ; rcx = number of padding bytes to append to each input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, r11d ; rax = max_v_samp_factor (32-bit int): number of rows to pad
+ test eax,eax ; test only the 32 bits that carry the argument
+ jle short .expand_end
+
+ cld ; make stosb advance rdi upwards
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax ; al is needed for stosb
+ push rcx ; rep consumes rcx
+
+ mov rdi, JSAMPROW [rsi] ; rdi = start of this input row
+ add rdi,rdx ; rdi = first byte past the real pixels
+ mov al, JSAMPLE [rdi-1] ; al = last real pixel of the row
+
+ rep stosb ; replicate it rcx times
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next input row pointer
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, r12d ; rowctr (v_samp_factor is 32-bit; zero-extend into rax)
+ test eax,eax
+ jle near .return
+
+ mov rdx, 0x00010000 ; bias pattern
+ movd xmm7,edx
+ pcmpeqw xmm6,xmm6
+ pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx ; output_cols is consumed by the column loop
+ push rdi ; row-pointer cursors are reused as sample pointers
+ push rsi
+
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rdi, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least one full xmmword of output remains
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; remainder: load a single input xmmword
+ pxor xmm1,xmm1 ; second input xmmword is treated as zero
+ mov rcx, SIZEOF_XMMWORD ; so the sub below leaves rcx == 0 and the loop ends
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2,xmm0
+ movdqa xmm3,xmm1
+
+ pand xmm0,xmm6 ; low byte of each word (even-indexed samples)
+ psrlw xmm2,BYTE_BIT ; high byte of each word (odd-indexed samples)
+ pand xmm1,xmm6
+ psrlw xmm3,BYTE_BIT
+
+ paddw xmm0,xmm2 ; sum of each horizontal pair
+ paddw xmm1,xmm3
+ paddw xmm0,xmm7 ; add the alternating 0/1 bias
+ paddw xmm1,xmm7
+ psrlw xmm0,1 ; divide by 2
+ psrlw xmm1,1
+
+ packuswb xmm0,xmm1 ; pack the 16 word results back into bytes
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test rcx,rcx
+ jnz short .columnloop_r8 ; fewer than SIZEOF_XMMWORD columns left
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+; JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 16
+ global EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push rbp
+ mov rbp,rsp
+ collect_args ; arguments are referenced as r10-r15 below (see register table above)
+
+ mov ecx, r13d ; width_blocks is 32-bit: load via r13d so rcx is zero-extended
+ shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, r10d ; rdx = image_width (32-bit, zero-extended; used in pointer math below)
+
+ ; -- expand_right_edge
+
+ push rcx ; save output_cols
+ shl rcx,1 ; output_cols * 2
+ sub rcx,rdx ; rcx = number of padding bytes to append to each input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, r11d ; rax = max_v_samp_factor (32-bit int): number of rows to pad
+ test eax,eax ; test only the 32 bits that carry the argument
+ jle short .expand_end
+
+ cld ; make stosb advance rdi upwards
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax ; al is needed for stosb
+ push rcx ; rep consumes rcx
+
+ mov rdi, JSAMPROW [rsi] ; rdi = start of this input row
+ add rdi,rdx ; rdi = first byte past the real pixels
+ mov al, JSAMPLE [rdi-1] ; al = last real pixel of the row
+
+ rep stosb ; replicate it rcx times
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next input row pointer
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, r12d ; rowctr (v_samp_factor is 32-bit; zero-extend into rax)
+ test eax,eax
+ jle near .return
+
+ mov rdx, 0x00020001 ; bias pattern
+ movd xmm7,edx
+ pcmpeqw xmm6,xmm6
+ pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx ; output_cols is consumed by the column loop
+ push rdi ; row-pointer cursors are reused as sample pointers
+ push rsi
+
+ mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdi, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least one full xmmword of output remains
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] ; remainder: one xmmword from each input row
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm2,xmm2 ; second xmmword of each row is treated as zero
+ pxor xmm3,xmm3
+ mov rcx, SIZEOF_XMMWORD ; so the sub below leaves rcx == 0 and the loop ends
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm1
+ pand xmm0,xmm6 ; low byte of each word (even-indexed samples)
+ psrlw xmm4,BYTE_BIT ; high byte of each word (odd-indexed samples)
+ pand xmm1,xmm6
+ psrlw xmm5,BYTE_BIT
+ paddw xmm0,xmm4 ; horizontal pair sums, row 0, first xmmword
+ paddw xmm1,xmm5 ; horizontal pair sums, row 1, first xmmword
+
+ movdqa xmm4,xmm2
+ movdqa xmm5,xmm3
+ pand xmm2,xmm6
+ psrlw xmm4,BYTE_BIT
+ pand xmm3,xmm6
+ psrlw xmm5,BYTE_BIT
+ paddw xmm2,xmm4 ; horizontal pair sums, row 0, second xmmword
+ paddw xmm3,xmm5 ; horizontal pair sums, row 1, second xmmword
+
+ paddw xmm0,xmm1 ; add the two rows: sum of each 2x2 group
+ paddw xmm2,xmm3
+ paddw xmm0,xmm7 ; add the alternating 1/2 bias
+ paddw xmm2,xmm7
+ psrlw xmm0,2 ; divide by 4
+ psrlw xmm2,2
+
+ packuswb xmm0,xmm2 ; pack the 16 word results back into bytes
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx,rcx
+ jnz near .columnloop_r8 ; fewer than SIZEOF_XMMWORD columns left
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jcsamss2.asm b/simd/jcsamss2.asm
new file mode 100644
index 0000000..818e911
--- /dev/null
+++ b/simd/jcsamss2.asm
@@ -0,0 +1,351 @@
+;
+; jcsamss2.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+; JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION image_width
+%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
+%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
+%define input_data(b) (b)+24 ; JSAMPARRAY input_data
+%define output_data(b) (b)+28 ; JSAMPARRAY output_data
+
+ align 16
+ global EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push ebp
+ mov ebp,esp ; arguments are addressed relative to ebp via the %defines above
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
+ shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
+
+ ; -- expand_right_edge
+
+ push ecx ; save output_cols
+ shl ecx,1 ; output_cols * 2
+ sub ecx,edx ; ecx = number of padding bytes to append to each input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of input rows to pad
+ test eax,eax
+ jle short .expand_end
+
+ cld ; make stosb advance edi upwards
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16,7
+.expandloop:
+ push eax ; al is needed for stosb
+ push ecx ; rep consumes ecx
+
+ mov edi, JSAMPROW [esi] ; edi = start of this input row
+ add edi,edx ; edi = first byte past the real pixels
+ mov al, JSAMPLE [edi-1] ; al = last real pixel of the row
+
+ rep stosb ; replicate it ecx times
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; next input row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax,eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ movd xmm7,edx
+ pcmpeqw xmm6,xmm6
+ pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16,7
+.rowloop:
+ push ecx ; output_cols is consumed by the column loop
+ push edi ; row-pointer cursors are reused as sample pointers
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least one full xmmword of output remains
+ alignx 16,7
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; remainder: load a single input xmmword
+ pxor xmm1,xmm1 ; second input xmmword is treated as zero
+ mov ecx, SIZEOF_XMMWORD ; so the sub below leaves ecx == 0 and the loop ends
+ jmp short .downsample
+ alignx 16,7
+
+.columnloop:
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2,xmm0
+ movdqa xmm3,xmm1
+
+ pand xmm0,xmm6 ; low byte of each word (even-indexed samples)
+ psrlw xmm2,BYTE_BIT ; high byte of each word (odd-indexed samples)
+ pand xmm1,xmm6
+ psrlw xmm3,BYTE_BIT
+
+ paddw xmm0,xmm2 ; sum of each horizontal pair
+ paddw xmm1,xmm3
+ paddw xmm0,xmm7 ; add the alternating 0/1 bias
+ paddw xmm1,xmm7
+ psrlw xmm0,1 ; divide by 2
+ psrlw xmm1,1
+
+ packuswb xmm0,xmm1 ; pack the 16 word results back into bytes
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test ecx,ecx
+ jnz short .columnloop_r8 ; fewer than SIZEOF_XMMWORD columns left
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+; JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b)+8 ; JDIMENSION image_width
+%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
+%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
+%define input_data(b) (b)+24 ; JSAMPARRAY input_data
+%define output_data(b) (b)+28 ; JSAMPARRAY output_data
+
+ align 16
+ global EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push ebp
+ mov ebp,esp ; arguments are addressed relative to ebp via the %defines above
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
+ shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return ; zero output columns: nothing to do
+
+ mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
+
+ ; -- expand_right_edge
+
+ push ecx ; save output_cols
+ shl ecx,1 ; output_cols * 2
+ sub ecx,edx ; ecx = number of padding bytes to append to each input row
+ jle short .expand_end ; rows are already wide enough
+
+ mov eax, INT [max_v_samp(ebp)] ; eax = number of input rows to pad
+ test eax,eax
+ jle short .expand_end
+
+ cld ; make stosb advance edi upwards
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16,7
+.expandloop:
+ push eax ; al is needed for stosb
+ push ecx ; rep consumes ecx
+
+ mov edi, JSAMPROW [esi] ; edi = start of this input row
+ add edi,edx ; edi = first byte past the real pixels
+ mov al, JSAMPLE [edi-1] ; al = last real pixel of the row
+
+ rep stosb ; replicate it ecx times
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; next input row pointer
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax,eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ movd xmm7,edx
+ pcmpeqw xmm6,xmm6
+ pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16,7
+.rowloop:
+ push ecx ; output_cols is consumed by the column loop
+ push edi ; row-pointer cursors are reused as sample pointers
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop ; at least one full xmmword of output remains
+ alignx 16,7
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] ; remainder: one xmmword from each input row
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm2,xmm2 ; second xmmword of each row is treated as zero
+ pxor xmm3,xmm3
+ mov ecx, SIZEOF_XMMWORD ; so the sub below leaves ecx == 0 and the loop ends
+ jmp short .downsample
+ alignx 16,7
+
+.columnloop:
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm1
+ pand xmm0,xmm6 ; low byte of each word (even-indexed samples)
+ psrlw xmm4,BYTE_BIT ; high byte of each word (odd-indexed samples)
+ pand xmm1,xmm6
+ psrlw xmm5,BYTE_BIT
+ paddw xmm0,xmm4 ; horizontal pair sums, row 0, first xmmword
+ paddw xmm1,xmm5 ; horizontal pair sums, row 1, first xmmword
+
+ movdqa xmm4,xmm2
+ movdqa xmm5,xmm3
+ pand xmm2,xmm6
+ psrlw xmm4,BYTE_BIT
+ pand xmm3,xmm6
+ psrlw xmm5,BYTE_BIT
+ paddw xmm2,xmm4 ; horizontal pair sums, row 0, second xmmword
+ paddw xmm3,xmm5 ; horizontal pair sums, row 1, second xmmword
+
+ paddw xmm0,xmm1 ; add the two rows: sum of each 2x2 group
+ paddw xmm2,xmm3
+ paddw xmm0,xmm7 ; add the alternating 1/2 bias
+ paddw xmm2,xmm7
+ psrlw xmm0,2 ; divide by 4
+ psrlw xmm2,2
+
+ packuswb xmm0,xmm2 ; pack the 16 word results back into bytes
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add edx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx,ecx
+ jnz near .columnloop_r8 ; fewer than SIZEOF_XMMWORD columns left
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm
new file mode 100644
index 0000000..79772e0
--- /dev/null
+++ b/simd/jdclrmmx.asm
@@ -0,0 +1,407 @@
+;
+; jdclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
+; JSAMPIMAGE input_buf, JDIMENSION input_row,
+; JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b) (b)+8 ; JDIMENSION out_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b)+16 ; JDIMENSION input_row
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; reserve the slot that will hold the original frame address
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; stored at [aligned ebp+0] (original_ebp), reloaded by "pop esp" at .return
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve the WK_NUM mmword work slots below ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx,ecx
+ jz near .return ; zero-width output: nothing to convert
+
+ push ecx ; keep num_cols while ecx temporarily holds input_row
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; component 0 rows (read as Y in the loop)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; component 1 rows (read as Cb in the loop)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; component 2 rows (read as Cr in the loop)
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; advance each row array to entry input_row
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx ; ecx = num_cols again
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; eax becomes the row counter; arguments are not read after this
+ test eax,eax
+ jle near .return
+ alignx 16,7
+.rowloop:
+ push eax ; save row counter and row-array cursors; restored at .nextrow
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16,7
+.columnloop:
+
+ movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
+ movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
+
+ pcmpeqw mm4,mm4
+ pcmpeqw mm7,mm7
+ psrlw mm4,BYTE_BIT
+ psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+ movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand mm4,mm5 ; mm4=Cb(0246)=CbE
+ psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO
+ pand mm0,mm1 ; mm0=Cr(0246)=CrE
+ psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO
+
+ paddw mm4,mm7 ; adding 0xFF80 subtracts 128 from each chroma word
+ paddw mm5,mm7
+ paddw mm0,mm7
+ paddw mm1,mm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movq mm2,mm4 ; mm2=CbE
+ movq mm3,mm5 ; mm3=CbO
+ paddw mm4,mm4 ; mm4=2*CbE
+ paddw mm5,mm5 ; mm5=2*CbO
+ movq mm6,mm0 ; mm6=CrE
+ movq mm7,mm1 ; mm7=CrO
+ paddw mm0,mm0 ; mm0=2*CrE
+ paddw mm1,mm1 ; mm1=2*CrO
+
+ pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
+ pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
+ pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
+ pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
+
+ paddw mm4,[GOTOFF(eax,PW_ONE)]
+ paddw mm5,[GOTOFF(eax,PW_ONE)]
+ psraw mm4,1 ; mm4=(CbE * -FIX(0.22800))
+ psraw mm5,1 ; mm5=(CbO * -FIX(0.22800))
+ paddw mm0,[GOTOFF(eax,PW_ONE)]
+ paddw mm1,[GOTOFF(eax,PW_ONE)]
+ psraw mm0,1 ; mm0=(CrE * FIX(0.40200))
+ psraw mm1,1 ; mm1=(CrO * FIX(0.40200))
+
+ paddw mm4,mm2
+ paddw mm5,mm3
+ paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
+ movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
+
+ movq mm4,mm2
+ movq mm5,mm3
+ punpcklwd mm2,mm6 ; interleave Cb and Cr words so pmaddwd forms Cb*c0+Cr*c1 per dword
+ punpckhwd mm4,mm6
+ pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm3,mm7
+ punpckhwd mm5,mm7
+ pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
+ paddd mm4,[GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2,SCALEBITS
+ psrad mm4,SCALEBITS
+ paddd mm3,[GOTOFF(eax,PD_ONEHALF)]
+ paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
+ psrad mm3,SCALEBITS
+ psrad mm5,SCALEBITS
+
+ packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movq mm5, MMWORD [esi] ; mm5=Y(01234567)
+
+ pcmpeqw mm4,mm4
+ psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
+ pand mm4,mm5 ; mm4=Y(0246)=YE
+ psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO
+
+ paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+ paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+ packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+ punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG,mmA
+ movq mmH,mmA
+ punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC,mmD
+ movq mmB,mmD
+ punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF,mmE
+ punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16 ; fewer than SIZEOF_MMWORD columns left: partial store
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .nextrow ; row finished exactly on a group boundary
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st16:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD ; from here on ecx counts remaining output bytes
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA,mmC ; the not-yet-stored data is always kept in mmA
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA,mmE
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax,mmA ; eax is overwritten here; .nextrow pops it before the next use
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov DWORD [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA,DWORD_BIT
+ movd eax,mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [edi+0*SIZEOF_WORD], ax
+ shr eax,WORD_BIT
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .nextrow
+ mov BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC,mmA
+ punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG,mmB
+ punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD,mmA
+ punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH,mmC
+ punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16 ; fewer than SIZEOF_MMWORD columns left: partial store
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .nextrow ; row finished exactly on a group boundary
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st16:
+ cmp ecx, byte SIZEOF_MMWORD/2 ; ecx still counts columns here
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA,mmC ; shift the not-yet-stored data down into mmA/mmD
+ movq mmD,mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA,mmD
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .nextrow
+ movd DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16,7
+
+.nextrow:
+ pop ecx ; restore the state pushed at .rowloop (reverse order)
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; step each input row array to its next row pointer
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
new file mode 100644
index 0000000..ea9d2ac
--- /dev/null
+++ b/simd/jdclrss2-64.asm
@@ -0,0 +1,487 @@
+;
+; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+; JSAMPIMAGE input_buf, JDIMENSION input_row,
+; JSAMPARRAY output_buf, int num_rows)
+;
+
+; r10 = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14 = int num_rows
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 16
+ global EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2): ; entry: build aligned frame, fetch args, set up row cursors
+ push rbp
+ mov rax,rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; save the unaligned rsp; .return reloads it with "pop rsp"
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve wk[WK_NUM] below rbp
+ push rbx
+ collect_args
+ ; JDIMENSION/int args are 32 bits wide: do not rely on the upper halves of r10/r12/r14.
+ mov ecx, r10d ; num_cols (32-bit write zero-extends into rcx)
+ test rcx,rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d ; input_row (zero-extended so it is a safe index below)
+ mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; rsi = &input_buf[0][input_row]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] ; rbx = &input_buf[1][input_row]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] ; rdx = &input_buf[2][input_row]
+
+ pop rcx
+
+ mov rdi, r13
+ movsxd rax, r14d ; num_rows (signed int: sign-extend so the jle below sees negatives)
+ test rax,rax
+ jle near .return
+.rowloop: ; per-row setup; everything pushed here is popped at .nextrow
+ push rax ; remaining row count
+ push rdi ; output_buf row-pointer cursor
+ push rdx ; input_buf[2] row-pointer cursor
+ push rbx ; input_buf[1] row-pointer cursor
+ push rsi ; input_buf[0] row-pointer cursor
+ push rcx ; col
+
+ mov rsi, JSAMPROW [rsi] ; inptr0
+ mov rbx, JSAMPROW [rbx] ; inptr1
+ mov rdx, JSAMPROW [rdx] ; inptr2
+ mov rdi, JSAMPROW [rdi] ; outptr
+.columnloop: ; converts one XMMWORD of Y/Cb/Cr samples per pass
+
+ movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4,xmm4 ; xmm4=(all 1's)
+ pcmpeqw xmm7,xmm7 ; xmm7=(all 1's)
+ psrlw xmm4,BYTE_BIT
+ psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4,xmm7 ; CbE -= 128 (0xFF80 is -128 as a signed word)
+ paddw xmm5,xmm7 ; CbO -= 128
+ paddw xmm0,xmm7 ; CrE -= 128
+ paddw xmm1,xmm7 ; CrO -= 128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm2,xmm4 ; xmm2=CbE
+ movdqa xmm3,xmm5 ; xmm3=CbO
+ paddw xmm4,xmm4 ; xmm4=2*CbE
+ paddw xmm5,xmm5 ; xmm5=2*CbO
+ movdqa xmm6,xmm0 ; xmm6=CrE
+ movdqa xmm7,xmm1 ; xmm7=CrO
+ paddw xmm0,xmm0 ; xmm0=2*CrE
+ paddw xmm1,xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4,[rel PW_ONE] ; add PW_ONE before the >>1 below (rounding, assuming PW_ONE holds words of 1)
+ paddw xmm5,[rel PW_ONE]
+ psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0,[rel PW_ONE]
+ paddw xmm1,[rel PW_ONE]
+ psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4,xmm2 ; xmm4 += CbE
+ paddw xmm5,xmm3 ; xmm5 += CbO
+ paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4,xmm2 ; xmm4=CbE (copy for the high half)
+ movdqa xmm5,xmm3 ; xmm5=CbO (copy for the high half)
+ punpcklwd xmm2,xmm6 ; xmm2=(CbE,CrE) word pairs, low half
+ punpckhwd xmm4,xmm6 ; xmm4=(CbE,CrE) word pairs, high half
+ pmaddwd xmm2,[rel PW_MF0344_F0285] ; each dword = Cb*coef0 + Cr*coef1
+ pmaddwd xmm4,[rel PW_MF0344_F0285]
+ punpcklwd xmm3,xmm7 ; xmm3=(CbO,CrO) word pairs, low half
+ punpckhwd xmm5,xmm7 ; xmm5=(CbO,CrO) word pairs, high half
+ pmaddwd xmm3,[rel PW_MF0344_F0285]
+ pmaddwd xmm5,[rel PW_MF0344_F0285]
+
+ paddd xmm2,[rel PD_ONEHALF] ; add PD_ONEHALF, then >>SCALEBITS: descale the dword sums
+ paddd xmm4,[rel PD_ONEHALF]
+ psrad xmm2,SCALEBITS
+ psrad xmm4,SCALEBITS
+ paddd xmm3,[rel PD_ONEHALF]
+ paddd xmm5,[rel PD_ONEHALF]
+ psrad xmm3,SCALEBITS
+ psrad xmm5,SCALEBITS
+
+ packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4,xmm4 ; xmm4=(all 1's)
+ psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+ ; Interleave the three component vectors into 3 XMMWORDs of packed 3-byte pixels.
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG,xmmA
+ movdqa xmmH,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC,xmmD
+ movdqa xmmB,xmmD
+ punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF,xmmE
+ punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB,xmmE
+ punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB,xmmF
+ punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD ; fewer than SIZEOF_XMMWORD columns left?
+ jb short .column_st32 ; yes: take the partial-store path
+
+ test rdi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub rcx, byte SIZEOF_XMMWORD ; columns remaining in this row
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32: ; partial store: fewer than SIZEOF_XMMWORD columns remain
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmF ; xmmA = next unwritten XMMWORD
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD ; xmmA = next unwritten XMMWORD
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ mov rax,rcx ; rax = bytes still to store (0..15)
+ xor rcx, byte 0x0F ; rcx = 15 - bytes
+ shl rcx, 2 ; rcx = (15 - bytes) * 4 = shift count in bits
+ movd xmmB,ecx
+ psrlq xmmH,4 ; each qword of xmmH keeps its low 60 bits set
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ psrlq xmmH,xmmB ; each qword of xmmH: low 4*bytes bits set
+ psrlq xmmE,xmmB ; each qword of xmmE: low 4*bytes+4 bits set
+ punpcklbw xmmE,xmmH ; xmmE = store mask: MSB set only in the first rax bytes
+ ; ----------------
+ mov rcx,rdi
+ and rcx, byte SIZEOF_XMMWORD-1 ; rcx = offset of outptr within its XMMWORD
+ jz short .adj0 ; already aligned: store as is
+ add rax,rcx ; rax = offset + bytes to store
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; data extends past this XMMWORD: store as is
+ and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl rcx, 3 ; emulate pslldq xmmA,rcx & pslldq xmmE,rcx (rcx = shift in bits)
+ movdqa xmmG,xmmA ; unshifted copies, used when the shift is < 64 bits
+ movdqa xmmC,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2 ; low qword -> high qword
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmD,ecx ; xmmD = shift in bits
+ sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1 ; shift < 64 bits
+ movd xmmF,ecx ; xmmF = shift - 64
+ psllq xmmA,xmmF
+ psllq xmmE,xmmF
+ jmp short .adj0
+.adj1: neg ecx ; ecx = 64 - shift (32-bit write also clears the upper half of rcx)
+ movd xmmF,ecx
+ psrlq xmmA,xmmF ; bits carried from the low qword into the high qword
+ psrlq xmmE,xmmF
+ psllq xmmG,xmmD ; both qwords shifted left in place
+ psllq xmmC,xmmD
+ por xmmA,xmmG ; xmmA = data shifted left by the outptr offset
+ por xmmE,xmmC ; xmmE = mask shifted left by the outptr offset
+.adj0: ; ----------------
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG,xmmB
+ punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH,xmmC
+ punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD ; fewer than SIZEOF_XMMWORD columns left?
+ jb short .column_st32 ; yes: take the partial-store path
+
+ test rdi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub rcx, byte SIZEOF_XMMWORD ; columns remaining in this row
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32: ; partial store: fewer than SIZEOF_XMMWORD columns remain (rcx counts pixels)
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmC ; xmmA/xmmD = the two XMMWORDs not yet written
+ movdqa xmmD,xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD ; xmmA = next unwritten XMMWORD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ cmp rcx, byte SIZEOF_XMMWORD/16
+ jb near .nextrow ; no pixels left
+ mov rax,rcx ; rax = pixels still to store (1..3)
+ xor rcx, byte 0x03
+ inc rcx ; rcx = 4 - pixels
+ shl rcx, 4 ; rcx = (4 - pixels) * 16 = shift count in bits
+ movd xmmF,ecx
+ psrlq xmmE,xmmF ; each qword of xmmE: low 2*pixels bytes set
+ punpcklbw xmmE,xmmE ; xmmE = store mask: first 4*pixels bytes set
+ ; ----------------
+ mov rcx,rdi
+ and rcx, byte SIZEOF_XMMWORD-1 ; rcx = offset of outptr within its XMMWORD
+ jz short .adj0 ; already aligned: store as is
+ lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; data extends past this XMMWORD: store as is
+ and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmB,xmmA ; unshifted copies, used when the shift is < 64 bits
+ movdqa xmmG,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2 ; low qword -> high qword
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmC,ecx ; xmmC = shift in bits
+ sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1 ; shift < 64 bits
+ movd xmmH,ecx ; xmmH = shift - 64
+ psllq xmmA,xmmH
+ psllq xmmE,xmmH
+ jmp short .adj0
+.adj1: neg rcx ; rcx = 64 - shift
+ movd xmmH,ecx
+ psrlq xmmA,xmmH ; bits carried from the low qword into the high qword
+ psrlq xmmE,xmmH
+ psllq xmmB,xmmC ; both qwords shifted left in place
+ psllq xmmG,xmmC
+ por xmmA,xmmB ; xmmA = data shifted left by the outptr offset
+ por xmmE,xmmG ; xmmE = mask shifted left by the outptr offset
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow: ; undo the pushes made at .rowloop (reverse order)
+ pop rcx ; col
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; next input_buf[0] row pointer
+ add rbx, byte SIZEOF_JSAMPROW ; next input_buf[1] row pointer
+ add rdx, byte SIZEOF_JSAMPROW ; next input_buf[2] row pointer
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ uncollect_args
+ pop rbx
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- stack pointer saved at [rbp] by the prologue
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
new file mode 100644
index 0000000..865fa82
--- /dev/null
+++ b/simd/jdclrss2.asm
@@ -0,0 +1,505 @@
+;
+; jdclrss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+; JSAMPIMAGE input_buf, JDIMENSION input_row,
+; JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b) (b)+8 ; JDIMENSION out_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b)+16 ; JDIMENSION input_row
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b)+24 ; int num_rows
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2): ; entry: build aligned frame, fetch args, set up row cursors
+ push ebp
+ mov eax,esp ; eax = esp after the push; the xxx(eax) macros address the arguments from it
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; save the unaligned esp; .return reloads it with "pop esp"
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below ebp
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx,ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][input_row]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &input_buf[1][input_row]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &input_buf[2][input_row]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)] ; eax stops being the argument base from here on
+ test eax,eax
+ jle near .return
+ alignx 16,7
+.rowloop: ; per-row setup; everything pushed here is popped at .nextrow
+ push eax ; remaining row count
+ push edi ; output_buf row-pointer cursor
+ push edx ; input_buf[2] row-pointer cursor
+ push ebx ; input_buf[1] row-pointer cursor
+ push esi ; input_buf[0] row-pointer cursor
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16,7
+.columnloop: ; converts one XMMWORD of Y/Cb/Cr samples per pass
+
+ movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4,xmm4 ; xmm4=(all 1's)
+ pcmpeqw xmm7,xmm7 ; xmm7=(all 1's)
+ psrlw xmm4,BYTE_BIT
+ psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4,xmm7 ; CbE -= 128 (0xFF80 is -128 as a signed word)
+ paddw xmm5,xmm7 ; CbO -= 128
+ paddw xmm0,xmm7 ; CrE -= 128
+ paddw xmm1,xmm7 ; CrO -= 128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm2,xmm4 ; xmm2=CbE
+ movdqa xmm3,xmm5 ; xmm3=CbO
+ paddw xmm4,xmm4 ; xmm4=2*CbE
+ paddw xmm5,xmm5 ; xmm5=2*CbO
+ movdqa xmm6,xmm0 ; xmm6=CrE
+ movdqa xmm7,xmm1 ; xmm7=CrO
+ paddw xmm0,xmm0 ; xmm0=2*CrE
+ paddw xmm1,xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4,[GOTOFF(eax,PW_ONE)] ; add PW_ONE before the >>1 below (rounding, assuming PW_ONE holds words of 1)
+ paddw xmm5,[GOTOFF(eax,PW_ONE)]
+ psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0,[GOTOFF(eax,PW_ONE)]
+ paddw xmm1,[GOTOFF(eax,PW_ONE)]
+ psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4,xmm2 ; xmm4 += CbE
+ paddw xmm5,xmm3 ; xmm5 += CbO
+ paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4,xmm2 ; xmm4=CbE (copy for the high half)
+ movdqa xmm5,xmm3 ; xmm5=CbO (copy for the high half)
+ punpcklwd xmm2,xmm6 ; xmm2=(CbE,CrE) word pairs, low half
+ punpckhwd xmm4,xmm6 ; xmm4=(CbE,CrE) word pairs, high half
+ pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] ; each dword = Cb*coef0 + Cr*coef1
+ pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm3,xmm7 ; xmm3=(CbO,CrO) word pairs, low half
+ punpckhwd xmm5,xmm7 ; xmm5=(CbO,CrO) word pairs, high half
+ pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] ; add PD_ONEHALF, then >>SCALEBITS: descale the dword sums
+ paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2,SCALEBITS
+ psrad xmm4,SCALEBITS
+ paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm3,SCALEBITS
+ psrad xmm5,SCALEBITS
+
+ packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4,xmm4 ; xmm4=(all 1's)
+ psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+ ; Interleave the three component vectors into 3 XMMWORDs of packed 3-byte pixels.
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG,xmmA
+ movdqa xmmH,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC,xmmD
+ movdqa xmmB,xmmD
+ punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF,xmmE
+ punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB,xmmE
+ punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB,xmmF
+ punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD ; fewer than SIZEOF_XMMWORD columns left?
+ jb short .column_st32 ; yes: take the partial-store path
+
+ test edi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
+ add edi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub ecx, byte SIZEOF_XMMWORD ; columns remaining in this row
+ jz near .nextrow
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st32: ; partial store: fewer than SIZEOF_XMMWORD columns remain
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmF ; xmmA = next unwritten XMMWORD
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD ; xmmA = next unwritten XMMWORD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ mov eax,ecx ; eax = bytes still to store (0..15); overwrites the GOT address, which .rowloop reloads
+ xor ecx, byte 0x0F ; ecx = 15 - bytes
+ shl ecx, 2 ; ecx = (15 - bytes) * 4 = shift count in bits
+ movd xmmB,ecx
+ psrlq xmmH,4 ; each qword of xmmH keeps its low 60 bits set
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ psrlq xmmH,xmmB ; each qword of xmmH: low 4*bytes bits set
+ psrlq xmmE,xmmB ; each qword of xmmE: low 4*bytes+4 bits set
+ punpcklbw xmmE,xmmH ; xmmE = store mask: MSB set only in the first eax bytes
+ ; ----------------
+ mov ecx,edi
+ and ecx, byte SIZEOF_XMMWORD-1 ; ecx = offset of outptr within its XMMWORD
+ jz short .adj0 ; already aligned: store as is
+ add eax,ecx ; eax = offset + bytes to store
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; data extends past this XMMWORD: store as is
+ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmG,xmmA ; unshifted copies, used when the shift is < 64 bits
+ movdqa xmmC,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2 ; low qword -> high qword
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmD,ecx ; xmmD = shift in bits
+ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1 ; shift < 64 bits
+ movd xmmF,ecx ; xmmF = shift - 64
+ psllq xmmA,xmmF
+ psllq xmmE,xmmF
+ jmp short .adj0
+.adj1: neg ecx ; ecx = 64 - shift
+ movd xmmF,ecx
+ psrlq xmmA,xmmF ; bits carried from the low qword into the high qword
+ psrlq xmmE,xmmF
+ psllq xmmG,xmmD ; both qwords shifted left in place
+ psllq xmmC,xmmD
+ por xmmA,xmmG ; xmmA = data shifted left by the outptr offset
+ por xmmE,xmmC ; xmmE = mask shifted left by the outptr offset
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG,xmmB
+ punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH,xmmC
+ punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD ; fewer than SIZEOF_XMMWORD columns left?
+ jb short .column_st32 ; yes: take the partial-store path
+
+ test edi, SIZEOF_XMMWORD-1 ; is outptr XMMWORD-aligned?
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
+ add edi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub ecx, byte SIZEOF_XMMWORD ; columns remaining in this row
+ jz near .nextrow
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st32: ; partial store: fewer than SIZEOF_XMMWORD columns remain (ecx counts pixels)
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmC ; xmmA/xmmD = the two XMMWORDs not yet written
+ movdqa xmmD,xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD ; xmmA = next unwritten XMMWORD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ cmp ecx, byte SIZEOF_XMMWORD/16
+ jb short .nextrow ; no pixels left
+ mov eax,ecx ; eax = pixels still to store (1..3); overwrites the GOT address, which .rowloop reloads
+ xor ecx, byte 0x03
+ inc ecx ; ecx = 4 - pixels
+ shl ecx, 4 ; ecx = (4 - pixels) * 16 = shift count in bits
+ movd xmmF,ecx
+ psrlq xmmE,xmmF ; each qword of xmmE: low 2*pixels bytes set
+ punpcklbw xmmE,xmmE ; xmmE = store mask: first 4*pixels bytes set
+ ; ----------------
+ mov ecx,edi
+ and ecx, byte SIZEOF_XMMWORD-1 ; ecx = offset of outptr within its XMMWORD
+ jz short .adj0 ; already aligned: store as is
+ lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; data extends past this XMMWORD: store as is
+ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmB,xmmA ; unshifted copies, used when the shift is < 64 bits
+ movdqa xmmG,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2 ; low qword -> high qword
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmC,ecx ; xmmC = shift in bits
+ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1 ; shift < 64 bits
+ movd xmmH,ecx ; xmmH = shift - 64
+ psllq xmmA,xmmH
+ psllq xmmE,xmmH
+ jmp short .adj0
+.adj1: neg ecx ; ecx = 64 - shift
+ movd xmmH,ecx
+ psrlq xmmA,xmmH ; bits carried from the low qword into the high qword
+ psrlq xmmE,xmmH
+ psllq xmmB,xmmC ; both qwords shifted left in place
+ psllq xmmG,xmmC
+ por xmmA,xmmB ; xmmA = data shifted left by the outptr offset
+ por xmmE,xmmG ; xmmE = mask shifted left by the outptr offset
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16,7
+
+.nextrow: ; undo the pushes made at .rowloop (reverse order)
+ pop ecx ; col
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; next input_buf[0] row pointer
+ add ebx, byte SIZEOF_JSAMPROW ; next input_buf[1] row pointer
+ add edx, byte SIZEOF_JSAMPROW ; next input_buf[2] row pointer
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- stack pointer saved at [ebp] by the prologue
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm
new file mode 100644
index 0000000..58775e8
--- /dev/null
+++ b/simd/jdcolmmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; FIX(x) below is x * 2^SCALEBITS, rounded to an integer
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits a signed word
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734, fits a signed word
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942, fits a signed word
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402 times 4 dw F_0_402 ; 4 words (one 64-bit operand) of FIX(0.40200)
+PW_MF0228 times 4 dw -F_0_228 ; 4 words of -FIX(0.22800)
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 ; 2 interleaved (-0.34414, +0.28586) word pairs
+PW_ONE times 4 dw 1 ; 4 words of 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) ; 2 dwords of 0.5 in SCALEBITS fixed point
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+%include "jdclrmmx.asm"
+
+%undef RGB_RED ; discard the layout used by the %include above (not set in this file)
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgb: R,G,B = 0,1,2 (presumably byte offsets within a pixel)
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; presumably bytes per output pixel
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx ; alias: next %include presumably emits the extrgb entry point
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgbx: R,G,B = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx ; alias for the extrgbx variant
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgr: B,G,R = 0,1,2 with pixel size 3
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx ; alias for the extbgr variant
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgrx: B,G,R = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx ; alias for the extbgrx variant
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3 ; extxbgr: B,G,R = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx ; alias for the extxbgr variant
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1 ; extxrgb: R,G,B = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx ; alias for the extxrgb variant
+%include "jdclrmmx.asm"
diff --git a/simd/jdcolss2-64.asm b/simd/jdcolss2-64.asm
new file mode 100644
index 0000000..5e8a322
--- /dev/null
+++ b/simd/jdcolss2-64.asm
@@ -0,0 +1,117 @@
+;
+; jdcolss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; FIX(x) below is x * 2^SCALEBITS, rounded to an integer
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits a signed word
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734, fits a signed word
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942, fits a signed word
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402 ; 8 words (one 128-bit operand) of FIX(0.40200)
+PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 interleaved (-0.34414, +0.28586) word pairs
+PW_ONE times 8 dw 1 ; 8 words of 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) ; 4 dwords of 0.5 in SCALEBITS fixed point
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED ; discard the layout used by the %include above (not set in this file)
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgb: R,G,B = 0,1,2 (presumably byte offsets within a pixel)
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; presumably bytes per output pixel
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 ; alias: next %include presumably emits the extrgb entry point
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgbx: R,G,B = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 ; alias for the extrgbx variant
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgr: B,G,R = 0,1,2 with pixel size 3
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 ; alias for the extbgr variant
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgrx: B,G,R = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 ; alias for the extbgrx variant
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3 ; extxbgr: B,G,R = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 ; alias for the extxbgr variant
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1 ; extxrgb: R,G,B = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 ; alias for the extxrgb variant
+%include "jdclrss2-64.asm"
diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm
new file mode 100644
index 0000000..7ae985d
--- /dev/null
+++ b/simd/jdcolss2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; FIX(x) below is x * 2^SCALEBITS, rounded to an integer
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits a signed word
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734, fits a signed word
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942, fits a signed word
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402 ; 8 words (one 128-bit operand) of FIX(0.40200)
+PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 interleaved (-0.34414, +0.28586) word pairs
+PW_ONE times 8 dw 1 ; 8 words of 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) ; 4 dwords of 0.5 in SCALEBITS fixed point
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+%include "jdclrss2.asm"
+
+%undef RGB_RED ; discard the layout used by the %include above (not set in this file)
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgb: R,G,B = 0,1,2 (presumably byte offsets within a pixel)
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; presumably bytes per output pixel
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2 ; alias: next %include presumably emits the extrgb entry point
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgbx: R,G,B = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2 ; alias for the extrgbx variant
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgr: B,G,R = 0,1,2 with pixel size 3
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2 ; alias for the extbgr variant
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgrx: B,G,R = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2 ; alias for the extbgrx variant
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3 ; extxbgr: B,G,R = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2 ; alias for the extxbgr variant
+%include "jdclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1 ; extxrgb: R,G,B = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2 ; alias for the extxrgb variant
+%include "jdclrss2.asm"
diff --git a/simd/jdct.inc b/simd/jdct.inc
new file mode 100644
index 0000000..cc62704
--- /dev/null
+++ b/simd/jdct.inc
@@ -0,0 +1,28 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required. We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
+
+%define ROW(n,b,s) ((b)+(n)*(s)) ; base b advanced by n steps of s bytes
+%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) ; base b advanced by n steps of DCTSIZE*s bytes
+
+%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) ; m steps of DCTSIZE*s bytes, then the n-th dword
+%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) ; same, then the n-th MMX-sized word
+%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) ; same, then the n-th XMM-sized word
+
+; --------------------------------------------------------------------------
diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm
new file mode 100644
index 0000000..fd587fb
--- /dev/null
+++ b/simd/jdmermmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmermmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; FIX(x) below is x * 2^SCALEBITS, rounded to an integer
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits a signed word
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734, fits a signed word
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942, fits a signed word
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402 times 4 dw F_0_402 ; 4 words (one 64-bit operand) of FIX(0.40200)
+PW_MF0228 times 4 dw -F_0_228 ; 4 words of -FIX(0.22800)
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 ; 2 interleaved (-0.34414, +0.28586) word pairs
+PW_ONE times 4 dw 1 ; 4 words of 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) ; 2 dwords of 0.5 in SCALEBITS fixed point
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED ; discard the layout used by the %include above (not set in this file)
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgb: R,G,B = 0,1,2 (presumably byte offsets within a pixel)
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; presumably bytes per output pixel
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx ; alias picked up by the next %include
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgbx: R,G,B = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx ; aliases for the extrgbx variant
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgr: B,G,R = 0,1,2 with pixel size 3
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx ; aliases for the extbgr variant
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgrx: B,G,R = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx ; aliases for the extbgrx variant
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3 ; extxbgr: B,G,R = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx ; aliases for the extxbgr variant
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1 ; extxrgb: R,G,B = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx ; aliases for the extxrgb variant
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"
diff --git a/simd/jdmerss2-64.asm b/simd/jdmerss2-64.asm
new file mode 100644
index 0000000..2f9c5c1
--- /dev/null
+++ b/simd/jdmerss2-64.asm
@@ -0,0 +1,123 @@
+;
+; jdmerss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; FIX(x) below is x * 2^SCALEBITS, rounded to an integer
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits a signed word
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734, fits a signed word
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942, fits a signed word
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402 ; 8 words (one 128-bit operand) of FIX(0.40200)
+PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 interleaved (-0.34414, +0.28586) word pairs
+PW_ONE times 8 dw 1 ; 8 words of 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) ; 4 dwords of 0.5 in SCALEBITS fixed point
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED ; discard the layout used by the %include above (not set in this file)
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgb: R,G,B = 0,1,2 (presumably byte offsets within a pixel)
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; presumably bytes per output pixel
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2 ; alias: next %include presumably emits the extrgb entry points
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgbx: R,G,B = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2 ; aliases for the extrgbx variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgr: B,G,R = 0,1,2 with pixel size 3
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2 ; aliases for the extbgr variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgrx: B,G,R = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2 ; aliases for the extbgrx variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3 ; extxbgr: B,G,R = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2 ; aliases for the extxbgr variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1 ; extxrgb: R,G,B = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2 ; aliases for the extxrgb variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
diff --git a/simd/jdmerss2.asm b/simd/jdmerss2.asm
new file mode 100644
index 0000000..2294e0d
--- /dev/null
+++ b/simd/jdmerss2.asm
@@ -0,0 +1,123 @@
+;
+; jdmerss2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16 ; FIX(x) below is x * 2^SCALEBITS, rounded to an integer
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) = 26345, fits a signed word
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) = 18734, fits a signed word
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) = 14942, fits a signed word
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402 ; 8 words (one 128-bit operand) of FIX(0.40200)
+PW_MF0228 times 8 dw -F_0_228 ; 8 words of -FIX(0.22800)
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ; 4 interleaved (-0.34414, +0.28586) word pairs
+PW_ONE times 8 dw 1 ; 8 words of 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) ; 4 dwords of 0.5 in SCALEBITS fixed point
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+%include "jdmrgss2.asm"
+
+%undef RGB_RED ; discard the layout used by the %include above (not set in this file)
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgb: R,G,B = 0,1,2 (presumably byte offsets within a pixel)
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3 ; presumably bytes per output pixel
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2 ; alias: next %include presumably emits the extrgb entry points
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0 ; extrgbx: R,G,B = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2 ; aliases for the extrgbx variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgr: B,G,R = 0,1,2 with pixel size 3
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2 ; aliases for the extbgr variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2 ; extbgrx: B,G,R = 0,1,2 with pixel size 4
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2 ; aliases for the extbgrx variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3 ; extxbgr: B,G,R = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2 ; aliases for the extxbgr variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1 ; extxrgb: R,G,B = 1,2,3 with pixel size 4 (index 0 unused here)
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2 ; aliases for the extxrgb variant
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgss2.asm"
diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm
new file mode 100644
index 0000000..f5fa936
--- /dev/null
+++ b/simd/jdmrgmmx.asm
@@ -0,0 +1,466 @@
+;
+; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8 ; JDIMENSION output_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp+0 ; slot that the prologue fills with the pre-alignment stack pointer
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 3 ; scratch slots: wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; room for the saved stack pointer
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; original_ebp slot <- unaligned frame base
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve WK_NUM mmwords of scratch below ebp
+ pushpic eax ; make room for the GOT address slot
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx,ecx
+ jz near .return ; zero-width row: nothing to convert
+
+ push ecx ; ecx is reused below as the row index
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; input_buf[0] (read as Y below)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; input_buf[1] (read as Cb below)
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; input_buf[2] (read as Cr below)
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16,7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
+ movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
+
+ pxor mm1,mm1 ; mm1=(all 0's)
+ pcmpeqw mm3,mm3
+ psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ movq mm4,mm6
+ punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH
+ punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL
+ movq mm0,mm7
+ punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH
+ punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL
+
+ paddw mm6,mm3 ; CbH -= 128 (0xFF80 is -128 as a signed word)
+ paddw mm4,mm3 ; CbL -= 128
+ paddw mm7,mm3 ; CrH -= 128
+ paddw mm0,mm3 ; CrL -= 128
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movq mm5,mm6 ; mm5=CbH
+ movq mm2,mm4 ; mm2=CbL
+ paddw mm6,mm6 ; mm6=2*CbH
+ paddw mm4,mm4 ; mm4=2*CbL
+ movq mm1,mm7 ; mm1=CrH
+ movq mm3,mm0 ; mm3=CrL
+ paddw mm7,mm7 ; mm7=2*CrH
+ paddw mm0,mm0 ; mm0=2*CrL
+
+ pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
+ pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
+ pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
+ pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
+
+ paddw mm6,[GOTOFF(eax,PW_ONE)] ; +1 so the >>1 below rounds
+ paddw mm4,[GOTOFF(eax,PW_ONE)]
+ psraw mm6,1 ; mm6=(CbH * -FIX(0.22800))
+ psraw mm4,1 ; mm4=(CbL * -FIX(0.22800))
+ paddw mm7,[GOTOFF(eax,PW_ONE)] ; +1 so the >>1 below rounds
+ paddw mm0,[GOTOFF(eax,PW_ONE)]
+ psraw mm7,1 ; mm7=(CrH * FIX(0.40200))
+ psraw mm0,1 ; mm0=(CrL * FIX(0.40200))
+
+ paddw mm6,mm5
+ paddw mm4,mm2
+ paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
+ movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
+
+ movq mm6,mm5 ; mm6=CbH (copy)
+ movq mm7,mm2 ; mm7=CbL (copy)
+ punpcklwd mm5,mm1 ; interleave (CbH,CrH) word pairs for pmaddwd
+ punpckhwd mm6,mm1
+ pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm2,mm3 ; interleave (CbL,CrL) word pairs for pmaddwd
+ punpckhwd mm7,mm3
+ pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm5,[GOTOFF(eax,PD_ONEHALF)] ; add 0.5 so the shift by SCALEBITS rounds
+ paddd mm6,[GOTOFF(eax,PD_ONEHALF)]
+ psrad mm5,SCALEBITS
+ psrad mm6,SCALEBITS
+ paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
+ paddd mm7,[GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2,SCALEBITS
+ psrad mm7,SCALEBITS
+
+ packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
+
+ mov al,2 ; Yctr (two 8-sample Y groups per chroma load; overwrites the low byte of the GOT pointer)
+ jmp short .Yloop_1st ; first pass uses the L halves still in mm0/mm2/mm4
+ alignx 16,7
+
+.Yloop_2nd:
+ movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
+ movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
+ movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
+ alignx 16,7
+
+.Yloop_1st:
+ movq mm7, MMWORD [esi] ; mm7=Y(01234567)
+
+ pcmpeqw mm6,mm6
+ psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+ pand mm6,mm7 ; mm6=Y(0246)=YE
+ psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO
+
+ movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H)
+ movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H)
+ movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H)
+
+ paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+ paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+ packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+ punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG,mmA
+ movq mmH,mmA
+ punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC,mmD
+ movq mmB,mmD
+ punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF,mmE
+ punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD ; fewer than one mmword of pixels left?
+ jb short .column_st16 ; yes: store only the remaining part
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD ; one full group of pixels written
+ jz short .endcolumn
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd ; second Y group reuses the H halves saved in wk()
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st16:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD ; ecx now counts remaining output bytes
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA,mmC ; pending data for the narrower stores below
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4 ; skip the one-mmword step
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA,mmE ; pending data for the narrower stores below
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax,mmA ; low dword of the pending data
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov DWORD [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA,DWORD_BIT ; bring the high dword down
+ movd eax,mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [edi+0*SIZEOF_WORD], ax
+ shr eax,WORD_BIT ; bring the next byte into al
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .endcolumn
+ mov BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC,mmA
+ punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG,mmB
+ punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD,mmA
+ punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH,mmC
+ punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD ; fewer than one mmword of pixels left?
+ jb short .column_st16 ; yes: store only the remaining part
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD ; one full group of pixels written
+ jz short .endcolumn
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd ; second Y group reuses the H halves saved in wk()
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st16:
+ cmp ecx, byte SIZEOF_MMWORD/2 ; ecx still counts pixels here (2 pixels per mmword)
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA,mmC ; shift the remaining data down
+ movq mmD,mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA,mmD ; shift the remaining data down
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .endcolumn
+ movd DWORD [edi+0*SIZEOF_DWORD], mmA ; last single pixel
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Implemented as two calls to jsimd_h2v1_merged_upsample_mmx, one per output row.
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; The second call reuses the first call's stack arguments with two slots patched.
+
+%define output_width(b) (b)+8 ; JDIMENSION output_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+
+ align 16
+ global EXTN(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+ push ebp
+ mov ebp,esp
+ push ebx ; ebx/esi/edi are saved here and restored before ret
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, JDIMENSION [output_width(ebp)]
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0] (row-pointer array)
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][ctr]; presumably the callee indexes by ctr again (row 2*ctr)
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx,esp ; ebx -> the three entries just pushed, used as a temporary input_buf
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi ; patch entry 0 of the temporary input_buf
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi ; patch the output_buf argument slot pushed just below ebx
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+ add esp, byte 7*SIZEOF_DWORD ; drop the 3 temporary entries + 4 call arguments
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
new file mode 100644
index 0000000..3e54c7a
--- /dev/null
+++ b/simd/jdmrgss2-64.asm
@@ -0,0 +1,569 @@
+;
+; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+; Each 16-sample Cb/Cr vector is converted once and applied to two 16-sample Y vectors.
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; JDIMENSION arguments are read as 32-bit values (r10d, r12d) and zero-extended.
+
+; r10 = JDIMENSION output_width (only the low 32 bits, r10d, are used)
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr (only the low 32 bits, r12d, are used)
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+
+ align 16
+ global EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push rbp
+ mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; keep the unaligned rsp so the epilogue can restore it
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve the wk[] scratch area below rbp
+ push rbx
+ collect_args ; macro defined elsewhere; presumably fills r10-r13 as listed above
+
+ mov ecx, r10d ; col (JDIMENSION assumed 32-bit; upper half of r10 is not defined, so zero-extend)
+ test rcx,rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d ; in_row_group_ctr, zero-extended before use as an index
+ mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdi, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
+
+.columnloop:
+
+ movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1,xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3,xmm3
+ psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4,xmm6
+ punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0,xmm7
+ punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6,xmm3 ; adding 0xFF80 (-128) re-centres each chroma word around zero
+ paddw xmm4,xmm3
+ paddw xmm7,xmm3
+ paddw xmm0,xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm5,xmm6 ; xmm5=CbH
+ movdqa xmm2,xmm4 ; xmm2=CbL
+ paddw xmm6,xmm6 ; xmm6=2*CbH
+ paddw xmm4,xmm4 ; xmm4=2*CbL
+ movdqa xmm1,xmm7 ; xmm1=CrH
+ movdqa xmm3,xmm0 ; xmm3=CrL
+ paddw xmm7,xmm7 ; xmm7=2*CrH
+ paddw xmm0,xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6,[rel PW_ONE] ; +1 before the shift by 1 below (rounding)
+ paddw xmm4,[rel PW_ONE]
+ psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7,[rel PW_ONE]
+ paddw xmm0,[rel PW_ONE]
+ psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6,xmm5
+ paddw xmm4,xmm2
+ paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6,xmm5
+ movdqa xmm7,xmm2
+ punpcklwd xmm5,xmm1 ; interleave Cb/Cr words so pmaddwd forms Cb*c0+Cr*c1 per pixel
+ punpckhwd xmm6,xmm1
+ pmaddwd xmm5,[rel PW_MF0344_F0285]
+ pmaddwd xmm6,[rel PW_MF0344_F0285]
+ punpcklwd xmm2,xmm3
+ punpckhwd xmm7,xmm3
+ pmaddwd xmm2,[rel PW_MF0344_F0285]
+ pmaddwd xmm7,[rel PW_MF0344_F0285]
+
+ paddd xmm5,[rel PD_ONEHALF]
+ paddd xmm6,[rel PD_ONEHALF]
+ psrad xmm5,SCALEBITS
+ psrad xmm6,SCALEBITS
+ paddd xmm2,[rel PD_ONEHALF]
+ paddd xmm7,[rel PD_ONEHALF]
+ psrad xmm2,SCALEBITS
+ psrad xmm7,SCALEBITS
+
+ packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al,2 ; Yctr
+ jmp short .Yloop_1st ; first pass uses the L halves still live in xmm0/xmm2/xmm4
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6,xmm6
+ psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG,xmmA
+ movdqa xmmH,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC,xmmD
+ movdqa xmmB,xmmD
+ punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF,xmmE
+ punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB,xmmE
+ punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB,xmmF
+ punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32 ; fewer than 16 pixels left: partial store
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ lea rcx, [rcx+rcx*2] ; imul rcx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmF
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ mov rax,rcx ; rax = leftover byte count (< SIZEOF_XMMWORD)
+ xor rcx, byte 0x0F
+ shl rcx, 2 ; rcx = (15 - leftover) * 4, a bit-shift count
+ movd xmmB,ecx
+ psrlq xmmH,4
+ pcmpeqb xmmE,xmmE
+ psrlq xmmH,xmmB
+ psrlq xmmE,xmmB
+ punpcklbw xmmE,xmmH ; xmmE = byte mask for the final maskmovdqu (leftover bytes only)
+ ; ----------------
+ mov rcx,rdi
+ and rcx, byte SIZEOF_XMMWORD-1 ; rcx = misalignment of outptr
+ jz short .adj0
+ add rax,rcx
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; data would cross the 16-byte boundary: store unshifted
+ and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmG,xmmA
+ movdqa xmmC,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmD,ecx
+ sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1
+ movd xmmF,ecx
+ psllq xmmA,xmmF
+ psllq xmmE,xmmF
+ jmp short .adj0
+.adj1: neg rcx
+ movd xmmF,ecx
+ psrlq xmmA,xmmF
+ psrlq xmmE,xmmF
+ psllq xmmG,xmmD
+ psllq xmmC,xmmD
+ por xmmA,xmmG
+ por xmmE,xmmC
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG,xmmB
+ punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH,xmmC
+ punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32 ; fewer than 16 pixels left: partial store
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmC
+ movdqa xmmD,xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ cmp rcx, byte SIZEOF_XMMWORD/16
+ jb near .endcolumn
+ mov rax,rcx ; rax = leftover pixel count (1..3)
+ xor rcx, byte 0x03
+ inc rcx
+ shl rcx, 4 ; rcx = (4 - leftover) * 16, a bit-shift count
+ movd xmmF,ecx
+ psrlq xmmE,xmmF
+ punpcklbw xmmE,xmmE ; xmmE = byte mask for the final maskmovdqu (leftover pixels only)
+ ; ----------------
+ mov rcx,rdi
+ and rcx, byte SIZEOF_XMMWORD-1 ; rcx = misalignment of outptr
+ jz short .adj0
+ lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .adj0 ; data would cross the 16-byte boundary: store unshifted
+ and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmB,xmmA
+ movdqa xmmG,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmC,ecx
+ sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1
+ movd xmmH,ecx
+ psllq xmmA,xmmH
+ psllq xmmE,xmmH
+ jmp short .adj0
+.adj1: neg rcx
+ movd xmmH,ecx
+ psrlq xmmA,xmmH
+ psrlq xmmE,xmmH
+ psllq xmmB,xmmC
+ psllq xmmG,xmmC
+ por xmmA,xmmB
+ por xmmE,xmmG
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ uncollect_args
+ pop rbx
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value saved in the prologue (points at saved rbp)
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2, one per output row.
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+; JDIMENSION arguments are read as 32-bit values (r10d, r12d) and zero-extended.
+
+; r10 = JDIMENSION output_width (only the low 32 bits, r10d, are used)
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr (only the low 32 bits, r12d, are used)
+; r13 = JSAMPARRAY output_buf
+
+ align 16
+ global EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+ push rbp
+ mov rbp,rsp
+ push rbx
+ collect_args ; macro defined elsewhere; presumably fills r10-r13 as listed above
+
+ mov eax, r10d ; output_width (JDIMENSION assumed 32-bit; zero-extend, upper half of r10 is not defined)
+
+ mov rdi, r11
+ mov ecx, r12d ; in_row_group_ctr, zero-extended before use as an index
+ mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; rsi = &input_buf[0][ctr]; the callee indexes by ctr again (row 2*ctr)
+
+ push rdx ; inptr2
+ push rbx ; inptr1
+ push rsi ; inptr00
+ mov rbx,rsp ; rbx -> the three entries just pushed, used as a temporary input_buf
+
+ push rdi ; save output_buf, ctr and width across the call
+ push rcx
+ push rax
+
+ mov rdx, rcx ; arg 3: in_row_group_ctr
+ mov rcx, rdi ; arg 4: output_buf
+ mov rdi, rax ; arg 1: output_width
+ mov rsi, rbx ; arg 2: temporary input_buf
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ pop rsi ; reload the temporary input_buf entries
+ pop rbx
+ pop rdx
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ push rdx ; inptr2
+ push rbx ; inptr1
+ push rsi ; inptr00
+ mov rbx,rsp ; rbx -> temporary input_buf for the second row
+
+ push rdi
+ push rcx
+ push rax
+
+ mov rdx, rcx ; arg 3: in_row_group_ctr
+ mov rcx, rdi ; arg 4: output_buf (advanced by one row pointer)
+ mov rdi, rax ; arg 1: output_width
+ mov rsi, rbx ; arg 2: temporary input_buf
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ pop rsi
+ pop rbx
+ pop rdx
+
+ uncollect_args
+ pop rbx
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
new file mode 100644
index 0000000..99b7eb9
--- /dev/null
+++ b/simd/jdmrgss2.asm
@@ -0,0 +1,564 @@
+;
+; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8 ; JDIMENSION output_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx,ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16,7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1,xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3,xmm3
+ psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4,xmm6
+ punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0,xmm7
+ punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6,xmm3
+ paddw xmm4,xmm3
+ paddw xmm7,xmm3
+ paddw xmm0,xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm5,xmm6 ; xmm5=CbH
+ movdqa xmm2,xmm4 ; xmm2=CbL
+ paddw xmm6,xmm6 ; xmm6=2*CbH
+ paddw xmm4,xmm4 ; xmm4=2*CbL
+ movdqa xmm1,xmm7 ; xmm1=CrH
+ movdqa xmm3,xmm0 ; xmm3=CrL
+ paddw xmm7,xmm7 ; xmm7=2*CrH
+ paddw xmm0,xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6,[GOTOFF(eax,PW_ONE)]
+ paddw xmm4,[GOTOFF(eax,PW_ONE)]
+ psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7,[GOTOFF(eax,PW_ONE)]
+ paddw xmm0,[GOTOFF(eax,PW_ONE)]
+ psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6,xmm5
+ paddw xmm4,xmm2
+ paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6,xmm5
+ movdqa xmm7,xmm2
+ punpcklwd xmm5,xmm1
+ punpckhwd xmm6,xmm1
+ pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm2,xmm3
+ punpckhwd xmm7,xmm3
+ pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm6,[GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm5,SCALEBITS
+ psrad xmm6,SCALEBITS
+ paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm7,[GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2,SCALEBITS
+ psrad xmm7,SCALEBITS
+
+ packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al,2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16,7
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+ alignx 16,7
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6,xmm6
+ psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG,xmmA
+ movdqa xmmH,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC,xmmD
+ movdqa xmmB,xmmD
+ punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF,xmmE
+ punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB,xmmE
+ punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB,xmmF
+ punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
+ add edi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st32:
+ pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ mov eax,ecx
+ xor ecx, byte 0x0F
+ shl ecx, 2
+ movd xmmB,ecx
+ psrlq xmmH,4
+ pcmpeqb xmmE,xmmE
+ psrlq xmmH,xmmB
+ psrlq xmmE,xmmB
+ punpcklbw xmmE,xmmH
+ ; ----------------
+ mov ecx,edi
+ and ecx, byte SIZEOF_XMMWORD-1
+ jz short .adj0
+ add eax,ecx
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .adj0
+ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmG,xmmA
+ movdqa xmmC,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmD,ecx
+ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1
+ movd xmmF,ecx
+ psllq xmmA,xmmF
+ psllq xmmE,xmmF
+ jmp short .adj0
+.adj1: neg ecx
+ movd xmmF,ecx
+ psrlq xmmA,xmmF
+ psrlq xmmE,xmmF
+ psllq xmmG,xmmD
+ psllq xmmC,xmmD
+ por xmmA,xmmG
+ por xmmE,xmmC
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC,xmmA
+ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG,xmmB
+ punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD,xmmA
+ punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH,xmmC
+ punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
+ add edi, byte SIZEOF_XMMWORD ; outptr
+.out0:
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16,7
+
+.column_st32:
+ pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmC
+ movdqa xmmD,xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA,xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ cmp ecx, byte SIZEOF_XMMWORD/16
+ jb short .endcolumn
+ mov eax,ecx
+ xor ecx, byte 0x03
+ inc ecx
+ shl ecx, 4
+ movd xmmF,ecx
+ psrlq xmmE,xmmF
+ punpcklbw xmmE,xmmE
+ ; ----------------
+ mov ecx,edi
+ and ecx, byte SIZEOF_XMMWORD-1
+ jz short .adj0
+ lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .adj0
+ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
+ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
+ movdqa xmmB,xmmA
+ movdqa xmmG,xmmE
+ pslldq xmmA, SIZEOF_XMMWORD/2
+ pslldq xmmE, SIZEOF_XMMWORD/2
+ movd xmmC,ecx
+ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+ jb short .adj1
+ movd xmmH,ecx
+ psllq xmmA,xmmH
+ psllq xmmE,xmmH
+ jmp short .adj0
+.adj1: neg ecx
+ movd xmmH,ecx
+ psrlq xmmA,xmmH
+ psrlq xmmE,xmmH
+ psllq xmmB,xmmC
+ psllq xmmG,xmmC
+ por xmmA,xmmB
+ por xmmE,xmmG
+.adj0: ; ----------------
+ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b)+8 ; JDIMENSION output_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+
+ align 16
+ global EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2): ; h2v2 is done as two calls to the h2v1 routine, one per output row
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, JDIMENSION [output_width(ebp)] ; output_width is a JDIMENSION, not a pointer
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; esi = input_buf[0]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; ebx = input_buf[1]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; edx = input_buf[2]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][in_row_group_ctr]
+; Build a temporary 3-entry input_buf on the stack: { esi, input_buf[1], input_buf[2] }
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx,esp ; ebx = address of the temporary input_buf
+; Push the four cdecl arguments for the h2v1 routine (right to left)
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+; NOTE(review): presumably the callee indexes input_buf[0] by in_row_group_ctr again - confirm
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2)
+; Second pass: same argument slots, with the luma and output row pointers advanced by one
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi ; patch entry 0 of the temporary input_buf
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi ; patch the output_buf argument slot just below it
+
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ add esp, byte 7*SIZEOF_DWORD ; drop 4 arguments + 3 temporary input_buf entries
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdsammmx.asm b/simd/jdsammmx.asm
new file mode 100644
index 0000000..c09e5b9
--- /dev/null
+++ b/simd/jdsammmx.asm
@@ -0,0 +1,737 @@
+;
+; jdsammmx.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE times 4 dw 1
+PW_TWO times 4 dw 2
+PW_THREE times 4 dw 3
+PW_SEVEN times 4 dw 7
+PW_EIGHT times 4 dw 8
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx): ; 2:1 horizontal triangle-filter upsampling, 8 input samples per iteration
+ push ebp
+ mov ebp,esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax,eax
+ jz near .return ; nothing to do for a zero-width row
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx,ecx
+ jz near .return ; nothing to do for zero rows
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_MMWORD-1
+ jz short .skip ; width is a multiple of 8: no padding sample needed
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (copy of the last one) just past the row
+.skip:
+ pxor mm0,mm0 ; mm0=(all 0's)
+ pcmpeqb mm7,mm7
+ psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=mask selecting the lowest byte
+ pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=sample 0, used as the "-1" neighbour of the first block
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD ; round colctr up to a multiple of 8
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop ; more than one block: take the general path
+ alignx 16,7
+
+.columnloop_last: ; last block: the right-hand neighbour is the block's own last sample
+ pcmpeqb mm6,mm6
+ psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=mask selecting the highest byte
+ pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=( - - - - - - - 7)
+ jmp short .upsample
+ alignx 16,7
+
+.columnloop: ; general block: the right-hand neighbour is sample 0 of the next block
+ movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+ psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=( - - - - - - - 8)
+
+.upsample: ; even output = (3*s[i]+s[i-1]+1)>>2, odd output = (3*s[i]+s[i+1]+2)>>2
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm2,mm1
+ movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7)
+ psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
+ psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
+
+ por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6)
+ por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8)
+
+ movq mm7,mm1
+ psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -), carried into the next block as its "-1"
+
+ movq mm4,mm1
+ punpcklbw mm1,mm0 ; mm1=( 0 1 2 3)
+ punpckhbw mm4,mm0 ; mm4=( 4 5 6 7)
+ movq mm5,mm2
+ punpcklbw mm2,mm0 ; mm2=(-1 0 1 2)
+ punpckhbw mm5,mm0 ; mm5=( 3 4 5 6)
+ movq mm6,mm3
+ punpcklbw mm3,mm0 ; mm3=( 1 2 3 4)
+ punpckhbw mm6,mm0 ; mm6=( 5 6 7 8)
+
+ pmullw mm1,[GOTOFF(ebx,PW_THREE)] ; 3*s[i], low half
+ pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; 3*s[i], high half
+ paddw mm2,[GOTOFF(ebx,PW_ONE)] ; s[i-1]+1
+ paddw mm5,[GOTOFF(ebx,PW_ONE)]
+ paddw mm3,[GOTOFF(ebx,PW_TWO)] ; s[i+1]+2
+ paddw mm6,[GOTOFF(ebx,PW_TWO)]
+
+ paddw mm2,mm1
+ paddw mm5,mm4
+ psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6)
+ psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14)
+ paddw mm3,mm1
+ paddw mm6,mm4
+ psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7)
+ psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15)
+
+ psllw mm3,BYTE_BIT ; move odd outputs into the high byte of each word
+ psllw mm6,BYTE_BIT
+ por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
+ por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+ sub eax, byte SIZEOF_MMWORD
+ add esi, byte 1*SIZEOF_MMWORD ; inptr
+ add edi, byte 2*SIZEOF_MMWORD ; outptr
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop ; more than one block left
+ test eax,eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx): ; 2:1 horizontal + 2:1 vertical triangle-filter upsampling
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; remember the unaligned stack pointer for the epilogue
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve the wk[WK_NUM] scratch area below ebp
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx,eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax,eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx,ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop: ; one input row produces two output rows
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_MMWORD-1
+ jz short .skip ; width is a multiple of 8: no padding sample needed
+ push edx ; dl is used as scratch below
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
+ movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
+ movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
+
+ pushpic ebx ; ebx holds inptr0; save it while ebx serves as the GOT base
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3,mm3 ; mm3=(all 0's)
+ movq mm4,mm0
+ punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3)
+ punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7)
+ movq mm5,mm1
+ punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3)
+ punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7)
+ movq mm6,mm2
+ punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3)
+ punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7)
+
+ pmullw mm0,[GOTOFF(ebx,PW_THREE)] ; 3*row[0], low half
+ pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; 3*row[0], high half
+
+ pcmpeqb mm7,mm7
+ psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm7=mask selecting the lowest word
+
+ paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 ; (the output rows double as 16-bit scratch)
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+ pand mm1,mm7 ; mm1=( 0 - - -)
+ pand mm2,mm7 ; mm2=( 0 - - -)
+
+ movq MMWORD [wk(0)], mm1 ; wk(0)=left neighbour for the upper row
+ movq MMWORD [wk(1)], mm2 ; wk(1)=left neighbour for the lower row
+
+ poppic ebx
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD ; round colctr up to a multiple of 8
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop ; more than one block: take the general path
+ alignx 16,7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb mm1,mm1
+ psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=mask selecting the highest word
+ movq mm2,mm1
+
+ pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
+ pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
+
+ movq MMWORD [wk(2)], mm1 ; wk(2)=right neighbour for the upper row
+ movq MMWORD [wk(3)], mm2 ; wk(3)=right neighbour for the lower row
+
+ jmp short .upsample
+ alignx 16,7
+
+.columnloop:
+ ; -- process the next column block
+
+ movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
+ movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3,mm3 ; mm3=(all 0's)
+ movq mm4,mm0
+ punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3)
+ punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7)
+ movq mm5,mm1
+ punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3)
+ punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7)
+ movq mm6,mm2
+ punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3)
+ punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7)
+
+ pmullw mm0,[GOTOFF(ebx,PW_THREE)]
+ pmullw mm4,[GOTOFF(ebx,PW_THREE)]
+
+ paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 ; (in the next block's output slots)
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+ psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
+ psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
+
+ movq MMWORD [wk(2)], mm1 ; wk(2)=right neighbour for the upper row
+ movq MMWORD [wk(3)], mm2 ; wk(3)=right neighbour for the lower row
+
+.upsample: ; even output = (3*Int[i]+Int[i-1]+8)>>4, odd output = (3*Int[i]+Int[i+1]+7)>>4
+ ; -- process the upper row
+
+ movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
+ movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
+
+ movq mm0,mm7
+ movq mm4,mm3
+ psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -)
+ psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
+ movq mm5,mm7
+ movq mm6,mm3
+ psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
+ psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6)
+
+ por mm0,mm4 ; mm0=( 1 2 3 4)
+ por mm5,mm6 ; mm5=( 3 4 5 6)
+
+ movq mm1,mm7
+ movq mm2,mm3
+ psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -)
+ movq mm4,mm3
+ psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
+
+ por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
+ por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
+
+ movq MMWORD [wk(0)], mm4 ; becomes the left neighbour of the next block
+
+ pmullw mm7,[GOTOFF(ebx,PW_THREE)]
+ pmullw mm3,[GOTOFF(ebx,PW_THREE)]
+ paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
+ paddw mm5,[GOTOFF(ebx,PW_EIGHT)]
+ paddw mm0,[GOTOFF(ebx,PW_SEVEN)]
+ paddw mm2,[GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1,mm7
+ paddw mm5,mm3
+ psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6)
+ psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14)
+ paddw mm0,mm7
+ paddw mm2,mm3
+ psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7)
+ psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15)
+
+ psllw mm0,BYTE_BIT ; move odd outputs into the high byte of each word
+ psllw mm2,BYTE_BIT
+ por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
+ por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; final samples overwrite the saved intermediates
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+ ; -- process the lower row
+
+ movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
+ movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
+
+ movq mm7,mm6
+ movq mm3,mm4
+ psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -)
+ psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
+ movq mm0,mm6
+ movq mm2,mm4
+ psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
+ psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6)
+
+ por mm7,mm3 ; mm7=( 1 2 3 4)
+ por mm0,mm2 ; mm0=( 3 4 5 6)
+
+ movq mm1,mm6
+ movq mm5,mm4
+ psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -)
+ movq mm3,mm4
+ psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
+
+ por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
+ por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
+
+ movq MMWORD [wk(1)], mm3 ; becomes the left neighbour of the next block
+
+ pmullw mm6,[GOTOFF(ebx,PW_THREE)]
+ pmullw mm4,[GOTOFF(ebx,PW_THREE)]
+ paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
+ paddw mm0,[GOTOFF(ebx,PW_EIGHT)]
+ paddw mm7,[GOTOFF(ebx,PW_SEVEN)]
+ paddw mm5,[GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1,mm6
+ paddw mm0,mm4
+ psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6)
+ psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14)
+ paddw mm7,mm6
+ paddw mm5,mm4
+ psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7)
+ psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15)
+
+ psllw mm7,BYTE_BIT
+ psllw mm5,BYTE_BIT
+ por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
+ por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+ poppic ebx ; ebx is inptr0 again
+
+ sub eax, byte SIZEOF_MMWORD
+ add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_MMWORD ; inptr0
+ add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_MMWORD ; outptr0
+ add edi, byte 2*SIZEOF_MMWORD ; outptr1
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop ; more than one block left
+ test eax,eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
+; JDIMENSION output_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define output_width(b) (b)+12 ; JDIMENSION output_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx): ; 2:1 horizontal replication: every input sample is written twice
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD) ; round output_width up to a multiple of 16
+ jz short .return ; zero width: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx,ecx
+ jz short .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax,edx ; colctr
+ alignx 16,7
+.columnloop: ; unrolled twice: up to 16 input samples -> 32 output samples per pass
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1,mm0
+ punpcklbw mm0,mm0 ; duplicate input samples 0-3
+ punpckhbw mm1,mm1 ; duplicate input samples 4-7
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow ; row finished after the first half of the unrolled body
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3,mm2
+ punpcklbw mm2,mm2 ; duplicate input samples 8-11
+ punpckhbw mm3,mm3 ; duplicate input samples 12-15
+
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 4*SIZEOF_MMWORD ; outptr
+ jmp short .columnloop
+ alignx 16,7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
+; JDIMENSION output_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define output_width(b) (b)+12 ; JDIMENSION output_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx): ; 2:1 replication in both directions: each input row fills two output rows
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD) ; round output_width up to a multiple of 16
+ jz near .return ; zero width: nothing to do
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx,ecx
+ jz short .return ; zero rows: nothing to do
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax,edx ; colctr
+ alignx 16,7
+.columnloop: ; unrolled twice; the same expanded data is stored to both output rows
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1,mm0
+ punpcklbw mm0,mm0 ; duplicate input samples 0-3
+ punpckhbw mm1,mm1 ; duplicate input samples 4-7
+
+ movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow ; row finished after the first half of the unrolled body
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3,mm2
+ punpcklbw mm2,mm2 ; duplicate input samples 8-11
+ punpckhbw mm3,mm3 ; duplicate input samples 12-15
+
+ movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add ebx, byte 4*SIZEOF_MMWORD ; outptr0
+ add edi, byte 4*SIZEOF_MMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16,7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr (two output rows per pass)
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdsamss2-64.asm b/simd/jdsamss2-64.asm
new file mode 100644
index 0000000..8521491
--- /dev/null
+++ b/simd/jdsamss2-64.asm
@@ -0,0 +1,668 @@
+;
+; jdsamss2-64.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE times 8 dw 1
+PW_TWO times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2): ; 2:1 horizontal triangle-filter upsampling, 16 input samples per iteration
+ push rbp
+ mov rbp,rsp
+ collect_args
+
+ mov eax, r11d ; colctr (32-bit JDIMENSION: upper half of r11 is undefined; writing eax zero-extends)
+ test rax,rax
+ jz near .return ; nothing to do for a zero-width row
+
+ movsxd rcx, r10d ; rowctr (32-bit int: sign-extend instead of trusting the upper half of r10)
+ test rcx,rcx
+ jz near .return ; nothing to do for zero rows
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdi, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rdi
+ push rsi
+
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rdi, JSAMPROW [rdi] ; outptr
+
+ test rax, SIZEOF_XMMWORD-1
+ jz short .skip ; width is a multiple of 16: no padding sample needed
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (copy of the last one) just past the row
+.skip:
+ pxor xmm0,xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7,xmm7
+ psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=mask selecting the lowest byte
+ pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm7=sample 0, used as the "-1" neighbour of the first block
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD ; round colctr up to a multiple of 16
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop ; more than one block: take the general path
+
+.columnloop_last: ; last block: the right-hand neighbour is the block's own last sample
+ pcmpeqb xmm6,xmm6
+ pslldq xmm6,(SIZEOF_XMMWORD-1) ; xmm6=mask selecting the highest byte
+ pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm6=(-- -- -- ... -- -- 15)
+ jmp short .upsample
+
+.columnloop: ; general block: the right-hand neighbour is sample 0 of the next block
+ movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ pslldq xmm6,(SIZEOF_XMMWORD-1) ; xmm6=(-- -- -- ... -- -- 16)
+
+.upsample: ; even output = (3*s[i]+s[i-1]+1)>>2, odd output = (3*s[i]+s[i+1]+2)>>2
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7,xmm1
+ psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --), carried into the next block as its "-1"
+
+ movdqa xmm4,xmm1
+ punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5,xmm2
+ punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6,xmm3
+ punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1,[rel PW_THREE] ; 3*s[i], low half
+ pmullw xmm4,[rel PW_THREE] ; 3*s[i], high half
+ paddw xmm2,[rel PW_ONE] ; s[i-1]+1
+ paddw xmm5,[rel PW_ONE]
+ paddw xmm3,[rel PW_TWO] ; s[i+1]+2
+ paddw xmm6,[rel PW_TWO]
+
+ paddw xmm2,xmm1
+ paddw xmm5,xmm4
+ psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3,xmm1
+ paddw xmm6,xmm4
+ psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3,BYTE_BIT ; move odd outputs into the high byte of each word
+ psllw xmm6,BYTE_BIT
+ por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+ sub rax, byte SIZEOF_XMMWORD
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop ; more than one block left
+ test rax,rax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop rsi
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 4 ; wk(0)/wk(1)=left-neighbour carry, wk(2)/wk(3)=right-neighbour carry
+; Each .rowloop pass reads three input rows (above/current/below) and writes two output rows.
+ align 16
+ global EXTN(jsimd_h2v2_fancy_upsample_sse2)
+; The two 32-bit arguments are narrowed explicitly; the upper halves of r10/r11 are not trusted.
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push rbp
+ mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
+ sub rsp, byte 4 ; step below the saved rbp before aligning
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; keep the unaligned rsp for the epilogue's "pop rsp"
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve WK_NUM xmmwords of scratch below rbp
+ push rbx
+ collect_args ; macro defined elsewhere; presumably fills r10-r13 - confirm
+
+ mov eax, r11d ; colctr: 32-bit JDIMENSION, zero-extended into rax
+ test rax,rax
+ jz near .return
+
+ movsxd rcx, r10d ; rowctr: 32-bit int, sign-extended into rcx
+ test rcx,rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdi, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rcx ; rowctr
+ push rdi ; output_data cursor
+ push rsi ; input_data cursor
+
+ mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test rax, SIZEOF_XMMWORD-1 ; width a whole number of xmmwords?
+ jz short .skip
+ push rdx ; dl is borrowed as a byte temporary
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop rdx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pxor xmm3,xmm3 ; xmm3=(all 0's)
+ movdqa xmm4,xmm0
+ punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5,xmm1
+ punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6,xmm2
+ punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0,[rel PW_THREE] ; 3*row[0], low half
+ pmullw xmm4,[rel PW_THREE] ; 3*row[0], high half
+
+ pcmpeqb xmm7,xmm7 ; all 1's ...
+ psrldq xmm7,(SIZEOF_XMMWORD-2) ; ... reduced to a mask for word 0
+
+ paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1 ; column 0 acts as its own left neighbour
+ movdqa XMMWORD [wk(1)], xmm2
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD ; round colctr up to whole xmmwords
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop ; more than one block: fetch the next one first
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pcmpeqb xmm1,xmm1 ; all 1's ...
+ pslldq xmm1,(SIZEOF_XMMWORD-2) ; ... reduced to a mask for word 7
+ movdqa xmm2,xmm1
+
+ pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pxor xmm3,xmm3 ; xmm3=(all 0's)
+ movdqa xmm4,xmm0
+ punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5,xmm1
+ punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6,xmm2
+ punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0,[rel PW_THREE] ; 3*row[0], low half
+ pmullw xmm4,[rel PW_THREE] ; 3*row[0], high half
+
+ paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1 ; next block's word 0 = right neighbour of word 15
+ movdqa XMMWORD [wk(3)], xmm2
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5,xmm7
+ movdqa xmm6,xmm3
+ psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1,xmm7
+ movdqa xmm2,xmm3
+ pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4,xmm3
+ psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; word 15 becomes the next block's left neighbour
+
+ pmullw xmm7,[rel PW_THREE] ; 3*Int0[i], low half
+ pmullw xmm3,[rel PW_THREE] ; 3*Int0[i], high half
+ paddw xmm1,[rel PW_EIGHT] ; left neighbours + 8
+ paddw xmm5,[rel PW_EIGHT]
+ paddw xmm0,[rel PW_SEVEN] ; right neighbours + 7
+ paddw xmm2,[rel PW_SEVEN]
+
+ paddw xmm1,xmm7
+ paddw xmm5,xmm3
+ psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0,xmm7
+ paddw xmm2,xmm3
+ psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0,BYTE_BIT ; odd outputs go to the high byte of each word
+ psllw xmm2,BYTE_BIT
+ por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; final samples replace the intermediates
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm4
+ psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1,xmm6
+ movdqa xmm5,xmm4
+ pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3,xmm4
+ psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3 ; word 15 becomes the next block's left neighbour
+
+ pmullw xmm6,[rel PW_THREE] ; 3*Int1[i], low half
+ pmullw xmm4,[rel PW_THREE] ; 3*Int1[i], high half
+ paddw xmm1,[rel PW_EIGHT] ; left neighbours + 8
+ paddw xmm0,[rel PW_EIGHT]
+ paddw xmm7,[rel PW_SEVEN] ; right neighbours + 7
+ paddw xmm5,[rel PW_SEVEN]
+
+ paddw xmm1,xmm6
+ paddw xmm0,xmm4
+ psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7,xmm6
+ paddw xmm5,xmm4
+ psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7,BYTE_BIT ; odd outputs go to the high byte of each word
+ psllw xmm5,BYTE_BIT
+ por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 ; final samples replace the intermediates
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+ sub rax, byte SIZEOF_XMMWORD ; one input block consumed
+ add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop ; at least two blocks left
+ test rax,rax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop rsi ; input_data cursor
+ pop rdi ; output_data cursor
+ pop rcx ; rowctr
+ pop rax ; colctr
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args
+ pop rbx
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION output_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v1_upsample_sse2)
+; Box filter: every input byte is written twice; 32-bit args are narrowed explicitly below.
+EXTN(jsimd_h2v1_upsample_sse2):
+ push rbp
+ mov rbp,rsp
+ collect_args ; macro defined elsewhere; presumably fills r10-r13 - confirm
+
+ mov edx, r11d ; output_width: 32-bit JDIMENSION, zero-extended into rdx
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD) ; round up to whole pairs of output xmmwords
+ jz near .return
+
+ movsxd rcx, r10d ; rowctr: 32-bit int, sign-extended into rcx
+ test rcx,rcx
+ jz short .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdi, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi ; output_data cursor
+ push rsi ; input_data cursor
+
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rdi, JSAMPROW [rdi] ; outptr
+ mov rax,rdx ; colctr
+.columnloop:
+ ; loop body is unrolled twice: two input xmmwords -> four output xmmwords
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1,xmm0
+ punpcklbw xmm0,xmm0 ; low 8 samples, each doubled
+ punpckhbw xmm1,xmm1 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ sub rax, byte 2*SIZEOF_XMMWORD ; count is in output bytes
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3,xmm2
+ punpcklbw xmm2,xmm2 ; low 8 samples, each doubled
+ punpckhbw xmm3,xmm3 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi ; input_data cursor
+ pop rdi ; output_data cursor
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg short .rowloop
+
+.return:
+ uncollect_args
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION output_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v2_upsample_sse2)
+; Box filter: each input byte is doubled and the row is written to two output rows.
+EXTN(jsimd_h2v2_upsample_sse2):
+ push rbp
+ mov rbp,rsp
+ push rbx
+ collect_args ; macro defined elsewhere; presumably fills r10-r13 - confirm
+
+ mov edx, r11d ; output_width: 32-bit JDIMENSION, zero-extended into rdx
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD) ; round up to whole pairs of output xmmwords
+ jz near .return
+
+ movsxd rcx, r10d ; rowctr: 32-bit int, sign-extended into rcx
+ test rcx,rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdi, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi ; output_data cursor
+ push rsi ; input_data cursor
+
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rax,rdx ; colctr
+.columnloop:
+ ; loop body is unrolled twice: two input xmmwords -> four output xmmwords per row
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1,xmm0
+ punpcklbw xmm0,xmm0 ; low 8 samples, each doubled
+ punpckhbw xmm1,xmm1 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ sub rax, byte 2*SIZEOF_XMMWORD ; count is in output bytes
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3,xmm2
+ punpcklbw xmm2,xmm2 ; low 8 samples, each doubled
+ punpckhbw xmm3,xmm3 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi ; input_data cursor
+ pop rdi ; output_data cursor
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args
+ pop rbx
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jdsamss2.asm b/simd/jdsamss2.asm
new file mode 100644
index 0000000..b5c863b
--- /dev/null
+++ b/simd/jdsamss2.asm
@@ -0,0 +1,729 @@
+;
+; jdsamss2.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+; Word constants for the fancy upsamplers, each repeated in all 8 lanes of an xmmword.
+PW_ONE times 8 dw 1 ; h2v1: bias added to the left-neighbour term before >>2
+PW_TWO times 8 dw 2 ; h2v1: bias added to the right-neighbour term before >>2
+PW_THREE times 8 dw 3 ; weight applied to the nearer sample
+PW_SEVEN times 8 dw 7 ; h2v2: bias added to the right-neighbour term before >>4
+PW_EIGHT times 8 dw 8 ; h2v2: bias added to the left-neighbour term before >>4
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v1_fancy_upsample_sse2)
+; Triangle-filter 2:1 horizontal upsampling: 16 input samples become 32 output samples per pass.
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+ push ebp
+ mov ebp,esp
+ pushpic ebx ; PIC helper macro defined elsewhere; presumably saves ebx only in PIC builds
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax,eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx,ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop:
+ push eax ; colctr
+ push edi ; output_data cursor
+ push esi ; input_data cursor
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_XMMWORD-1 ; width a whole number of xmmwords?
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ pxor xmm0,xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7,xmm7 ; all 1's ...
+ psrldq xmm7,(SIZEOF_XMMWORD-1) ; ... reduced to a mask for byte 0
+ pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] ; sample 0 acts as its own left neighbour
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD ; round colctr up to whole xmmwords
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop ; more than one block: peek at the next one
+ alignx 16,7
+
+.columnloop_last:
+ pcmpeqb xmm6,xmm6 ; all 1's ...
+ pslldq xmm6,(SIZEOF_XMMWORD-1) ; ... reduced to a mask for byte 15
+ pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] ; sample 15 acts as its own right neighbour
+ jmp short .upsample
+ alignx 16,7
+
+.columnloop:
+ movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] ; next block
+ pslldq xmm6,(SIZEOF_XMMWORD-1) ; its sample 0 moved to byte 15 = right neighbour of sample 15
+
+.upsample:
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7,xmm1
+ psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+ movdqa xmm4,xmm1
+ punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5,xmm2
+ punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6,xmm3
+ punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1,[GOTOFF(ebx,PW_THREE)] ; 3*s[i], low half
+ pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; 3*s[i], high half
+ paddw xmm2,[GOTOFF(ebx,PW_ONE)] ; s[i-1] + 1
+ paddw xmm5,[GOTOFF(ebx,PW_ONE)]
+ paddw xmm3,[GOTOFF(ebx,PW_TWO)] ; s[i+1] + 2
+ paddw xmm6,[GOTOFF(ebx,PW_TWO)]
+
+ paddw xmm2,xmm1
+ paddw xmm5,xmm4
+ psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3,xmm1
+ paddw xmm6,xmm4
+ psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3,BYTE_BIT ; odd outputs go to the high byte of each word
+ psllw xmm6,BYTE_BIT
+ por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+ sub eax, byte SIZEOF_XMMWORD ; one input block consumed
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop ; at least two blocks left
+ test eax,eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop esi ; input_data cursor
+ pop edi ; output_data cursor
+ pop eax ; colctr
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+
+ align 16
+ global EXTN(jsimd_h2v2_fancy_upsample_sse2)
+; Each .rowloop pass reads three input rows (above/current/below) and writes two output rows.
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; step below the saved ebp before aligning
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; keep the unaligned esp for the epilogue's "pop esp"
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve WK_NUM xmmwords of scratch below ebp
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx,eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax,eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx,ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop:
+ push eax ; colctr
+ push ecx ; rowctr
+ push edi ; output_data cursor
+ push esi ; input_data cursor
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_XMMWORD-1 ; width a whole number of xmmwords?
+ jz short .skip
+ push edx ; dl is borrowed as a byte temporary
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pushpic ebx ; ebx is inptr0 here; presumably parked while it serves as GOT pointer
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3,xmm3 ; xmm3=(all 0's)
+ movdqa xmm4,xmm0
+ punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5,xmm1
+ punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6,xmm2
+ punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0,[GOTOFF(ebx,PW_THREE)] ; 3*row[0], low half
+ pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; 3*row[0], high half
+
+ pcmpeqb xmm7,xmm7 ; all 1's ...
+ psrldq xmm7,(SIZEOF_XMMWORD-2) ; ... reduced to a mask for word 0
+
+ paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1 ; column 0 acts as its own left neighbour
+ movdqa XMMWORD [wk(1)], xmm2
+
+ poppic ebx ; ebx = inptr0 again
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD ; round colctr up to whole xmmwords
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop ; more than one block: fetch the next one first
+ alignx 16,7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx ; matched by the poppic after .upsample
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb xmm1,xmm1 ; all 1's ...
+ pslldq xmm1,(SIZEOF_XMMWORD-2) ; ... reduced to a mask for word 7
+ movdqa xmm2,xmm1
+
+ pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+ alignx 16,7
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pushpic ebx ; matched by the poppic after .upsample
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3,xmm3 ; xmm3=(all 0's)
+ movdqa xmm4,xmm0
+ punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5,xmm1
+ punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6,xmm2
+ punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0,[GOTOFF(ebx,PW_THREE)] ; 3*row[0], low half
+ pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; 3*row[0], high half
+
+ paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1 ; next block's word 0 = right neighbour of word 15
+ movdqa XMMWORD [wk(3)], xmm2
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5,xmm7
+ movdqa xmm6,xmm3
+ psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1,xmm7
+ movdqa xmm2,xmm3
+ pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4,xmm3
+ psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; word 15 becomes the next block's left neighbour
+
+ pmullw xmm7,[GOTOFF(ebx,PW_THREE)] ; 3*Int0[i], low half
+ pmullw xmm3,[GOTOFF(ebx,PW_THREE)] ; 3*Int0[i], high half
+ paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] ; left neighbours + 8
+ paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] ; right neighbours + 7
+ paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1,xmm7
+ paddw xmm5,xmm3
+ psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0,xmm7
+ paddw xmm2,xmm3
+ psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0,BYTE_BIT ; odd outputs go to the high byte of each word
+ psllw xmm2,BYTE_BIT
+ por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; final samples replace the intermediates
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm4
+ psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1,xmm6
+ movdqa xmm5,xmm4
+ pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3,xmm4
+ psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3 ; word 15 becomes the next block's left neighbour
+
+ pmullw xmm6,[GOTOFF(ebx,PW_THREE)] ; 3*Int1[i], low half
+ pmullw xmm4,[GOTOFF(ebx,PW_THREE)] ; 3*Int1[i], high half
+ paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] ; left neighbours + 8
+ paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] ; right neighbours + 7
+ paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1,xmm6
+ paddw xmm0,xmm4
+ psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7,xmm6
+ paddw xmm5,xmm4
+ psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7,BYTE_BIT ; odd outputs go to the high byte of each word
+ psllw xmm5,BYTE_BIT
+ por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 ; final samples replace the intermediates
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+ poppic ebx ; ebx = inptr0 again
+
+ sub eax, byte SIZEOF_XMMWORD ; one input block consumed
+ add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop ; at least two blocks left
+ test eax,eax
+ jnz near .columnloop_last ; exactly one block left
+
+ pop esi ; input_data cursor
+ pop edi ; output_data cursor
+ pop ecx ; rowctr
+ pop eax ; colctr
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION output_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define output_width(b) (b)+12 ; JDIMENSION output_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v1_upsample_sse2)
+; Box filter: every input byte is written twice, widening each row 2:1 horizontally.
+EXTN(jsimd_h2v1_upsample_sse2):
+ push ebp
+ mov ebp,esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD) ; round up to whole pairs of output xmmwords
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx,ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop:
+ push edi ; output_data cursor
+ push esi ; input_data cursor
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax,edx ; colctr
+ alignx 16,7
+.columnloop:
+ ; loop body is unrolled twice: two input xmmwords -> four output xmmwords
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1,xmm0
+ punpcklbw xmm0,xmm0 ; low 8 samples, each doubled
+ punpckhbw xmm1,xmm1 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD ; count is in output bytes
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3,xmm2
+ punpcklbw xmm2,xmm2 ; low 8 samples, each doubled
+ punpckhbw xmm3,xmm3 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16,7
+
+.nextrow:
+ pop esi ; input_data cursor
+ pop edi ; output_data cursor
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
+; JDIMENSION output_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY * output_data_ptr);
+;
+
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define output_width(b) (b)+12 ; JDIMENSION output_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
+
+ align 16
+ global EXTN(jsimd_h2v2_upsample_sse2)
+; Box filter: each input byte is doubled and the row is written to two output rows.
+EXTN(jsimd_h2v2_upsample_sse2):
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD) ; round up to whole pairs of output xmmwords
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx,ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16,7
+.rowloop:
+ push edi ; output_data cursor
+ push esi ; input_data cursor
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax,edx ; colctr
+ alignx 16,7
+.columnloop:
+ ; loop body is unrolled twice: two input xmmwords -> four output xmmwords per row
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1,xmm0
+ punpcklbw xmm0,xmm0 ; low 8 samples, each doubled
+ punpckhbw xmm1,xmm1 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD ; count is in output bytes
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3,xmm2
+ punpcklbw xmm2,xmm2 ; low 8 samples, each doubled
+ punpckhbw xmm3,xmm3 ; high 8 samples, each doubled
+
+ movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16,7
+
+.nextrow:
+ pop esi ; input_data cursor
+ pop edi ; output_data cursor
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jf3dnflt.asm b/simd/jf3dnflt.asm
new file mode 100644
index 0000000..542672d
--- /dev/null
+++ b/simd/jf3dnflt.asm
@@ -0,0 +1,320 @@
+;
+; jf3dnflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST ; multiplier table read by jsimd_fdct_float_3dnow
+
+ alignz 16
+ global EXTN(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow): ; exported label at the start of the table
+
+PD_0_382 times 2 dd 0.382683432365089771728460 ; 2 dwords; scales (tmp10-tmp12) to give z5
+PD_0_707 times 2 dd 0.707106781186547524400844 ; 2 dwords; scales (tmp12+tmp13) for z1 and tmp11 for z3
+PD_0_541 times 2 dd 0.541196100146196984399723 ; 2 dwords; scales tmp10 (the z2 product)
+PD_1_306 times 2 dd 1.306562964876376527856643 ; 2 dwords; scales tmp12 (the z4 product)
+
+ alignz 16 ; NOTE(review): alignz comes from jsimdext.inc; presumably zero-pads to a 16-byte boundary
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples, in place: pass 1 works on
+; the rows and pass 2 on the columns of the block that 'data' points to.
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
+; Scratch: eax, ecx, edx, mm0-mm7.  femms is executed before returning.
+
+%define data(b) (b)+8 ; FAST_FLOAT * data (b = frame base, [b] = saved ebp)
+
+%define original_ebp ebp+0 ; slot that holds the unaligned frame base
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM], directly below ebp
+%define WK_NUM 2 ; wk(0)/wk(1) hold tmp6/tmp7 until the odd part
+
+ align 16
+ global EXTN(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+ push ebp
+ mov eax,esp ; eax = original ebp (frame base; [eax] = saved ebp)
+ sub esp, byte 4 ; make room for the frame-base slot
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; [original_ebp] = frame base, reloaded by the final pop esp
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] on the stack
+ pushpic ebx ; NOTE(review): macro from jsimdext.inc; presumably saves ebx for PIC builds
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; loop count: two rows per iteration
+ alignx 16,7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+ movq mm4,mm0 ; transpose coefficients
+ punpckldq mm0,mm1 ; mm0=(00 10)=data0
+ punpckhdq mm4,mm1 ; mm4=(01 11)=data1
+ movq mm5,mm2 ; transpose coefficients
+ punpckldq mm2,mm3 ; mm2=(06 16)=data6
+ punpckhdq mm5,mm3 ; mm5=(07 17)=data7
+
+ movq mm6,mm4 ; keep copies of data1/data0 for the sums
+ movq mm7,mm0
+ pfsub mm4,mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0,mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6,mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7,mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4,mm1 ; transpose coefficients
+ punpckldq mm1,mm3 ; mm1=(02 12)=data2
+ punpckhdq mm4,mm3 ; mm4=(03 13)=data3
+ movq mm0,mm2 ; transpose coefficients
+ punpckldq mm2,mm5 ; mm2=(04 14)=data4
+ punpckhdq mm0,mm5 ; mm0=(05 15)=data5
+
+ movq mm3,mm4 ; keep copies of data3/data2 for the differences
+ movq mm5,mm1
+ pfadd mm4,mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1,mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3,mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5,mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2,mm7 ; mm2=tmp0
+ movq mm0,mm6 ; mm0=tmp1
+ pfsub mm7,mm4 ; mm7=tmp13
+ pfsub mm6,mm1 ; mm6=tmp12
+ pfadd mm2,mm4 ; mm2=tmp10
+ pfadd mm0,mm1 ; mm0=tmp11
+
+ pfadd mm6,mm7 ; mm6=tmp12+tmp13
+ pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4,mm2 ; mm4=tmp10
+ movq mm1,mm7 ; mm1=tmp13
+ pfsub mm2,mm0 ; mm2=data4
+ pfsub mm7,mm6 ; mm7=data6
+ pfadd mm4,mm0 ; mm4=data0
+ pfadd mm1,mm6 ; mm1=data2
+
+ movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 ; store data4
+ movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 ; store data6
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 ; store data0
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 ; store data2
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3,mm5 ; mm3=tmp10
+ pfadd mm5,mm0 ; mm5=tmp11
+ pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2,mm3 ; mm2=tmp10
+ pfsub mm3,mm0 ; mm3=tmp10-tmp12
+ pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2,mm3 ; mm2=z2
+ pfadd mm0,mm3 ; mm0=z4
+
+ movq mm7,mm6 ; mm7=tmp7
+ pfsub mm6,mm5 ; mm6=z13
+ pfadd mm7,mm5 ; mm7=z11
+
+ movq mm4,mm6 ; mm4=z13
+ movq mm1,mm7 ; mm1=z11
+ pfsub mm6,mm2 ; mm6=data3
+ pfsub mm7,mm0 ; mm7=data7
+ pfadd mm4,mm2 ; mm4=data5
+ pfadd mm1,mm0 ; mm1=data1
+
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 ; store data3
+ movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 ; store data7
+ movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 ; store data5
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 ; store data1
+
+ add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; advance by two rows
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2 ; loop count: two columns per iteration
+ alignx 16,7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+ movq mm4,mm0 ; transpose coefficients
+ punpckldq mm0,mm1 ; mm0=(00 01)=data0
+ punpckhdq mm4,mm1 ; mm4=(10 11)=data1
+ movq mm5,mm2 ; transpose coefficients
+ punpckldq mm2,mm3 ; mm2=(60 61)=data6
+ punpckhdq mm5,mm3 ; mm5=(70 71)=data7
+
+ movq mm6,mm4 ; keep copies of data1/data0 for the sums
+ movq mm7,mm0
+ pfsub mm4,mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0,mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6,mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7,mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4,mm1 ; transpose coefficients
+ punpckldq mm1,mm3 ; mm1=(20 21)=data2
+ punpckhdq mm4,mm3 ; mm4=(30 31)=data3
+ movq mm0,mm2 ; transpose coefficients
+ punpckldq mm2,mm5 ; mm2=(40 41)=data4
+ punpckhdq mm0,mm5 ; mm0=(50 51)=data5
+
+ movq mm3,mm4 ; keep copies of data3/data2 for the differences
+ movq mm5,mm1
+ pfadd mm4,mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1,mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3,mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5,mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2,mm7 ; mm2=tmp0
+ movq mm0,mm6 ; mm0=tmp1
+ pfsub mm7,mm4 ; mm7=tmp13
+ pfsub mm6,mm1 ; mm6=tmp12
+ pfadd mm2,mm4 ; mm2=tmp10
+ pfadd mm0,mm1 ; mm0=tmp11
+
+ pfadd mm6,mm7 ; mm6=tmp12+tmp13
+ pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4,mm2 ; mm4=tmp10
+ movq mm1,mm7 ; mm1=tmp13
+ pfsub mm2,mm0 ; mm2=data4
+ pfsub mm7,mm6 ; mm7=data6
+ pfadd mm4,mm0 ; mm4=data0
+ pfadd mm1,mm6 ; mm1=data2
+
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 ; store data4
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 ; store data6
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 ; store data0
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 ; store data2
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3,mm5 ; mm3=tmp10
+ pfadd mm5,mm0 ; mm5=tmp11
+ pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2,mm3 ; mm2=tmp10
+ pfsub mm3,mm0 ; mm3=tmp10-tmp12
+ pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2,mm3 ; mm2=z2
+ pfadd mm0,mm3 ; mm0=z4
+
+ movq mm7,mm6 ; mm7=tmp7
+ pfsub mm6,mm5 ; mm6=z13
+ pfadd mm7,mm5 ; mm7=z11
+
+ movq mm4,mm6 ; mm4=z13
+ movq mm1,mm7 ; mm1=z11
+ pfsub mm6,mm2 ; mm6=data3
+ pfsub mm7,mm0 ; mm7=data7
+ pfadd mm4,mm2 ; mm4=data5
+ pfadd mm1,mm0 ; mm1=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 ; store data3
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 ; store data7
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 ; store data5
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 ; store data1
+
+ add edx, byte 2*SIZEOF_FAST_FLOAT ; advance by two columns
+ dec ecx
+ jnz near .columnloop
+
+ femms ; empty MMX/3DNow! state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx ; counterpart of the pushpic in the prologue
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp ; restore the caller's ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfmmxfst.asm b/simd/jfmmxfst.asm
new file mode 100644
index 0000000..0647242
--- /dev/null
+++ b/simd/jfmmxfst.asm
@@ -0,0 +1,397 @@
+;
+; jfmmxfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fractional bits of the F_* constants; 14 is also OK.
+
+%if CONST_BITS == 8 ; precomputed values of x * 2^8, rounded to nearest
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time floating-point arithmetic, so each literal below is x * 2^30.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n with round-to-nearest
+F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST ; multiplier table read by jsimd_fdct_ifast_mmx
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2 ; psllw count applied to each operand before pmulhw
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ; left shift applied to the F_* constants below
+
+ alignz 16
+ global EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx): ; exported label at the start of the table
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT ; 4 words; multiplier for z1 and z3
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT ; 4 words; multiplier for z5
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT ; 4 words; multiplier for tmp10 (the z2 product)
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT ; 4 words; multiplier for tmp12 (the z4 product)
+
+ alignz 16 ; NOTE(review): alignz comes from jsimdext.inc; presumably zero-pads to a 16-byte boundary
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples, in place: pass 1 works on
+; the rows and pass 2 on the columns of the block that 'data' points to.
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx (DCTELEM * data)
+; Scratch: eax, ecx, edx, mm0-mm7.  emms is executed before returning.
+
+%define data(b) (b)+8 ; DCTELEM * data (b = frame base, [b] = saved ebp)
+
+%define original_ebp ebp+0 ; slot that holds the unaligned frame base
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM], directly below ebp
+%define WK_NUM 2 ; wk(0)/wk(1): transpose temporaries, then tmp6/tmp7
+
+ align 16
+ global EXTN(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp (frame base; [eax] = saved ebp)
+ sub esp, byte 4 ; make room for the frame-base slot
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; [original_ebp] = frame base, reloaded by the final pop esp
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] on the stack
+ pushpic ebx ; NOTE(review): macro from jsimdext.inc; presumably saves ebx for PIC builds
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4 ; loop count: four rows per iteration
+ alignx 16,7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4,mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
+ movq mm5,mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4,mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
+ movq mm2,mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
+
+ movq mm7,mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3,mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0,mm7 ; keep copies of data1/data0 for the sums
+ movq mm5,mm6
+ psubw mm7,mm2 ; mm7=data1-data6=tmp6
+ psubw mm6,mm3 ; mm6=data0-data7=tmp7
+ paddw mm0,mm2 ; mm0=data1+data6=tmp1
+ paddw mm5,mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7,mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6,mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2,mm7 ; keep copies of data3/data2 for the differences
+ movq mm3,mm4
+ paddw mm7,mm1 ; mm7=data3+data4=tmp3
+ paddw mm4,mm6 ; mm4=data2+data5=tmp2
+ psubw mm2,mm1 ; mm2=data3-data4=tmp4
+ psubw mm3,mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1,mm5 ; mm1=tmp0
+ movq mm6,mm0 ; mm6=tmp1
+ psubw mm5,mm7 ; mm5=tmp13
+ psubw mm0,mm4 ; mm0=tmp12
+ paddw mm1,mm7 ; mm1=tmp10
+ paddw mm6,mm4 ; mm6=tmp11
+
+ paddw mm0,mm5 ; mm0=tmp12+tmp13
+ psllw mm0,PRE_MULTIPLY_SCALE_BITS ; pre-scale the operand for pmulhw
+ pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7,mm1 ; mm7=tmp10
+ movq mm4,mm5 ; mm4=tmp13
+ psubw mm1,mm6 ; mm1=data4
+ psubw mm5,mm0 ; mm5=data6
+ paddw mm7,mm6 ; mm7=data0
+ paddw mm4,mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 ; store data4
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 ; store data6
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 ; store data0
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 ; store data2
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2,mm3 ; mm2=tmp10
+ paddw mm3,mm6 ; mm3=tmp11
+ paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2,PRE_MULTIPLY_SCALE_BITS ; pre-scale tmp10 for pmulhw
+ psllw mm6,PRE_MULTIPLY_SCALE_BITS ; pre-scale tmp12 for pmulhw
+
+ psllw mm3,PRE_MULTIPLY_SCALE_BITS ; pre-scale tmp11 for pmulhw
+ pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1,mm2 ; mm1=tmp10
+ psubw mm2,mm6 ; mm2=tmp10-tmp12 (both pre-scaled)
+ pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1,mm2 ; mm1=z2
+ paddw mm6,mm2 ; mm6=z4
+
+ movq mm5,mm0 ; mm5=tmp7
+ psubw mm0,mm3 ; mm0=z13
+ paddw mm5,mm3 ; mm5=z11
+
+ movq mm7,mm0 ; mm7=z13
+ movq mm4,mm5 ; mm4=z11
+ psubw mm0,mm1 ; mm0=data3
+ psubw mm5,mm6 ; mm5=data7
+ paddw mm7,mm1 ; mm7=data5
+ paddw mm4,mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 ; store data3
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 ; store data7
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 ; store data5
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 ; store data1
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM ; advance by four rows
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4 ; loop count: four columns per iteration
+ alignx 16,7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4,mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
+ movq mm5,mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4,mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
+ movq mm2,mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
+
+ movq mm7,mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3,mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0,mm7 ; keep copies of data1/data0 for the sums
+ movq mm5,mm6
+ psubw mm7,mm2 ; mm7=data1-data6=tmp6
+ psubw mm6,mm3 ; mm6=data0-data7=tmp7
+ paddw mm0,mm2 ; mm0=data1+data6=tmp1
+ paddw mm5,mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7,mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6,mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2,mm7 ; keep copies of data3/data2 for the differences
+ movq mm3,mm4
+ paddw mm7,mm1 ; mm7=data3+data4=tmp3
+ paddw mm4,mm6 ; mm4=data2+data5=tmp2
+ psubw mm2,mm1 ; mm2=data3-data4=tmp4
+ psubw mm3,mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1,mm5 ; mm1=tmp0
+ movq mm6,mm0 ; mm6=tmp1
+ psubw mm5,mm7 ; mm5=tmp13
+ psubw mm0,mm4 ; mm0=tmp12
+ paddw mm1,mm7 ; mm1=tmp10
+ paddw mm6,mm4 ; mm6=tmp11
+
+ paddw mm0,mm5 ; mm0=tmp12+tmp13
+ psllw mm0,PRE_MULTIPLY_SCALE_BITS ; pre-scale the operand for pmulhw
+ pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7,mm1 ; mm7=tmp10
+ movq mm4,mm5 ; mm4=tmp13
+ psubw mm1,mm6 ; mm1=data4
+ psubw mm5,mm0 ; mm5=data6
+ paddw mm7,mm6 ; mm7=data0
+ paddw mm4,mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 ; store data4
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 ; store data6
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 ; store data0
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 ; store data2
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2,mm3 ; mm2=tmp10
+ paddw mm3,mm6 ; mm3=tmp11
+ paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2,PRE_MULTIPLY_SCALE_BITS ; pre-scale tmp10 for pmulhw
+ psllw mm6,PRE_MULTIPLY_SCALE_BITS ; pre-scale tmp12 for pmulhw
+
+ psllw mm3,PRE_MULTIPLY_SCALE_BITS ; pre-scale tmp11 for pmulhw
+ pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1,mm2 ; mm1=tmp10
+ psubw mm2,mm6 ; mm2=tmp10-tmp12 (both pre-scaled)
+ pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1,mm2 ; mm1=z2
+ paddw mm6,mm2 ; mm6=z4
+
+ movq mm5,mm0 ; mm5=tmp7
+ psubw mm0,mm3 ; mm0=z13
+ paddw mm5,mm3 ; mm5=z11
+
+ movq mm7,mm0 ; mm7=z13
+ movq mm4,mm5 ; mm4=z11
+ psubw mm0,mm1 ; mm0=data3
+ psubw mm5,mm6 ; mm5=data7
+ paddw mm7,mm1 ; mm7=data5
+ paddw mm4,mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 ; store data3
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 ; store data7
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 ; store data5
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 ; store data1
+
+ add edx, byte 4*SIZEOF_DCTELEM ; advance by four columns
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx ; counterpart of the pushpic in the prologue
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp ; restore the caller's ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfmmxint.asm b/simd/jfmmxint.asm
new file mode 100644
index 0000000..a7e73f7
--- /dev/null
+++ b/simd/jfmmxint.asm
@@ -0,0 +1,622 @@
+;
+; jfmmxint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fractional bits of the F_* constants
+%define PASS1_BITS 2 ; pass 1 shifts data0/data4 left by this many bits
+
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; psrad count applied to the pass 1 pmaddwd sums
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS) ; NOTE(review): presumably the pass 2 counterpart of DESCALE_P1 - confirm
+
+%if CONST_BITS == 13 ; precomputed values of x * 2^13, rounded to nearest
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time floating-point arithmetic, so each literal below is x * 2^30.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n with round-to-nearest
+F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST ; multiplier table read by jsimd_fdct_islow_mmx
+
+ alignz 16
+ global EXTN(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx): ; exported label at the start of the table
+
+PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 ; pmaddwd with (tmp13,tmp12) word pairs gives data2
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) ; pmaddwd with (tmp13,tmp12) word pairs gives data6
+PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 ; pmaddwd with (z3,z4) word pairs gives the new z3
+PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) ; pmaddwd with (z3,z4) word pairs gives the new z4
+PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 ; pmaddwd with (tmp4,tmp7) word pairs gives tmp4
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) ; pmaddwd with (tmp4,tmp7) word pairs gives tmp7
+PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 ; pmaddwd with (tmp5,tmp6) word pairs gives tmp5
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) ; pmaddwd with (tmp5,tmp6) word pairs gives tmp6
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) ; rounding term added before psrad DESCALE_P1
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) ; NOTE(review): presumably the rounding term for a DESCALE_P2 shift - confirm
+PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) ; NOTE(review): presumably a word-sized rounding term for a PASS1_BITS shift - confirm
+
+ alignz 16 ; NOTE(review): alignz comes from jsimdext.inc; presumably zero-pads to a 16-byte boundary
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx (DCTELEM * data)
+;
+
+%define data(b) (b)+8 ; DCTELEM * data
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 16
+ global EXTN(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16,7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4,mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
+ movq mm5,mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4,mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
+ movq mm2,mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
+
+ movq mm7,mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3,mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0,mm7
+ movq mm5,mm6
+ psubw mm7,mm2 ; mm7=data1-data6=tmp6
+ psubw mm6,mm3 ; mm6=data0-data7=tmp7
+ paddw mm0,mm2 ; mm0=data1+data6=tmp1
+ paddw mm5,mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7,mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6,mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2,mm7
+ movq mm3,mm4
+ paddw mm7,mm1 ; mm7=data3+data4=tmp3
+ paddw mm4,mm6 ; mm4=data2+data5=tmp2
+ psubw mm2,mm1 ; mm2=data3-data4=tmp4
+ psubw mm3,mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1,mm5
+ movq mm6,mm0
+ paddw mm5,mm7 ; mm5=tmp10
+ paddw mm0,mm4 ; mm0=tmp11
+ psubw mm1,mm7 ; mm1=tmp13
+ psubw mm6,mm4 ; mm6=tmp12
+
+ movq mm7,mm5
+ paddw mm5,mm0 ; mm5=tmp10+tmp11
+ psubw mm7,mm0 ; mm7=tmp10-tmp11
+
+ psllw mm5,PASS1_BITS ; mm5=data0
+ psllw mm7,PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4,mm1 ; mm1=tmp13
+ movq mm0,mm1
+ punpcklwd mm4,mm6 ; mm6=tmp12
+ punpckhwd mm0,mm6
+ movq mm1,mm4
+ movq mm6,mm0
+ pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4,DESCALE_P1
+ psrad mm0,DESCALE_P1
+ paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1,DESCALE_P1
+ psrad mm6,DESCALE_P1
+
+ packssdw mm4,mm0 ; mm4=data2
+ packssdw mm1,mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0,mm2 ; mm2=tmp4
+ movq mm6,mm3 ; mm3=tmp5
+ paddw mm0,mm5 ; mm0=z3
+ paddw mm6,mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4,mm0
+ movq mm1,mm0
+ punpcklwd mm4,mm6
+ punpckhwd mm1,mm6
+ movq mm0,mm4
+ movq mm6,mm1
+ pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4,mm2
+ movq mm1,mm2
+ punpcklwd mm4,mm7
+ punpckhwd mm1,mm7
+ movq mm2,mm4
+ movq mm7,mm1
+ pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2,mm0 ; mm2=data1L
+ paddd mm7,mm6 ; mm7=data1H
+
+ paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4,DESCALE_P1
+ psrad mm1,DESCALE_P1
+ paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm2,DESCALE_P1
+ psrad mm7,DESCALE_P1
+
+ packssdw mm4,mm1 ; mm4=data7
+ packssdw mm2,mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1,mm3
+ movq mm7,mm3
+ punpcklwd mm1,mm5
+ punpckhwd mm7,mm5
+ movq mm3,mm1
+ movq mm5,mm7
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1,mm0 ; mm1=data5L
+ paddd mm7,mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1,DESCALE_P1
+ psrad mm7,DESCALE_P1
+ paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm3,DESCALE_P1
+ psrad mm5,DESCALE_P1
+
+ packssdw mm1,mm7 ; mm1=data5
+ packssdw mm3,mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16,7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4,mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
+ movq mm5,mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4,mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
+ movq mm2,mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
+
+ movq mm7,mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3,mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0,mm7
+ movq mm5,mm6
+ psubw mm7,mm2 ; mm7=data1-data6=tmp6
+ psubw mm6,mm3 ; mm6=data0-data7=tmp7
+ paddw mm0,mm2 ; mm0=data1+data6=tmp1
+ paddw mm5,mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7,mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6,mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2,mm7
+ movq mm3,mm4
+ paddw mm7,mm1 ; mm7=data3+data4=tmp3
+ paddw mm4,mm6 ; mm4=data2+data5=tmp2
+ psubw mm2,mm1 ; mm2=data3-data4=tmp4
+ psubw mm3,mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1,mm5
+ movq mm6,mm0
+ paddw mm5,mm7 ; mm5=tmp10
+ paddw mm0,mm4 ; mm0=tmp11
+ psubw mm1,mm7 ; mm1=tmp13
+ psubw mm6,mm4 ; mm6=tmp12
+
+ movq mm7,mm5
+ paddw mm5,mm0 ; mm5=tmp10+tmp11
+ psubw mm7,mm0 ; mm7=tmp10-tmp11
+
+ paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+ paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw mm5,PASS1_BITS ; mm5=data0
+ psraw mm7,PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4,mm1 ; mm1=tmp13
+ movq mm0,mm1
+ punpcklwd mm4,mm6 ; mm6=tmp12
+ punpckhwd mm0,mm6
+ movq mm1,mm4
+ movq mm6,mm0
+ pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4,DESCALE_P2
+ psrad mm0,DESCALE_P2
+ paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1,DESCALE_P2
+ psrad mm6,DESCALE_P2
+
+ packssdw mm4,mm0 ; mm4=data2
+ packssdw mm1,mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0,mm2 ; mm2=tmp4
+ movq mm6,mm3 ; mm3=tmp5
+ paddw mm0,mm5 ; mm0=z3
+ paddw mm6,mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4,mm0
+ movq mm1,mm0
+ punpcklwd mm4,mm6
+ punpckhwd mm1,mm6
+ movq mm0,mm4
+ movq mm6,mm1
+ pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4,mm2
+ movq mm1,mm2
+ punpcklwd mm4,mm7
+ punpckhwd mm1,mm7
+ movq mm2,mm4
+ movq mm7,mm1
+ pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2,mm0 ; mm2=data1L
+ paddd mm7,mm6 ; mm7=data1H
+
+ paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4,DESCALE_P2
+ psrad mm1,DESCALE_P2
+ paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm2,DESCALE_P2
+ psrad mm7,DESCALE_P2
+
+ packssdw mm4,mm1 ; mm4=data7
+ packssdw mm2,mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1,mm3
+ movq mm7,mm3
+ punpcklwd mm1,mm5
+ punpckhwd mm7,mm5
+ movq mm3,mm1
+ movq mm5,mm7
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1,mm0 ; mm1=data5L
+ paddd mm7,mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1,DESCALE_P2
+ psrad mm7,DESCALE_P2
+ paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm3,DESCALE_P2
+ psrad mm5,DESCALE_P2
+
+ packssdw mm1,mm7 ; mm1=data5
+ packssdw mm3,mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfss2fst-64.asm b/simd/jfss2fst-64.asm
new file mode 100644
index 0000000..c99acc2
--- /dev/null
+++ b/simd/jfss2fst-64.asm
@@ -0,0 +1,392 @@
+;
+; jfss2fst-64.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fraction bits of the F_* fixed-point constants; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433) = round(0.382683433 * 2^8)
+F_0_541 equ 139 ; FIX(0.541196100) = round(0.541196100 * 2^8)
+F_0_707 equ 181 ; FIX(0.707106781) = round(0.707106781 * 2^8)
+F_1_306 equ 334 ; FIX(1.306562965) = round(1.306562965 * 2^8)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fraction bits.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16, so that pmulhw (which keeps the high 16 bits of each product) returns the product already descaled
+
+%define PRE_MULTIPLY_SCALE_BITS 2 ; left shift (psllw) applied to a value before it is multiplied
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ; left shift baked into the PW_* words
+
+ alignz 16
+ global EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT ; 8 identical words: 0.707106781
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT ; 8 identical words: 0.382683433
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT ; 8 identical words: 0.541196100
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; 8 identical words: 1.306562965
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+;
+
+; r10 = DCTELEM * data
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2 ; number of xmmword scratch slots addressed through wk(i)
+
+ align 16
+ global EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push rbp
+ mov rax,rsp ; rax = rsp after the push (address of the saved rbp)
+ sub rsp, byte 4 ; step down so the aligned rsp ends up below rax
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; save the unaligned rsp at the aligned address
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; move rsp below the WK_NUM xmmword work slots
+ collect_args ; macro not defined here; presumably sets up r10 = data (confirm in jsimdext.inc)
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6,xmm1
+ movdqa xmm3,xmm0
+ psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2,xmm1
+ movdqa xmm5,xmm7
+ paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4,xmm3
+ movdqa xmm0,xmm6
+ psubw xmm3,xmm1 ; xmm3=tmp13
+ psubw xmm6,xmm7 ; xmm6=tmp12
+ paddw xmm4,xmm1 ; xmm4=tmp10
+ paddw xmm0,xmm7 ; xmm0=tmp11
+
+ paddw xmm6,xmm3 ; xmm6=tmp12+tmp13
+ psllw xmm6,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6,[rel PW_F0707] ; xmm6=z1
+
+ movdqa xmm1,xmm4
+ movdqa xmm7,xmm3
+ psubw xmm4,xmm0 ; xmm4=data4
+ psubw xmm3,xmm6 ; xmm3=data6
+ paddw xmm1,xmm0 ; xmm1=data0
+ paddw xmm7,xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2,xmm5 ; xmm2=tmp10
+ paddw xmm5,xmm0 ; xmm5=tmp11
+ paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0,PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5,[rel PW_F0707] ; xmm5=z3
+
+ movdqa xmm4,xmm2 ; xmm4=tmp10
+ psubw xmm2,xmm0 ; xmm2=tmp10-tmp12
+ pmulhw xmm2,[rel PW_F0382] ; xmm2=z5
+ pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4,xmm2 ; xmm4=z2
+ paddw xmm0,xmm2 ; xmm0=z4
+
+ movdqa xmm3,xmm6
+ psubw xmm6,xmm5 ; xmm6=z13
+ paddw xmm3,xmm5 ; xmm3=z11
+
+ movdqa xmm2,xmm6
+ movdqa xmm5,xmm3
+ psubw xmm6,xmm4 ; xmm6=data3
+ psubw xmm3,xmm0 ; xmm3=data7
+ paddw xmm2,xmm4 ; xmm2=data5
+ paddw xmm5,xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5,xmm6
+ movdqa xmm3,xmm1
+ psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7,xmm6
+ movdqa xmm0,xmm2
+ paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4,xmm3
+ movdqa xmm1,xmm5
+ psubw xmm3,xmm6 ; xmm3=tmp13
+ psubw xmm5,xmm2 ; xmm5=tmp12
+ paddw xmm4,xmm6 ; xmm4=tmp10
+ paddw xmm1,xmm2 ; xmm1=tmp11
+
+ paddw xmm5,xmm3 ; xmm5=tmp12+tmp13
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5,[rel PW_F0707] ; xmm5=z1
+
+ movdqa xmm6,xmm4
+ movdqa xmm2,xmm3
+ psubw xmm4,xmm1 ; xmm4=data4
+ psubw xmm3,xmm5 ; xmm3=data6
+ paddw xmm6,xmm1 ; xmm6=data0
+ paddw xmm2,xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7,xmm0 ; xmm7=tmp10
+ paddw xmm0,xmm1 ; xmm0=tmp11
+ paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1,PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0,[rel PW_F0707] ; xmm0=z3
+
+ movdqa xmm4,xmm7 ; xmm4=tmp10
+ psubw xmm7,xmm1 ; xmm7=tmp10-tmp12
+ pmulhw xmm7,[rel PW_F0382] ; xmm7=z5
+ pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4,xmm7 ; xmm4=z2
+ paddw xmm1,xmm7 ; xmm1=z4
+
+ movdqa xmm3,xmm5
+ psubw xmm5,xmm0 ; xmm5=z13
+ paddw xmm3,xmm0 ; xmm3=z11
+
+ movdqa xmm6,xmm5
+ movdqa xmm2,xmm3
+ psubw xmm5,xmm4 ; xmm5=data3
+ psubw xmm3,xmm1 ; xmm3=data7
+ paddw xmm6,xmm4 ; xmm6=data5
+ paddw xmm2,xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ uncollect_args ; macro not defined here; presumably undoes collect_args (confirm in jsimdext.inc)
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- rsp saved in the prologue (points at the saved rbp)
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfss2fst.asm b/simd/jfss2fst.asm
new file mode 100644
index 0000000..73fc9e5
--- /dev/null
+++ b/simd/jfss2fst.asm
@@ -0,0 +1,404 @@
+;
+; jfss2fst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fraction bits of the F_* fixed-point constants; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433) = round(0.382683433 * 2^8)
+F_0_541 equ 139 ; FIX(0.541196100) = round(0.541196100 * 2^8)
+F_0_707 equ 181 ; FIX(0.707106781) = round(0.707106781 * 2^8)
+F_1_306 equ 334 ; FIX(1.306562965) = round(1.306562965 * 2^8)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fraction bits.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16, so that pmulhw (which keeps the high 16 bits of each product) returns the product already descaled
+
+%define PRE_MULTIPLY_SCALE_BITS 2 ; left shift (psllw) applied to a value before it is multiplied
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ; left shift baked into the PW_* words
+
+ alignz 16
+ global EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT ; 8 identical words: 0.707106781
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT ; 8 identical words: 0.382683433
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT ; 8 identical words: 0.541196100
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; 8 identical words: 1.306562965
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+;
+
+%define data(b) (b)+8 ; DCTELEM * data
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 16
+ global EXTN(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6,xmm1
+ movdqa xmm3,xmm0
+ psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2,xmm1
+ movdqa xmm5,xmm7
+ paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4,xmm3
+ movdqa xmm0,xmm6
+ psubw xmm3,xmm1 ; xmm3=tmp13
+ psubw xmm6,xmm7 ; xmm6=tmp12
+ paddw xmm4,xmm1 ; xmm4=tmp10
+ paddw xmm0,xmm7 ; xmm0=tmp11
+
+ paddw xmm6,xmm3
+ psllw xmm6,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+ movdqa xmm1,xmm4
+ movdqa xmm7,xmm3
+ psubw xmm4,xmm0 ; xmm4=data4
+ psubw xmm3,xmm6 ; xmm3=data6
+ paddw xmm1,xmm0 ; xmm1=data0
+ paddw xmm7,xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2,xmm5 ; xmm2=tmp10
+ paddw xmm5,xmm0 ; xmm5=tmp11
+ paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0,PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+ movdqa xmm4,xmm2 ; xmm4=tmp10
+ psubw xmm2,xmm0
+ pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+ pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4,xmm2 ; xmm4=z2
+ paddw xmm0,xmm2 ; xmm0=z4
+
+ movdqa xmm3,xmm6
+ psubw xmm6,xmm5 ; xmm6=z13
+ paddw xmm3,xmm5 ; xmm3=z11
+
+ movdqa xmm2,xmm6
+ movdqa xmm5,xmm3
+ psubw xmm6,xmm4 ; xmm6=data3
+ psubw xmm3,xmm0 ; xmm3=data7
+ paddw xmm2,xmm4 ; xmm2=data5
+ paddw xmm5,xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5,xmm6
+ movdqa xmm3,xmm1
+ psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7,xmm6
+ movdqa xmm0,xmm2
+ paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4,xmm3
+ movdqa xmm1,xmm5
+ psubw xmm3,xmm6 ; xmm3=tmp13
+ psubw xmm5,xmm2 ; xmm5=tmp12
+ paddw xmm4,xmm6 ; xmm4=tmp10
+ paddw xmm1,xmm2 ; xmm1=tmp11
+
+ paddw xmm5,xmm3
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+ movdqa xmm6,xmm4
+ movdqa xmm2,xmm3
+ psubw xmm4,xmm1 ; xmm4=data4
+ psubw xmm3,xmm5 ; xmm3=data6
+ paddw xmm6,xmm1 ; xmm6=data0
+ paddw xmm2,xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7,xmm0 ; xmm7=tmp10
+ paddw xmm0,xmm1 ; xmm0=tmp11
+ paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1,PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+ movdqa xmm4,xmm7 ; xmm4=tmp10
+ psubw xmm7,xmm1
+ pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+ pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4,xmm7 ; xmm4=z2
+ paddw xmm1,xmm7 ; xmm1=z4
+
+ movdqa xmm3,xmm5
+ psubw xmm5,xmm0 ; xmm5=z13
+ paddw xmm3,xmm0 ; xmm3=z11
+
+ movdqa xmm6,xmm5
+ movdqa xmm2,xmm3
+ psubw xmm5,xmm4 ; xmm5=data3
+ psubw xmm3,xmm1 ; xmm3=data7
+ paddw xmm6,xmm4 ; xmm6=data5
+ paddw xmm2,xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfss2int-64.asm b/simd/jfss2int-64.asm
new file mode 100644
index 0000000..b8ec4b5
--- /dev/null
+++ b/simd/jfss2int-64.asm
@@ -0,0 +1,622 @@
+;
+; jfss2int-64.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fractional bits of the F_x_xxx fixed-point multipliers
+%define PASS1_BITS 2 ; extra precision bits carried from pass 1 into pass 2
+
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; right shift after pass 1 multiplies
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS) ; right shift after pass 2 multiplies
+
+%if CONST_BITS == 13 ; precomputed FIX(x) = round(x * 2^13)
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n with rounding
+F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+; PW_* rows are word pairs for pmaddwd: (a,b) inputs times (c0,c1) give a*c0+b*c1 per dword.
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 ; data2 = tmp13*1.306 + tmp12*0.541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) ; data6 = tmp13*0.541 - tmp12*1.306
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 ; z3 = -z3*0.785 + z4*1.175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) ; z4 = z3*1.175 + z4*0.785
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 ; tmp4 = -tmp4*0.601 - tmp7*0.899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) ; tmp7 = -tmp4*0.899 + tmp7*0.601
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 ; tmp5 = -tmp5*0.509 - tmp6*2.562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) ; tmp6 = -tmp5*2.562 + tmp6*0.509
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; rounding bias added before >>DESCALE_P1
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; rounding bias added before >>DESCALE_P2
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) ; rounding bias added before >>PASS1_BITS
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one 8x8 block of samples, in place.
+; Pass 1 keeps its results in xmm registers and wk[]; only pass 2 stores to data.
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM * data)
+; data is read and written with movdqa, so it must be 16-byte aligned.
+
+; r10 = DCTELEM * data
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 6 ; scratch slots wk(0)..wk(5), located just below rbp
+
+ align 16
+ global EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push rbp
+ mov rax,rsp ; rax -> stack slot holding original rbp
+ sub rsp, byte 4 ; step below that slot; NOTE(review): only 4, yet 8 bytes are stored at [rsp] -- relies on rsp being 8-byte aligned, confirm
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; keep that pointer at the aligned frame base
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; rsp = lowest scratch slot, i.e. reserve wk[]
+ collect_args ; macro not defined in this file; presumably sets up r10 -- confirm
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; rdx = (DCTELEM *) data
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6,xmm1
+ movdqa xmm3,xmm0
+ psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2,xmm1
+ movdqa xmm5,xmm7
+ paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4,xmm3
+ movdqa xmm0,xmm6
+ paddw xmm3,xmm1 ; xmm3=tmp10
+ paddw xmm6,xmm7 ; xmm6=tmp11
+ psubw xmm4,xmm1 ; xmm4=tmp13
+ psubw xmm0,xmm7 ; xmm0=tmp12
+
+ movdqa xmm1,xmm3
+ paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3,PASS1_BITS ; xmm3=data0
+ psllw xmm1,PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7,xmm4 ; xmm4=tmp13
+ movdqa xmm6,xmm4
+ punpcklwd xmm7,xmm0 ; xmm0=tmp12
+ punpckhwd xmm6,xmm0
+ movdqa xmm4,xmm7
+ movdqa xmm0,xmm6
+ pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L
+ pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H
+ pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L
+ pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H
+
+ paddd xmm7,[rel PD_DESCALE_P1] ; add rounding bias, then shift right by DESCALE_P1
+ paddd xmm6,[rel PD_DESCALE_P1]
+ psrad xmm7,DESCALE_P1
+ psrad xmm6,DESCALE_P1
+ paddd xmm4,[rel PD_DESCALE_P1]
+ paddd xmm0,[rel PD_DESCALE_P1]
+ psrad xmm4,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+
+ packssdw xmm7,xmm6 ; xmm7=data2
+ packssdw xmm4,xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6,xmm2 ; xmm2=tmp4
+ movdqa xmm0,xmm5 ; xmm5=tmp5
+ paddw xmm6,xmm3 ; xmm6=z3
+ paddw xmm0,xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7,xmm6
+ movdqa xmm4,xmm6
+ punpcklwd xmm7,xmm0 ; xmm7=(z3,z4) word pairs, low half
+ punpckhwd xmm4,xmm0 ; xmm4=(z3,z4) word pairs, high half
+ movdqa xmm6,xmm7
+ movdqa xmm0,xmm4
+ pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L
+ pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H
+ pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L
+ pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7,xmm2
+ movdqa xmm4,xmm2
+ punpcklwd xmm7,xmm1 ; xmm7=(tmp4,tmp7) word pairs, low half
+ punpckhwd xmm4,xmm1 ; xmm4=(tmp4,tmp7) word pairs, high half
+ movdqa xmm2,xmm7
+ movdqa xmm1,xmm4
+ pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L
+ pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H
+ pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L
+ pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2,xmm6 ; xmm2=data1L
+ paddd xmm1,xmm0 ; xmm1=data1H
+
+ paddd xmm7,[rel PD_DESCALE_P1]
+ paddd xmm4,[rel PD_DESCALE_P1]
+ psrad xmm7,DESCALE_P1
+ psrad xmm4,DESCALE_P1
+ paddd xmm2,[rel PD_DESCALE_P1]
+ paddd xmm1,[rel PD_DESCALE_P1]
+ psrad xmm2,DESCALE_P1
+ psrad xmm1,DESCALE_P1
+
+ packssdw xmm7,xmm4 ; xmm7=data7
+ packssdw xmm2,xmm1 ; xmm2=data1
+
+ movdqa xmm4,xmm5
+ movdqa xmm1,xmm5
+ punpcklwd xmm4,xmm3 ; xmm4=(tmp5,tmp6) word pairs, low half
+ punpckhwd xmm1,xmm3 ; xmm1=(tmp5,tmp6) word pairs, high half
+ movdqa xmm5,xmm4
+ movdqa xmm3,xmm1
+ pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L
+ pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H
+ pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L
+ pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H
+
+ paddd xmm4,xmm6 ; xmm4=data5L
+ paddd xmm1,xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4,[rel PD_DESCALE_P1]
+ paddd xmm1,[rel PD_DESCALE_P1]
+ psrad xmm4,DESCALE_P1
+ psrad xmm1,DESCALE_P1
+ paddd xmm5,[rel PD_DESCALE_P1]
+ paddd xmm3,[rel PD_DESCALE_P1]
+ psrad xmm5,DESCALE_P1
+ psrad xmm3,DESCALE_P1
+
+ packssdw xmm4,xmm1 ; xmm4=data5
+ packssdw xmm5,xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2,xmm5
+ movdqa xmm7,xmm6
+ psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0,xmm5
+ movdqa xmm3,xmm4
+ paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1,xmm7
+ movdqa xmm6,xmm2
+ paddw xmm7,xmm5 ; xmm7=tmp10
+ paddw xmm2,xmm4 ; xmm2=tmp11
+ psubw xmm1,xmm5 ; xmm1=tmp13
+ psubw xmm6,xmm4 ; xmm6=tmp12
+
+ movdqa xmm5,xmm7
+ paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7,[rel PW_DESCALE_P2X] ; add rounding bias, then shift right by PASS1_BITS
+ paddw xmm5,[rel PW_DESCALE_P2X]
+ psraw xmm7,PASS1_BITS ; xmm7=data0
+ psraw xmm5,PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4,xmm1 ; xmm1=tmp13
+ movdqa xmm2,xmm1
+ punpcklwd xmm4,xmm6 ; xmm6=tmp12
+ punpckhwd xmm2,xmm6
+ movdqa xmm1,xmm4
+ movdqa xmm6,xmm2
+ pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L
+ pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H
+ pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L
+ pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H
+
+ paddd xmm4,[rel PD_DESCALE_P2] ; add rounding bias, then shift right by DESCALE_P2
+ paddd xmm2,[rel PD_DESCALE_P2]
+ psrad xmm4,DESCALE_P2
+ psrad xmm2,DESCALE_P2
+ paddd xmm1,[rel PD_DESCALE_P2]
+ paddd xmm6,[rel PD_DESCALE_P2]
+ psrad xmm1,DESCALE_P2
+ psrad xmm6,DESCALE_P2
+
+ packssdw xmm4,xmm2 ; xmm4=data2
+ packssdw xmm1,xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2,xmm0 ; xmm0=tmp4
+ movdqa xmm6,xmm3 ; xmm3=tmp5
+ paddw xmm2,xmm7 ; xmm2=z3
+ paddw xmm6,xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4,xmm2
+ movdqa xmm1,xmm2
+ punpcklwd xmm4,xmm6 ; xmm4=(z3,z4) word pairs, low half
+ punpckhwd xmm1,xmm6 ; xmm1=(z3,z4) word pairs, high half
+ movdqa xmm2,xmm4
+ movdqa xmm6,xmm1
+ pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L
+ pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H
+ pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L
+ pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4,xmm0
+ movdqa xmm1,xmm0
+ punpcklwd xmm4,xmm5 ; xmm4=(tmp4,tmp7) word pairs, low half
+ punpckhwd xmm1,xmm5 ; xmm1=(tmp4,tmp7) word pairs, high half
+ movdqa xmm0,xmm4
+ movdqa xmm5,xmm1
+ pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L
+ pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H
+ pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L
+ pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0,xmm2 ; xmm0=data1L
+ paddd xmm5,xmm6 ; xmm5=data1H
+
+ paddd xmm4,[rel PD_DESCALE_P2]
+ paddd xmm1,[rel PD_DESCALE_P2]
+ psrad xmm4,DESCALE_P2
+ psrad xmm1,DESCALE_P2
+ paddd xmm0,[rel PD_DESCALE_P2]
+ paddd xmm5,[rel PD_DESCALE_P2]
+ psrad xmm0,DESCALE_P2
+ psrad xmm5,DESCALE_P2
+
+ packssdw xmm4,xmm1 ; xmm4=data7
+ packssdw xmm0,xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1,xmm3
+ movdqa xmm5,xmm3
+ punpcklwd xmm1,xmm7 ; xmm1=(tmp5,tmp6) word pairs, low half
+ punpckhwd xmm5,xmm7 ; xmm5=(tmp5,tmp6) word pairs, high half
+ movdqa xmm3,xmm1
+ movdqa xmm7,xmm5
+ pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L
+ pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H
+ pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L
+ pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H
+
+ paddd xmm1,xmm2 ; xmm1=data5L
+ paddd xmm5,xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1,[rel PD_DESCALE_P2]
+ paddd xmm5,[rel PD_DESCALE_P2]
+ psrad xmm1,DESCALE_P2
+ psrad xmm5,DESCALE_P2
+ paddd xmm3,[rel PD_DESCALE_P2]
+ paddd xmm7,[rel PD_DESCALE_P2]
+ psrad xmm3,DESCALE_P2
+ psrad xmm7,DESCALE_P2
+
+ packssdw xmm1,xmm5 ; xmm1=data5
+ packssdw xmm3,xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+ uncollect_args
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- address of the saved original rbp
+ pop rbp ; restore the caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfss2int.asm b/simd/jfss2int.asm
new file mode 100644
index 0000000..5e3f2aa
--- /dev/null
+++ b/simd/jfss2int.asm
@@ -0,0 +1,634 @@
+;
+; jfss2int.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the FIX() constants below
+%define PASS1_BITS 2 ; extra scale kept in pass 1 output (psllw), removed in pass 2
+
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; psrad count after pass 1 multiplies
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS) ; psrad count after pass 2 multiplies
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and shifted down.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n with rounding
+F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 ; pmaddwd pair (tmp13,tmp12) -> data2
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) ; pmaddwd pair (tmp13,tmp12) -> data6
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 ; pmaddwd pair (z3,z4) -> z3
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) ; pmaddwd pair (z3,z4) -> z4
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 ; pmaddwd pair (tmp4,tmp7) -> tmp4
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) ; pmaddwd pair (tmp4,tmp7) -> tmp7
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 ; pmaddwd pair (tmp5,tmp6) -> tmp5
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) ; pmaddwd pair (tmp5,tmp6) -> tmp6
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; rounding bias added before psrad DESCALE_P2
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) ; rounding bias added before psraw PASS1_BITS
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples, in place.
+; (data is accessed with movdqa and must therefore be 16-byte aligned.)
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM * data)
+;
+
+%define data(b) (b)+8 ; DCTELEM * data (first stack argument)
+
+%define original_ebp ebp+0 ; slot holding the pre-alignment stack pointer
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 6 ; number of xmmword scratch slots below ebp
+
+ align 16
+ global EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push ebp ; save caller's ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; room for the saved stack pointer
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; [original_ebp] = unaligned stack pointer
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[0..WK_NUM-1]
+ pushpic ebx ; macro from jsimdext.inc (presumably saves ebx for PIC builds)
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows (results stay in registers and wk[2..5]).
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6,xmm1 ; xmm6=data1 (copy)
+ movdqa xmm3,xmm0 ; xmm3=data0 (copy)
+ psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2,xmm1 ; xmm2=data3 (copy)
+ movdqa xmm5,xmm7 ; xmm5=data2 (copy)
+ paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4,xmm3 ; xmm4=tmp0 (copy)
+ movdqa xmm0,xmm6 ; xmm0=tmp1 (copy)
+ paddw xmm3,xmm1 ; xmm3=tmp10
+ paddw xmm6,xmm7 ; xmm6=tmp11
+ psubw xmm4,xmm1 ; xmm4=tmp13
+ psubw xmm0,xmm7 ; xmm0=tmp12
+
+ movdqa xmm1,xmm3 ; xmm1=tmp10 (copy)
+ paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3,PASS1_BITS ; xmm3=data0
+ psllw xmm1,PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7,xmm4 ; xmm4=tmp13
+ movdqa xmm6,xmm4
+ punpcklwd xmm7,xmm0 ; xmm0=tmp12
+ punpckhwd xmm6,xmm0 ; xmm7/xmm6=(tmp13,tmp12) word pairs, low/high half
+ movdqa xmm4,xmm7 ; second copy of the low pairs
+ movdqa xmm0,xmm6 ; second copy of the high pairs
+ pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
+ pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
+ pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
+ pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
+
+ paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; add rounding bias
+ paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7,DESCALE_P1 ; descale to pass 1 precision
+ psrad xmm6,DESCALE_P1
+ paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+
+ packssdw xmm7,xmm6 ; xmm7=data2
+ packssdw xmm4,xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6,xmm2 ; xmm2=tmp4
+ movdqa xmm0,xmm5 ; xmm5=tmp5
+ paddw xmm6,xmm3 ; xmm6=z3
+ paddw xmm0,xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7,xmm6 ; xmm6=z3
+ movdqa xmm4,xmm6
+ punpcklwd xmm7,xmm0 ; xmm0=z4
+ punpckhwd xmm4,xmm0 ; xmm7/xmm4=(z3,z4) word pairs, low/high half
+ movdqa xmm6,xmm7 ; second copy of the low pairs
+ movdqa xmm0,xmm4 ; second copy of the high pairs
+ pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
+ pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
+ pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
+ pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7,xmm2 ; xmm2=tmp4
+ movdqa xmm4,xmm2
+ punpcklwd xmm7,xmm1 ; xmm1=tmp7
+ punpckhwd xmm4,xmm1 ; xmm7/xmm4=(tmp4,tmp7) word pairs, low/high half
+ movdqa xmm2,xmm7 ; second copy of the low pairs
+ movdqa xmm1,xmm4 ; second copy of the high pairs
+ pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
+ pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
+ pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
+ pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2,xmm6 ; xmm2=data1L
+ paddd xmm1,xmm0 ; xmm1=data1H
+
+ paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7,DESCALE_P1
+ psrad xmm4,DESCALE_P1
+ paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm2,DESCALE_P1
+ psrad xmm1,DESCALE_P1
+
+ packssdw xmm7,xmm4 ; xmm7=data7
+ packssdw xmm2,xmm1 ; xmm2=data1
+
+ movdqa xmm4,xmm5 ; xmm5=tmp5
+ movdqa xmm1,xmm5
+ punpcklwd xmm4,xmm3 ; xmm3=tmp6
+ punpckhwd xmm1,xmm3 ; xmm4/xmm1=(tmp5,tmp6) word pairs, low/high half
+ movdqa xmm5,xmm4 ; second copy of the low pairs
+ movdqa xmm3,xmm1 ; second copy of the high pairs
+ pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
+ pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
+ pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
+ pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
+
+ paddd xmm4,xmm6 ; xmm4=data5L
+ paddd xmm1,xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4,DESCALE_P1
+ psrad xmm1,DESCALE_P1
+ paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm5,DESCALE_P1
+ psrad xmm3,DESCALE_P1
+
+ packssdw xmm4,xmm1 ; xmm4=data5
+ packssdw xmm5,xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns (results are stored back through edx).
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2,xmm5 ; xmm2=data1 (copy)
+ movdqa xmm7,xmm6 ; xmm7=data0 (copy)
+ psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0,xmm5 ; xmm0=data3 (copy)
+ movdqa xmm3,xmm4 ; xmm3=data2 (copy)
+ paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1,xmm7 ; xmm1=tmp0 (copy)
+ movdqa xmm6,xmm2 ; xmm6=tmp1 (copy)
+ paddw xmm7,xmm5 ; xmm7=tmp10
+ paddw xmm2,xmm4 ; xmm2=tmp11
+ psubw xmm1,xmm5 ; xmm1=tmp13
+ psubw xmm6,xmm4 ; xmm6=tmp12
+
+ movdqa xmm5,xmm7 ; xmm5=tmp10 (copy)
+ paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] ; add rounding bias for the shift below
+ paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw xmm7,PASS1_BITS ; xmm7=data0
+ psraw xmm5,PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4,xmm1 ; xmm1=tmp13
+ movdqa xmm2,xmm1
+ punpcklwd xmm4,xmm6 ; xmm6=tmp12
+ punpckhwd xmm2,xmm6 ; xmm4/xmm2=(tmp13,tmp12) word pairs, low/high half
+ movdqa xmm1,xmm4 ; second copy of the low pairs
+ movdqa xmm6,xmm2 ; second copy of the high pairs
+ pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
+ pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
+ pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
+ pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
+
+ paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] ; add rounding bias
+ paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4,DESCALE_P2 ; descale to final output precision
+ psrad xmm2,DESCALE_P2
+ paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1,DESCALE_P2
+ psrad xmm6,DESCALE_P2
+
+ packssdw xmm4,xmm2 ; xmm4=data2
+ packssdw xmm1,xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2,xmm0 ; xmm0=tmp4
+ movdqa xmm6,xmm3 ; xmm3=tmp5
+ paddw xmm2,xmm7 ; xmm2=z3
+ paddw xmm6,xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4,xmm2 ; xmm2=z3
+ movdqa xmm1,xmm2
+ punpcklwd xmm4,xmm6 ; xmm6=z4
+ punpckhwd xmm1,xmm6 ; xmm4/xmm1=(z3,z4) word pairs, low/high half
+ movdqa xmm2,xmm4 ; second copy of the low pairs
+ movdqa xmm6,xmm1 ; second copy of the high pairs
+ pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
+ pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
+ pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
+ pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4,xmm0 ; xmm0=tmp4
+ movdqa xmm1,xmm0
+ punpcklwd xmm4,xmm5 ; xmm5=tmp7
+ punpckhwd xmm1,xmm5 ; xmm4/xmm1=(tmp4,tmp7) word pairs, low/high half
+ movdqa xmm0,xmm4 ; second copy of the low pairs
+ movdqa xmm5,xmm1 ; second copy of the high pairs
+ pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
+ pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
+ pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
+ pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0,xmm2 ; xmm0=data1L
+ paddd xmm5,xmm6 ; xmm5=data1H
+
+ paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4,DESCALE_P2
+ psrad xmm1,DESCALE_P2
+ paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm0,DESCALE_P2
+ psrad xmm5,DESCALE_P2
+
+ packssdw xmm4,xmm1 ; xmm4=data7
+ packssdw xmm0,xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1,xmm3 ; xmm3=tmp5
+ movdqa xmm5,xmm3
+ punpcklwd xmm1,xmm7 ; xmm7=tmp6
+ punpckhwd xmm5,xmm7 ; xmm1/xmm5=(tmp5,tmp6) word pairs, low/high half
+ movdqa xmm3,xmm1 ; second copy of the low pairs
+ movdqa xmm7,xmm5 ; second copy of the high pairs
+ pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
+ pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
+ pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
+ pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
+
+ paddd xmm1,xmm2 ; xmm1=data5L
+ paddd xmm5,xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1,DESCALE_P2
+ psrad xmm5,DESCALE_P2
+ paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm3,DESCALE_P2
+ psrad xmm7,DESCALE_P2
+
+ packssdw xmm1,xmm5 ; xmm1=data5
+ packssdw xmm3,xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx ; counterpart of pushpic in the prologue
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp ; restore caller's ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfsseflt-64.asm b/simd/jfsseflt-64.asm
new file mode 100644
index 0000000..0f3e21a
--- /dev/null
+++ b/simd/jfsseflt-64.asm
@@ -0,0 +1,358 @@
+;
+; jfsseflt-64.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1,%2,0x44 ; selector 01 00 01 00b: low two lanes of %1, then low two lanes of %2
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1,%2,0xEE ; selector 11 10 11 10b: high two lanes of %1, then high two lanes of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460 ; multiplier producing z5
+PD_0_707 times 4 dd 0.707106781186547524400844 ; multiplier producing z1 and z3
+PD_0_541 times 4 dd 0.541196100146196984399723 ; tmp10 multiplier used for z2
+PD_1_306 times 4 dd 1.306562964876376527856643 ; tmp12 multiplier used for z4
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples, in place.
+; (data is accessed with movaps and must therefore be 16-byte aligned.)
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT * data)
+;
+
+; r10 = FAST_FLOAT * data (presumably placed there by collect_args; confirm in jsimdext.inc)
+
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2 ; number of xmmword scratch slots below rbp
+
+ align 16
+ global EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push rbp ; save caller's rbp
+ mov rax,rsp ; rax = original rbp
+ sub rsp, byte 4 ; NOTE(review): only 4 bytes; the 8-byte store below fits only because rsp is 8-byte aligned here
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; save the unaligned stack pointer at [aligned rsp]
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve wk[0..WK_NUM-1]
+ collect_args ; macro from jsimdext.inc (not shown in this file)
+
+ ; ---- Pass 1: process rows (four rows per loop iteration).
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4 ; rcx = loop counter
+.rowloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0,xmm7 ; xmm0=data1 (copy)
+ movaps xmm5,xmm6 ; xmm5=data0 (copy)
+ subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2,xmm7 ; xmm2=data3 (copy)
+ movaps xmm3,xmm4 ; xmm3=data2 (copy)
+ addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1,xmm5 ; xmm1=tmp0 (copy)
+ movaps xmm6,xmm0 ; xmm6=tmp1 (copy)
+ subps xmm5,xmm7 ; xmm5=tmp13
+ subps xmm0,xmm4 ; xmm0=tmp12
+ addps xmm1,xmm7 ; xmm1=tmp10
+ addps xmm6,xmm4 ; xmm6=tmp11
+
+ addps xmm0,xmm5 ; xmm0=tmp12+tmp13
+ mulps xmm0,[rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7,xmm1 ; xmm7=tmp10 (copy)
+ movaps xmm4,xmm5 ; xmm4=tmp13 (copy)
+ subps xmm1,xmm6 ; xmm1=data4
+ subps xmm5,xmm0 ; xmm5=data6
+ addps xmm7,xmm6 ; xmm7=data0
+ addps xmm4,xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 ; stored still transposed; pass 2 transposes again on load
+ movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2,xmm3 ; xmm2=tmp10
+ addps xmm3,xmm6 ; xmm3=tmp11
+ addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3,[rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1,xmm2 ; xmm1=tmp10
+ subps xmm2,xmm6 ; xmm2=tmp10-tmp12
+ mulps xmm2,[rel PD_0_382] ; xmm2=z5
+ mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1,xmm2 ; xmm1=z2
+ addps xmm6,xmm2 ; xmm6=z4
+
+ movaps xmm5,xmm0 ; xmm5=tmp7 (copy)
+ subps xmm0,xmm3 ; xmm0=z13
+ addps xmm5,xmm3 ; xmm5=z11
+
+ movaps xmm7,xmm0 ; xmm7=z13 (copy)
+ movaps xmm4,xmm5 ; xmm4=z11 (copy)
+ subps xmm0,xmm1 ; xmm0=data3
+ subps xmm5,xmm6 ; xmm5=data7
+ addps xmm7,xmm1 ; xmm7=data5
+ addps xmm4,xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; advance to the next four rows
+ dec rcx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns (four columns per loop iteration).
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4 ; rcx = loop counter
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0,xmm7 ; xmm0=data1 (copy)
+ movaps xmm5,xmm6 ; xmm5=data0 (copy)
+ subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2,xmm7 ; xmm2=data3 (copy)
+ movaps xmm3,xmm4 ; xmm3=data2 (copy)
+ addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1,xmm5 ; xmm1=tmp0 (copy)
+ movaps xmm6,xmm0 ; xmm6=tmp1 (copy)
+ subps xmm5,xmm7 ; xmm5=tmp13
+ subps xmm0,xmm4 ; xmm0=tmp12
+ addps xmm1,xmm7 ; xmm1=tmp10
+ addps xmm6,xmm4 ; xmm6=tmp11
+
+ addps xmm0,xmm5 ; xmm0=tmp12+tmp13
+ mulps xmm0,[rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7,xmm1 ; xmm7=tmp10 (copy)
+ movaps xmm4,xmm5 ; xmm4=tmp13 (copy)
+ subps xmm1,xmm6 ; xmm1=data4
+ subps xmm5,xmm0 ; xmm5=data6
+ addps xmm7,xmm6 ; xmm7=data0
+ addps xmm4,xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2,xmm3 ; xmm2=tmp10
+ addps xmm3,xmm6 ; xmm3=tmp11
+ addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3,[rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1,xmm2 ; xmm1=tmp10
+ subps xmm2,xmm6 ; xmm2=tmp10-tmp12
+ mulps xmm2,[rel PD_0_382] ; xmm2=z5
+ mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1,xmm2 ; xmm1=z2
+ addps xmm6,xmm2 ; xmm6=z4
+
+ movaps xmm5,xmm0 ; xmm5=tmp7 (copy)
+ subps xmm0,xmm3 ; xmm0=z13
+ addps xmm5,xmm3 ; xmm5=z11
+
+ movaps xmm7,xmm0 ; xmm7=z13 (copy)
+ movaps xmm4,xmm5 ; xmm4=z11 (copy)
+ subps xmm0,xmm1 ; xmm0=data3
+ subps xmm5,xmm6 ; xmm5=data7
+ addps xmm7,xmm1 ; xmm7=data5
+ addps xmm4,xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, byte 4*SIZEOF_FAST_FLOAT ; advance to the next four columns
+ dec rcx
+ jnz near .columnloop
+
+ uncollect_args ; counterpart of collect_args in the prologue
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp ; restore caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jfsseflt.asm b/simd/jfsseflt.asm
new file mode 100644
index 0000000..bc54ccc
--- /dev/null
+++ b/simd/jfsseflt.asm
@@ -0,0 +1,370 @@
+;
+; jfsseflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1,%2,0x44 ; imm 01_00_01_00b: lanes 0,1 of %1 then lanes 0,1 of %2
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1,%2,0xEE ; imm 11_10_11_10b: lanes 2,3 of %1 then lanes 2,3 of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460 ; cos(3*pi/8) = sin(pi/8), one copy per lane
+PD_0_707 times 4 dd 0.707106781186547524400844 ; cos(pi/4) = 1/sqrt(2)
+PD_0_541 times 4 dd 0.541196100146196984399723 ; cos(pi/8) - cos(3*pi/8)
+PD_1_306 times 4 dd 1.306562964876376527856643 ; cos(pi/8) + cos(3*pi/8)
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+; The block at 'data' is transformed in place, in two passes of 4 lanes each.
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT * data)
+; 'data' is accessed with movaps, so it must be 16-byte aligned.
+
+%define data(b) (b)+8 ; FAST_FLOAT * data
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 16
+ global EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push ebp
+ mov eax,esp ; eax = original ebp (unaligned frame base; arguments start at eax+8)
+ sub esp, byte 4 ; room for the saved frame base
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; [original_ebp] = unaligned frame base
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; allocate wk[WK_NUM] just below the aligned ebp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4 ; 4 rows are handled per iteration
+ alignx 16,7
+.rowloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0,xmm7
+ movaps xmm5,xmm6
+ subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2,xmm7
+ movaps xmm3,xmm4
+ addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1,xmm5
+ movaps xmm6,xmm0
+ subps xmm5,xmm7 ; xmm5=tmp13
+ subps xmm0,xmm4 ; xmm0=tmp12
+ addps xmm1,xmm7 ; xmm1=tmp10
+ addps xmm6,xmm4 ; xmm6=tmp11
+
+ addps xmm0,xmm5
+ mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7,xmm1
+ movaps xmm4,xmm5
+ subps xmm1,xmm6 ; xmm1=data4
+ subps xmm5,xmm0 ; xmm5=data6
+ addps xmm7,xmm6 ; xmm7=data0
+ addps xmm4,xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 ; results are left transposed within each 4x4 tile;
+ movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 ; pass 2 transposes again when it loads them
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2,xmm3 ; xmm2=tmp10
+ addps xmm3,xmm6 ; xmm3=tmp11
+ addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1,xmm2 ; xmm1=tmp10
+ subps xmm2,xmm6
+ mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1,xmm2 ; xmm1=z2
+ addps xmm6,xmm2 ; xmm6=z4
+
+ movaps xmm5,xmm0
+ subps xmm0,xmm3 ; xmm0=z13
+ addps xmm5,xmm3 ; xmm5=z11
+
+ movaps xmm7,xmm0
+ movaps xmm4,xmm5
+ subps xmm0,xmm1 ; xmm0=data3
+ subps xmm5,xmm6 ; xmm5=data7
+ addps xmm7,xmm1 ; xmm7=data5
+ addps xmm4,xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; advance to the next 4 rows
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4 ; 4 columns are handled per iteration
+ alignx 16,7
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4,xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5,xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4,xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2,xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7,xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3,xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0,xmm7
+ movaps xmm5,xmm6
+ subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7,xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6,xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2,xmm7
+ movaps xmm3,xmm4
+ addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1,xmm5
+ movaps xmm6,xmm0
+ subps xmm5,xmm7 ; xmm5=tmp13
+ subps xmm0,xmm4 ; xmm0=tmp12
+ addps xmm1,xmm7 ; xmm1=tmp10
+ addps xmm6,xmm4 ; xmm6=tmp11
+
+ addps xmm0,xmm5
+ mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7,xmm1
+ movaps xmm4,xmm5
+ subps xmm1,xmm6 ; xmm1=data4
+ subps xmm5,xmm0 ; xmm5=data6
+ addps xmm7,xmm6 ; xmm7=data0
+ addps xmm4,xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2,xmm3 ; xmm2=tmp10
+ addps xmm3,xmm6 ; xmm3=tmp11
+ addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1,xmm2 ; xmm1=tmp10
+ subps xmm2,xmm6
+ mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1,xmm2 ; xmm1=z2
+ addps xmm6,xmm2 ; xmm6=z4
+
+ movaps xmm5,xmm0
+ subps xmm0,xmm3 ; xmm0=z13
+ addps xmm5,xmm3 ; xmm5=z11
+
+ movaps xmm7,xmm0
+ movaps xmm4,xmm5
+ subps xmm0,xmm1 ; xmm0=data3
+ subps xmm5,xmm6 ; xmm5=data7
+ addps xmm7,xmm1 ; xmm7=data5
+ addps xmm4,xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, byte 4*SIZEOF_FAST_FLOAT ; advance to the next 4 columns
+ dec ecx
+ jnz near .columnloop
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/ji3dnflt.asm b/simd/ji3dnflt.asm
new file mode 100644
index 0000000..dc2076f
--- /dev/null
+++ b/simd/ji3dnflt.asm
@@ -0,0 +1,452 @@
+;
+; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414 times 2 dd 1.414213562373095048801689 ; sqrt(2) = 2*cos(pi/4)
+PD_1_847 times 2 dd 1.847759065022573512256366 ; 2*cos(pi/8)
+PD_1_082 times 2 dd 1.082392200292393968799446 ; 2*(cos(pi/8) - cos(3*pi/8))
+PD_2_613 times 2 dd 2.613125929752753055713286 ; 2*(cos(pi/8) + cos(3*pi/8))
+PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) = 1.5*2^26; float ulp here is 8, so x+magic holds round(x/8) in its low bits
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; added to the packed signed bytes in pass 2
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Pass 1 (columns) writes floats to a stack workspace; pass 2 (rows) rounds and stores 8 samples per output row.
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 16
+ global EXTN(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+ push ebp
+ mov eax,esp ; eax = original ebp (unaligned frame base; arguments start at eax+8)
+ sub esp, byte 4 ; room for the saved frame base
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; [original_ebp] = unaligned frame base
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [workspace] ; allocate wk[WK_NUM] and workspace[] below the aligned ebp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT * wsptr
+ mov ecx, DCTSIZE/2 ; ctr (2 columns per iteration)
+ alignx 16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; quick test: rows 1-2 of this column pair
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ pushpic ebx ; save GOT address
+ mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] ; ebx is scratch while rows 3-7 are OR-ed together
+ mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ or eax,ebx ; sets ZF for the jnz below (the pop in between leaves flags alone)
+ poppic ebx ; restore GOT address
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0,mm0 ; duplicate each word into both halves of a dword
+ psrad mm0,(DWORD_BIT-WORD_BIT) ; arithmetic shift sign-extends word -> dword
+ pi2fd mm0,mm0 ; int32 -> float
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize the two DC terms
+
+ movq mm1,mm0
+ punpckldq mm0,mm0 ; mm0=(DC of column 0, twice)
+ punpckhdq mm1,mm1 ; mm1=(DC of column 1, twice)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+ jmp near .nextcolumn
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0,mm0
+ punpcklwd mm1,mm1
+ psrad mm0,(DWORD_BIT-WORD_BIT)
+ psrad mm1,(DWORD_BIT-WORD_BIT)
+ pi2fd mm0,mm0
+ pi2fd mm1,mm1
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm2,mm2
+ punpcklwd mm3,mm3
+ psrad mm2,(DWORD_BIT-WORD_BIT)
+ psrad mm3,(DWORD_BIT-WORD_BIT)
+ pi2fd mm2,mm2
+ pi2fd mm3,mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4,mm0
+ movq mm5,mm1
+ pfsub mm0,mm2 ; mm0=tmp11
+ pfsub mm1,mm3
+ pfadd mm4,mm2 ; mm4=tmp10
+ pfadd mm5,mm3 ; mm5=tmp13
+
+ pfmul mm1,[GOTOFF(ebx,PD_1_414)]
+ pfsub mm1,mm5 ; mm1=tmp12
+
+ movq mm6,mm4
+ movq mm7,mm0
+ pfsub mm4,mm5 ; mm4=tmp3
+ pfsub mm0,mm1 ; mm0=tmp2
+ pfadd mm6,mm5 ; mm6=tmp0
+ pfadd mm7,mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm2,mm2
+ punpcklwd mm3,mm3
+ psrad mm2,(DWORD_BIT-WORD_BIT)
+ psrad mm3,(DWORD_BIT-WORD_BIT)
+ pi2fd mm2,mm2
+ pi2fd mm3,mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm5,mm5
+ punpcklwd mm1,mm1
+ psrad mm5,(DWORD_BIT-WORD_BIT)
+ psrad mm1,(DWORD_BIT-WORD_BIT)
+ pi2fd mm5,mm5
+ pi2fd mm1,mm1
+
+ pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4,mm2
+ movq mm0,mm5
+ pfadd mm2,mm1 ; mm2=z11
+ pfadd mm5,mm3 ; mm5=z13
+ pfsub mm4,mm1 ; mm4=z12
+ pfsub mm0,mm3 ; mm0=z10
+
+ movq mm1,mm2
+ pfsub mm2,mm5
+ pfadd mm1,mm5 ; mm1=tmp7
+
+ pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3,mm0
+ pfadd mm0,mm4
+ pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3,mm0 ; mm3=tmp12
+ pfsub mm4,mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3,mm1 ; mm3=tmp6
+ movq mm5,mm6
+ movq mm0,mm7
+ pfadd mm6,mm1 ; mm6=data0=(00 01)
+ pfadd mm7,mm3 ; mm7=data1=(10 11)
+ pfsub mm5,mm1 ; mm5=data7=(70 71)
+ pfsub mm0,mm3 ; mm0=data6=(60 61)
+ pfsub mm2,mm3 ; mm2=tmp5
+
+ movq mm1,mm6 ; transpose coefficients
+ punpckldq mm6,mm7 ; mm6=(00 10)
+ punpckhdq mm1,mm7 ; mm1=(01 11)
+ movq mm3,mm0 ; transpose coefficients
+ punpckldq mm0,mm5 ; mm0=(60 70)
+ punpckhdq mm3,mm5 ; mm3=(61 71)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm5, MMWORD [wk(1)] ; mm5=tmp3
+
+ pfadd mm4,mm2 ; mm4=tmp4
+ movq mm6,mm7
+ movq mm1,mm5
+ pfadd mm7,mm2 ; mm7=data2=(20 21)
+ pfadd mm5,mm4 ; mm5=data4=(40 41)
+ pfsub mm6,mm2 ; mm6=data5=(50 51)
+ pfsub mm1,mm4 ; mm1=data3=(30 31)
+
+ movq mm0,mm7 ; transpose coefficients
+ punpckldq mm7,mm1 ; mm7=(20 30)
+ punpckhdq mm0,mm1 ; mm0=(21 31)
+ movq mm3,mm5 ; transpose coefficients
+ punpckldq mm5,mm6 ; mm5=(40 50)
+ punpckhdq mm3,mm6 ; mm3=(41 51)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+ add esi, byte 2*SIZEOF_JCOEF ; coef_block
+ add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; reload the frame base (eax is scratch in the zero-column test)
+ lea esi, [workspace] ; FAST_FLOAT * wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col for the rest of pass 2
+ mov ecx, DCTSIZE/2 ; ctr (2 rows per iteration)
+ alignx 16,7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4,mm0
+ movq mm5,mm1
+ pfsub mm0,mm2 ; mm0=tmp11
+ pfsub mm1,mm3
+ pfadd mm4,mm2 ; mm4=tmp10
+ pfadd mm5,mm3 ; mm5=tmp13
+
+ pfmul mm1,[GOTOFF(ebx,PD_1_414)]
+ pfsub mm1,mm5 ; mm1=tmp12
+
+ movq mm6,mm4
+ movq mm7,mm0
+ pfsub mm4,mm5 ; mm4=tmp3
+ pfsub mm0,mm1 ; mm0=tmp2
+ pfadd mm6,mm5 ; mm6=tmp0
+ pfadd mm7,mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4,mm2
+ movq mm0,mm5
+ pfadd mm2,mm1 ; mm2=z11
+ pfadd mm5,mm3 ; mm5=z13
+ pfsub mm4,mm1 ; mm4=z12
+ pfsub mm0,mm3 ; mm0=z10
+
+ movq mm1,mm2
+ pfsub mm2,mm5
+ pfadd mm1,mm5 ; mm1=tmp7
+
+ pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3,mm0
+ pfadd mm0,mm4
+ pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3,mm0 ; mm3=tmp12
+ pfsub mm4,mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3,mm1 ; mm3=tmp6
+ movq mm5,mm6
+ movq mm0,mm7
+ pfadd mm6,mm1 ; mm6=data0=(00 10)
+ pfadd mm7,mm3 ; mm7=data1=(01 11)
+ pfsub mm5,mm1 ; mm5=data7=(07 17)
+ pfsub mm0,mm3 ; mm0=data6=(06 16)
+ pfsub mm2,mm3 ; mm2=tmp5
+
+ movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
+ pcmpeqd mm3,mm3
+ psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
+ pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
+ pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
+ pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+ pand mm6,mm3 ; mm6=(00 -- 10 --)
+ pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11)
+ pand mm0,mm3 ; mm0=(06 -- 16 --)
+ pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17)
+ por mm6,mm7 ; mm6=(00 01 10 11)
+ por mm0,mm5 ; mm0=(06 07 16 17)
+
+ movq mm1, MMWORD [wk(0)] ; mm1=tmp2
+ movq mm3, MMWORD [wk(1)] ; mm3=tmp3
+
+ pfadd mm4,mm2 ; mm4=tmp4
+ movq mm7,mm1
+ movq mm5,mm3
+ pfadd mm1,mm2 ; mm1=data2=(02 12)
+ pfadd mm3,mm4 ; mm3=data4=(04 14)
+ pfsub mm7,mm2 ; mm7=data5=(05 15)
+ pfsub mm5,mm4 ; mm5=data3=(03 13)
+
+ movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
+ pcmpeqd mm4,mm4
+ psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
+ pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
+ pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
+ pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+ pand mm3,mm4 ; mm3=(04 -- 14 --)
+ pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15)
+ pand mm1,mm4 ; mm1=(02 -- 12 --)
+ pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13)
+ por mm3,mm7 ; mm3=(04 05 14 15)
+ por mm1,mm5 ; mm1=(02 03 12 13)
+
+ movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
+
+ packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15)
+ packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17)
+ paddb mm6,mm2
+ paddb mm1,mm2
+
+ movq mm4,mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17)
+
+ movq mm7,mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = first output row of this pair
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; ebx = second output row (ebx is scratch here)
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 ; 8 samples at column offset output_col
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 2*SIZEOF_JSAMPROW ; next pair of output row pointers
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jimmxfst.asm b/simd/jimmxfst.asm
new file mode 100644
index 0000000..3b05572
--- /dev/null
+++ b/simd/jimmxfst.asm
@@ -0,0 +1,500 @@
+;
+; jimmxfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fractional bits of the F_* constants below; 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the values below are given pre-scaled by 2^30.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; x >> n, rounded to nearest
+F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 16
+ global EXTN(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT ; -(2.613125930 - 1): F_2_613 << CONST_SHIFT would not fit in a signed word
+PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 16
+ global EXTN(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF * wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1,mm0
+ packsswb mm1,mm1
+ movd eax,mm1
+ test eax,eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm2,mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
+
+ movq mm1,mm0
+ punpckldq mm0,mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
+ movq mm3,mm2
+ punpckldq mm2,mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm4,mm0
+ movq mm5,mm1
+ psubw mm0,mm2 ; mm0=tmp11
+ psubw mm1,mm3
+ paddw mm4,mm2 ; mm4=tmp10
+ paddw mm5,mm3 ; mm5=tmp13
+
+ psllw mm1,PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
+ psubw mm1,mm5 ; mm1=tmp12
+
+ movq mm6,mm4
+ movq mm7,mm0
+ psubw mm4,mm5 ; mm4=tmp3
+ psubw mm0,mm1 ; mm0=tmp2
+ paddw mm6,mm5 ; mm6=tmp0
+ paddw mm7,mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm4,mm2
+ movq mm0,mm5
+ psubw mm2,mm1 ; mm2=z12
+ psubw mm5,mm3 ; mm5=z10
+ paddw mm4,mm1 ; mm4=z11
+ paddw mm0,mm3 ; mm0=z13
+
+ movq mm1,mm5 ; mm1=z10(unscaled)
+ psllw mm2,PRE_MULTIPLY_SCALE_BITS
+ psllw mm5,PRE_MULTIPLY_SCALE_BITS
+
+ movq mm3,mm4
+ psubw mm4,mm0
+ paddw mm3,mm0 ; mm3=tmp7
+
+ psllw mm4,PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movq mm0,mm5
+ paddw mm5,mm2
+ pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
+ pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
+ psubw mm0,mm1
+ psubw mm2,mm5 ; mm2=tmp10
+ paddw mm0,mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0,mm3 ; mm0=tmp6
+ movq mm1,mm6
+ movq mm5,mm7
+ paddw mm6,mm3 ; mm6=data0=(00 01 02 03)
+ paddw mm7,mm0 ; mm7=data1=(10 11 12 13)
+ psubw mm1,mm3 ; mm1=data7=(70 71 72 73)
+ psubw mm5,mm0 ; mm5=data6=(60 61 62 63)
+ psubw mm4,mm0 ; mm4=tmp5
+
+ movq mm3,mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm3,mm7 ; mm3=(02 12 03 13)
+ movq mm0,mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5,mm1 ; mm5=(60 70 61 71)
+ punpckhwd mm0,mm1 ; mm0=(62 72 63 73)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm1, MMWORD [wk(1)] ; mm1=tmp3
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
+ movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
+
+ paddw mm2,mm4 ; mm2=tmp4
+ movq mm5,mm7
+ movq mm0,mm1
+ paddw mm7,mm4 ; mm7=data2=(20 21 22 23)
+ paddw mm1,mm2 ; mm1=data4=(40 41 42 43)
+ psubw mm5,mm4 ; mm5=data5=(50 51 52 53)
+ psubw mm0,mm2 ; mm0=data3=(30 31 32 33)
+
+ movq mm4,mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7,mm0 ; mm7=(20 30 21 31)
+ punpckhwd mm4,mm0 ; mm4=(22 32 23 33)
+ movq mm2,mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1,mm5 ; mm1=(40 50 41 51)
+ punpckhwd mm2,mm5 ; mm2=(42 52 43 53)
+
+ movq mm0,mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6,mm7 ; mm6=(00 10 20 30)
+ punpckhdq mm0,mm7 ; mm0=(01 11 21 31)
+ movq mm5,mm3 ; transpose coefficients(phase 2)
+ punpckldq mm3,mm4 ; mm3=(02 12 22 32)
+ punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
+ movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+ movq mm6,mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1,mm7 ; mm1=(40 50 60 70)
+ punpckhdq mm6,mm7 ; mm6=(41 51 61 71)
+ movq mm0,mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2,mm4 ; mm2=(42 52 62 72)
+ punpckhdq mm0,mm4 ; mm0=(43 53 63 73)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; JCOEF * wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16,7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4,mm0
+ movq mm5,mm1
+ psubw mm0,mm2 ; mm0=tmp11
+ psubw mm1,mm3
+ paddw mm4,mm2 ; mm4=tmp10
+ paddw mm5,mm3 ; mm5=tmp13
+
+ psllw mm1,PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
+ psubw mm1,mm5 ; mm1=tmp12
+
+ movq mm6,mm4
+ movq mm7,mm0
+ psubw mm4,mm5 ; mm4=tmp3
+ psubw mm0,mm1 ; mm0=tmp2
+ paddw mm6,mm5 ; mm6=tmp0
+ paddw mm7,mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4,mm2
+ movq mm0,mm5
+ psubw mm2,mm1 ; mm2=z12
+ psubw mm5,mm3 ; mm5=z10
+ paddw mm4,mm1 ; mm4=z11
+ paddw mm0,mm3 ; mm0=z13
+
+ movq mm1,mm5 ; mm1=z10(unscaled)
+ psllw mm2,PRE_MULTIPLY_SCALE_BITS
+ psllw mm5,PRE_MULTIPLY_SCALE_BITS
+
+ movq mm3,mm4
+ psubw mm4,mm0
+ paddw mm3,mm0 ; mm3=tmp7
+
+ psllw mm4,PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movq mm0,mm5
+ paddw mm5,mm2
+ pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
+ pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
+ psubw mm0,mm1
+ psubw mm2,mm5 ; mm2=tmp10
+ paddw mm0,mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0,mm3 ; mm0=tmp6
+ movq mm1,mm6
+ movq mm5,mm7
+ paddw mm6,mm3 ; mm6=data0=(00 10 20 30)
+ paddw mm7,mm0 ; mm7=data1=(01 11 21 31)
+ psraw mm6,(PASS1_BITS+3) ; descale
+ psraw mm7,(PASS1_BITS+3) ; descale
+ psubw mm1,mm3 ; mm1=data7=(07 17 27 37)
+ psubw mm5,mm0 ; mm5=data6=(06 16 26 36)
+ psraw mm1,(PASS1_BITS+3) ; descale
+ psraw mm5,(PASS1_BITS+3) ; descale
+ psubw mm4,mm0 ; mm4=tmp5
+
+ packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36)
+ packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37)
+
+ movq mm3, MMWORD [wk(0)] ; mm3=tmp2
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp3
+
+ paddw mm2,mm4 ; mm2=tmp4
+ movq mm5,mm3
+ movq mm1,mm0
+ paddw mm3,mm4 ; mm3=data2=(02 12 22 32)
+ paddw mm0,mm2 ; mm0=data4=(04 14 24 34)
+ psraw mm3,(PASS1_BITS+3) ; descale
+ psraw mm0,(PASS1_BITS+3) ; descale
+ psubw mm5,mm4 ; mm5=data5=(05 15 25 35)
+ psubw mm1,mm2 ; mm1=data3=(03 13 23 33)
+ psraw mm5,(PASS1_BITS+3) ; descale
+ psraw mm1,(PASS1_BITS+3) ; descale
+
+ movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
+
+ packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34)
+ packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35)
+
+ paddb mm6,mm4
+ paddb mm7,mm4
+ paddb mm3,mm4
+ paddb mm1,mm4
+
+ movq mm2,mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm0,mm3 ; transpose coefficients(phase 1)
+ punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33)
+ punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35)
+
+ movq mm5,mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm4,mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17)
+ punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37)
+
+ movq mm7,mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17)
+ movq mm1,mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jimmxint.asm b/simd/jimmxint.asm
new file mode 100644
index 0000000..7b52fae
--- /dev/null
+++ b/simd/jimmxint.asm
@@ -0,0 +1,852 @@
+;
+; jimmxint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the F_* fixed-point constants
+%define PASS1_BITS 2 ; extra fraction bits kept in the pass 1 output
+
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; right shift applied to pass 1 results
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) ; right shift applied to pass 2 results
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each literal below is the constant pre-scaled by 2^30 and DESCALE() rounds it to CONST_BITS fraction bits.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; round-to-nearest right shift by n
+F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+; Each PW_* entry is a coefficient pair repeated twice; it is used as a pmaddwd operand against two interleaved inputs.
+PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 ; (z2, z3) -> tmp3
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) ; (z2, z3) -> tmp2
+PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 ; (z3, z4) -> z3
+PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) ; (z3, z4) -> z4
+PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 ; (tmp0, tmp3) -> tmp0 (z3 added afterwards)
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) ; (tmp0, tmp3) -> tmp3 (z4 added afterwards)
+PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 ; (tmp1, tmp2) -> tmp1 (z4 added afterwards)
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) ; (tmp1, tmp2) -> tmp2 (z3 added afterwards)
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) ; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) ; rounding bias added before psrad DESCALE_P2
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; added with paddb to the packed output bytes
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0 ; slot at the aligned ebp that holds the original ebp
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 12 ; wk(0)..wk(11) are all used below
+%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 16
+ global EXTN(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp (arguments are at eax+8 and up)
+ sub esp, byte 4 ; make room for the saved original ebp
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; [original_ebp] = original ebp
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [workspace] ; reserve wk[] and workspace[] below the aligned ebp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp] ; (disabled: eax is expected to still hold it from the prologue)
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF * wsptr
+ mov ecx, DCTSIZE/4 ; ctr (each iteration handles 4 columns)
+ alignx 16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; quick pre-test: one dword from row 1 ...
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ... OR-ed with one dword from row 2
+ jnz short .columnDCT ; nonzero AC term found: run the full column DCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; full test: OR rows 1..7 of these 4 columns
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1,mm0 ; mm1 = OR of rows 1..7
+ packsswb mm1,mm1 ; saturating pack: a nonzero word stays a nonzero byte
+ movd eax,mm1 ; low dword now covers all 4 columns
+ test eax,eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0 (the DC terms)
+
+ psllw mm0,PASS1_BITS ; scale to the pass 1 output precision
+
+ movq mm2,mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
+
+ movq mm1,mm0
+ punpckldq mm0,mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
+ movq mm3,mm2
+ punpckldq mm2,mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 ; each scaled DC value fills both
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 ; mmwords of its workspace row
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn ; skip the full column DCT
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movq mm4,mm1 ; mm1=in2=z2
+ movq mm5,mm1
+ punpcklwd mm4,mm3 ; mm3=in6=z3
+ punpckhwd mm5,mm3
+ movq mm1,mm4
+ movq mm3,mm5
+ pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6,mm0
+ paddw mm0,mm2 ; mm0=in0+in4
+ psubw mm6,mm2 ; mm6=in0-in4
+
+ pxor mm7,mm7
+ pxor mm2,mm2
+ punpcklwd mm7,mm0 ; mm7=tmp0L
+ punpckhwd mm2,mm0 ; mm2=tmp0H
+ psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0,mm7
+ paddd mm7,mm4 ; mm7=tmp10L
+ psubd mm0,mm4 ; mm0=tmp13L
+ movq mm4,mm2
+ paddd mm2,mm5 ; mm2=tmp10H
+ psubd mm4,mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5,mm5
+ pxor mm7,mm7
+ punpcklwd mm5,mm6 ; mm5=tmp1L
+ punpckhwd mm7,mm6 ; mm7=tmp1H
+ psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2,mm5
+ paddd mm5,mm1 ; mm5=tmp11L
+ psubd mm2,mm1 ; mm2=tmp12L
+ movq mm0,mm7
+ paddd mm7,mm3 ; mm7=tmp11H
+ psubd mm0,mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movq mm5,mm6
+ movq mm7,mm4
+ paddw mm5,mm3 ; mm5=z3
+ paddw mm7,mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2,mm5
+ movq mm0,mm5
+ punpcklwd mm2,mm7
+ punpckhwd mm0,mm7
+ movq mm5,mm2
+ movq mm7,mm0
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2,mm3
+ movq mm0,mm3
+ punpcklwd mm2,mm4
+ punpckhwd mm0,mm4
+ movq mm3,mm2
+ movq mm4,mm0
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3,mm5 ; mm3=tmp3L
+ paddd mm4,mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2,mm1
+ movq mm0,mm1
+ punpcklwd mm2,mm6
+ punpckhwd mm0,mm6
+ movq mm1,mm2
+ movq mm6,mm0
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2,mm5 ; mm2=tmp1L
+ paddd mm0,mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2,mm5
+ movq mm0,mm7
+ paddd mm5,mm3 ; mm5=data0L
+ paddd mm7,mm4 ; mm7=data0H
+ psubd mm2,mm3 ; mm2=data7L
+ psubd mm0,mm4 ; mm0=data7H
+
+ movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
+
+ paddd mm5,mm3
+ paddd mm7,mm3
+ psrad mm5,DESCALE_P1
+ psrad mm7,DESCALE_P1
+ paddd mm2,mm3
+ paddd mm0,mm3
+ psrad mm2,DESCALE_P1
+ psrad mm0,DESCALE_P1
+
+ packssdw mm5,mm7 ; mm5=data0=(00 01 02 03)
+ packssdw mm2,mm0 ; mm2=data7=(70 71 72 73)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7,mm4
+ movq mm0,mm3
+ paddd mm4,mm1 ; mm4=data1L
+ paddd mm3,mm6 ; mm3=data1H
+ psubd mm7,mm1 ; mm7=data6L
+ psubd mm0,mm6 ; mm0=data6H
+
+ movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
+
+ paddd mm4,mm1
+ paddd mm3,mm1
+ psrad mm4,DESCALE_P1
+ psrad mm3,DESCALE_P1
+ paddd mm7,mm1
+ paddd mm0,mm1
+ psrad mm7,DESCALE_P1
+ psrad mm0,DESCALE_P1
+
+ packssdw mm4,mm3 ; mm4=data1=(10 11 12 13)
+ packssdw mm7,mm0 ; mm7=data6=(60 61 62 63)
+
+ movq mm6,mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5,mm4 ; mm5=(00 10 01 11)
+ punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
+ movq mm1,mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7,mm2 ; mm7=(60 70 61 71)
+ punpckhwd mm1,mm2 ; mm1=(62 72 63 73)
+
+ movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
+ movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
+ movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
+ movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
+ movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
+ movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
+ movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
+
+ movq mm5,mm3
+ movq mm6,mm0
+ paddd mm3,mm4 ; mm3=data2L
+ paddd mm0,mm2 ; mm0=data2H
+ psubd mm5,mm4 ; mm5=data5L
+ psubd mm6,mm2 ; mm6=data5H
+
+ movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
+
+ paddd mm3,mm7
+ paddd mm0,mm7
+ psrad mm3,DESCALE_P1
+ psrad mm0,DESCALE_P1
+ paddd mm5,mm7
+ paddd mm6,mm7
+ psrad mm5,DESCALE_P1
+ psrad mm6,DESCALE_P1
+
+ packssdw mm3,mm0 ; mm3=data2=(20 21 22 23)
+ packssdw mm5,mm6 ; mm5=data5=(50 51 52 53)
+
+ movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
+ movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
+ movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
+ movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
+
+ movq mm0,mm1
+ movq mm6,mm4
+ paddd mm1,mm2 ; mm1=data3L
+ paddd mm4,mm7 ; mm4=data3H
+ psubd mm0,mm2 ; mm0=data4L
+ psubd mm6,mm7 ; mm6=data4H
+
+ movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
+
+ paddd mm1,mm2
+ paddd mm4,mm2
+ psrad mm1,DESCALE_P1
+ psrad mm4,DESCALE_P1
+ paddd mm0,mm2
+ paddd mm6,mm2
+ psrad mm0,DESCALE_P1
+ psrad mm6,DESCALE_P1
+
+ packssdw mm1,mm4 ; mm1=data3=(30 31 32 33)
+ packssdw mm0,mm6 ; mm0=data4=(40 41 42 43)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
+ movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
+
+ movq mm4,mm3 ; transpose coefficients(phase 1)
+ punpcklwd mm3,mm1 ; mm3=(20 30 21 31)
+ punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
+ movq mm6,mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0,mm5 ; mm0=(40 50 41 51)
+ punpckhwd mm6,mm5 ; mm6=(42 52 43 53)
+
+ movq mm1,mm7 ; transpose coefficients(phase 2)
+ punpckldq mm7,mm3 ; mm7=(00 10 20 30)
+ punpckhdq mm1,mm3 ; mm1=(01 11 21 31)
+ movq mm5,mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2,mm4 ; mm2=(02 12 22 32)
+ punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
+
+ movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
+ movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+ movq mm7,mm0 ; transpose coefficients(phase 2)
+ punpckldq mm0,mm3 ; mm0=(40 50 60 70)
+ punpckhdq mm7,mm3 ; mm7=(41 51 61 71)
+ movq mm1,mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6,mm4 ; mm6=(42 52 62 72)
+ punpckhdq mm1,mm4 ; mm1=(43 53 63 73)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block: next 4 columns
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr: next 4 columns
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr: next 4 workspace rows
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; reload original ebp to reach the arguments
+ lea esi, [workspace] ; JCOEF * wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on
+ mov ecx, DCTSIZE/4 ; ctr (each iteration produces 4 output rows)
+ alignx 16,7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movq mm4,mm1 ; mm1=in2=z2
+ movq mm5,mm1
+ punpcklwd mm4,mm3 ; mm3=in6=z3
+ punpckhwd mm5,mm3
+ movq mm1,mm4
+ movq mm3,mm5
+ pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6,mm0
+ paddw mm0,mm2 ; mm0=in0+in4
+ psubw mm6,mm2 ; mm6=in0-in4
+
+ pxor mm7,mm7
+ pxor mm2,mm2
+ punpcklwd mm7,mm0 ; mm7=tmp0L
+ punpckhwd mm2,mm0 ; mm2=tmp0H
+ psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0,mm7
+ paddd mm7,mm4 ; mm7=tmp10L
+ psubd mm0,mm4 ; mm0=tmp13L
+ movq mm4,mm2
+ paddd mm2,mm5 ; mm2=tmp10H
+ psubd mm4,mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5,mm5
+ pxor mm7,mm7
+ punpcklwd mm5,mm6 ; mm5=tmp1L
+ punpckhwd mm7,mm6 ; mm7=tmp1H
+ psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2,mm5
+ paddd mm5,mm1 ; mm5=tmp11L
+ psubd mm2,mm1 ; mm2=tmp12L
+ movq mm0,mm7
+ paddd mm7,mm3 ; mm7=tmp11H
+ psubd mm0,mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm5,mm6
+ movq mm7,mm4
+ paddw mm5,mm3 ; mm5=z3
+ paddw mm7,mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2,mm5
+ movq mm0,mm5
+ punpcklwd mm2,mm7
+ punpckhwd mm0,mm7
+ movq mm5,mm2
+ movq mm7,mm0
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2,mm3
+ movq mm0,mm3
+ punpcklwd mm2,mm4
+ punpckhwd mm0,mm4
+ movq mm3,mm2
+ movq mm4,mm0
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3,mm5 ; mm3=tmp3L
+ paddd mm4,mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2,mm1
+ movq mm0,mm1
+ punpcklwd mm2,mm6
+ punpckhwd mm0,mm6
+ movq mm1,mm2
+ movq mm6,mm0
+ pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2,mm5 ; mm2=tmp1L
+ paddd mm0,mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2,mm5
+ movq mm0,mm7
+ paddd mm5,mm3 ; mm5=data0L
+ paddd mm7,mm4 ; mm7=data0H
+ psubd mm2,mm3 ; mm2=data7L
+ psubd mm0,mm4 ; mm0=data7H
+
+ movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
+
+ paddd mm5,mm3
+ paddd mm7,mm3
+ psrad mm5,DESCALE_P2
+ psrad mm7,DESCALE_P2
+ paddd mm2,mm3
+ paddd mm0,mm3
+ psrad mm2,DESCALE_P2
+ psrad mm0,DESCALE_P2
+
+ packssdw mm5,mm7 ; mm5=data0=(00 10 20 30)
+ packssdw mm2,mm0 ; mm2=data7=(07 17 27 37)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7,mm4
+ movq mm0,mm3
+ paddd mm4,mm1 ; mm4=data1L
+ paddd mm3,mm6 ; mm3=data1H
+ psubd mm7,mm1 ; mm7=data6L
+ psubd mm0,mm6 ; mm0=data6H
+
+ movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
+
+ paddd mm4,mm1
+ paddd mm3,mm1
+ psrad mm4,DESCALE_P2
+ psrad mm3,DESCALE_P2
+ paddd mm7,mm1
+ paddd mm0,mm1
+ psrad mm7,DESCALE_P2
+ psrad mm0,DESCALE_P2
+
+ packssdw mm4,mm3 ; mm4=data1=(01 11 21 31)
+ packssdw mm7,mm0 ; mm7=data6=(06 16 26 36)
+
+ packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36)
+ packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37)
+
+ movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
+ movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
+ movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
+ movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
+ movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
+
+ movq mm7,mm6
+ movq mm2,mm1
+ paddd mm6,mm3 ; mm6=data2L
+ paddd mm1,mm0 ; mm1=data2H
+ psubd mm7,mm3 ; mm7=data5L
+ psubd mm2,mm0 ; mm2=data5H
+
+ movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
+
+ paddd mm6,mm5
+ paddd mm1,mm5
+ psrad mm6,DESCALE_P2
+ psrad mm1,DESCALE_P2
+ paddd mm7,mm5
+ paddd mm2,mm5
+ psrad mm7,DESCALE_P2
+ psrad mm2,DESCALE_P2
+
+ packssdw mm6,mm1 ; mm6=data2=(02 12 22 32)
+ packssdw mm7,mm2 ; mm7=data5=(05 15 25 35)
+
+ movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
+ movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
+ movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
+ movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
+
+ movq mm1,mm4
+ movq mm2,mm3
+ paddd mm4,mm0 ; mm4=data3L
+ paddd mm3,mm5 ; mm3=data3H
+ psubd mm1,mm0 ; mm1=data4L
+ psubd mm2,mm5 ; mm2=data4H
+
+ movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
+
+ paddd mm4,mm0
+ paddd mm3,mm0
+ psrad mm4,DESCALE_P2
+ psrad mm3,DESCALE_P2
+ paddd mm1,mm0
+ paddd mm2,mm0
+ psrad mm1,DESCALE_P2
+ psrad mm2,DESCALE_P2
+
+ movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
+
+ packssdw mm4,mm3 ; mm4=data3=(03 13 23 33)
+ packssdw mm1,mm2 ; mm1=data4=(04 14 24 34)
+
+ movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
+ movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
+
+ packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34)
+ packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0,mm5
+ paddb mm3,mm5
+ paddb mm6,mm5
+ paddb mm4,mm5
+
+ movq mm2,mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm1,mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33)
+ punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35)
+
+ movq mm7,mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33)
+ movq mm5,mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17)
+ punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37)
+
+ movq mm3,mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17)
+ movq mm4,mm7 ; transpose coefficients(phase 3)
+ punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address (ebx is reused as a row pointer)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; row pointers 0 and 1 of this group of 4
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 ; 8 samples at output_col, (00 .. 07)
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 ; 8 samples at output_col, (10 .. 17)
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; row pointers 2 and 3 of this group of 4
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 ; 8 samples at output_col, (20 .. 27)
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 ; 8 samples at output_col, (30 .. 37)
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW ; next 4 output row pointers
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp ; restore the caller's ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jimmxred.asm b/simd/jimmxred.asm
new file mode 100644
index 0000000..a2b7103
--- /dev/null
+++ b/simd/jimmxred.asm
@@ -0,0 +1,706 @@
+;
+; jimmxred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the F_x_xxx fixed-point multipliers
+%define PASS1_BITS 2 ; extra fraction bits carried in pass-1 output
+; Right-shift counts applied after pass 1 (P1) / pass 2 (P2) of the 4x4 (_4) and 2x2 (_2) IDCTs.
+%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
+; FIX(x) = x * 2^CONST_BITS, rounded to the nearest integer.
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the literals below are the constants pre-scaled by 2^30.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; x / 2^n, rounded to nearest
+F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant tables used by the 4x4 and 2x2 reduced-size IDCT routines in this file.
+ alignz 16
+ global EXTN(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+; PW_*: an (a,b) word pair repeated twice = one MMWORD, laid out as a pmaddwd operand.
+PW_F184_MF076 times 2 dw F_1_847,-F_0_765
+PW_F256_F089 times 2 dw F_2_562, F_0_899
+PW_F106_MF217 times 2 dw F_1_061,-F_2_172
+PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509
+PW_F145_MF021 times 2 dw F_1_451,-F_0_211
+PW_F362_MF127 times 2 dw F_3_624,-F_1_272
+PW_F085_MF072 times 2 dw F_0_850,-F_0_720
+PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1) ; rounding bias: half of 2^DESCALE_P1_4
+PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1) ; rounding bias: half of 2^DESCALE_P2_4
+PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1) ; rounding bias: half of 2^DESCALE_P1_2
+PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1) ; rounding bias: half of 2^DESCALE_P2_2
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; per-byte offset added (paddb) to the final samples
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; Coefficient row 4 is never read; pass 1 handles four columns per loop iteration.
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores 4 samples into each of output_buf[0..3], starting at sample index output_col.
+; Argument offsets below are relative to b = address of the saved-ebp stack slot.
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+; Locals are addressed downward from the aligned ebp: wk[] first, then workspace.
+%define original_ebp ebp+0 ; slot holding the pre-alignment frame address
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2 ; number of mmword scratch slots
+%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 16
+ global EXTN(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; room for the saved frame address
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp],eax ; [original_ebp] = pre-alignment frame address
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [workspace] ; reserve wk[] and workspace below ebp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+ ; eax is expected to still hold the frame address from the prologue (no reload).
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF * wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+ ; OR rows 1,2,3,5,6,7 of these four columns together (row 4 is never used)
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm0,mm1
+ packsswb mm0,mm0 ; saturating pack: a nonzero word stays a nonzero byte
+ movd eax,mm0
+ test eax,eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ ; mm0 = dequantized DC terms; scale up by PASS1_BITS to match the full-DCT output
+ psllw mm0,PASS1_BITS
+
+ movq mm2,mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
+
+ movq mm1,mm0
+ punpckldq mm0,mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
+ movq mm3,mm2
+ punpckldq mm2,mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
+ ; every entry of a workspace row is that column's scaled DC value
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ ; interleave (in1,in3) and (in5,in7) so each pmaddwd yields a two-term sum
+ movq mm4,mm0
+ movq mm5,mm0
+ punpcklwd mm4,mm1
+ punpckhwd mm5,mm1
+ movq mm0,mm4
+ movq mm1,mm5
+ pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6,mm2
+ movq mm7,mm2
+ punpcklwd mm6,mm3
+ punpckhwd mm7,mm3
+ movq mm2,mm6
+ movq mm3,mm7
+ pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6,mm4 ; mm6=tmp2L
+ paddd mm7,mm5 ; mm7=tmp2H
+ paddd mm2,mm0 ; mm2=tmp0L
+ paddd mm3,mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ ; in0 goes into the high word of each dword; psrad then leaves in0 << (CONST_BITS+1)
+ pxor mm1,mm1
+ pxor mm2,mm2
+ punpcklwd mm1,mm4 ; mm1=tmp0L
+ punpckhwd mm2,mm4 ; mm2=tmp0H
+ psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+ movq mm3,mm5 ; mm5=in2=z2
+ punpcklwd mm5,mm0 ; mm0=in6=z3
+ punpckhwd mm3,mm0
+ pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4,mm1
+ movq mm0,mm2
+ paddd mm1,mm5 ; mm1=tmp10L
+ paddd mm2,mm3 ; mm2=tmp10H
+ psubd mm4,mm5 ; mm4=tmp12L
+ psubd mm0,mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5,mm1
+ movq mm3,mm2
+ paddd mm1,mm6 ; mm1=data0L
+ paddd mm2,mm7 ; mm2=data0H
+ psubd mm5,mm6 ; mm5=data3L
+ psubd mm3,mm7 ; mm3=data3H
+ ; add the rounding bias, then shift right arithmetically by DESCALE_P1_4
+ movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
+
+ paddd mm1,mm6
+ paddd mm2,mm6
+ psrad mm1,DESCALE_P1_4
+ psrad mm2,DESCALE_P1_4
+ paddd mm5,mm6
+ paddd mm3,mm6
+ psrad mm5,DESCALE_P1_4
+ psrad mm3,DESCALE_P1_4
+
+ packssdw mm1,mm2 ; mm1=data0=(00 01 02 03)
+ packssdw mm5,mm3 ; mm5=data3=(30 31 32 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2,mm4
+ movq mm3,mm0
+ paddd mm4,mm7 ; mm4=data1L
+ paddd mm0,mm6 ; mm0=data1H
+ psubd mm2,mm7 ; mm2=data2L
+ psubd mm3,mm6 ; mm3=data2H
+
+ movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
+
+ paddd mm4,mm7
+ paddd mm0,mm7
+ psrad mm4,DESCALE_P1_4
+ psrad mm0,DESCALE_P1_4
+ paddd mm2,mm7
+ paddd mm3,mm7
+ psrad mm2,DESCALE_P1_4
+ psrad mm3,DESCALE_P1_4
+
+ packssdw mm4,mm0 ; mm4=data1=(10 11 12 13)
+ packssdw mm2,mm3 ; mm2=data2=(20 21 22 23)
+ ; transpose so that each workspace row holds the four results of one column
+ movq mm6,mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1,mm4 ; mm1=(00 10 01 11)
+ punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
+ movq mm7,mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2,mm5 ; mm2=(20 30 21 31)
+ punpckhwd mm7,mm5 ; mm7=(22 32 23 33)
+
+ movq mm0,mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1,mm2 ; mm1=(00 10 20 30)
+ punpckhdq mm0,mm2 ; mm0=(01 11 21 31)
+ movq mm3,mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6,mm7 ; mm6=(02 12 22 32)
+ punpckhdq mm3,mm7 ; mm3=(03 13 23 33)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; reload frame address (pass 1 may clobber eax)
+ lea esi, [workspace] ; JCOEF * wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col
+ ; Inputs are pass-1 results from the workspace, so no dequantization here.
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4,mm0
+ movq mm5,mm0
+ punpcklwd mm4,mm1
+ punpckhwd mm5,mm1
+ movq mm0,mm4
+ movq mm1,mm5
+ pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6,mm2
+ movq mm7,mm2
+ punpcklwd mm6,mm3
+ punpckhwd mm7,mm3
+ movq mm2,mm6
+ movq mm3,mm7
+ pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6,mm4 ; mm6=tmp2L
+ paddd mm7,mm5 ; mm7=tmp2H
+ paddd mm2,mm0 ; mm2=tmp0L
+ paddd mm3,mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ pxor mm1,mm1
+ pxor mm2,mm2
+ punpcklwd mm1,mm4 ; mm1=tmp0L
+ punpckhwd mm2,mm4 ; mm2=tmp0H
+ psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+ movq mm3,mm5 ; mm5=in2=z2
+ punpcklwd mm5,mm0 ; mm0=in6=z3
+ punpckhwd mm3,mm0
+ pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4,mm1
+ movq mm0,mm2
+ paddd mm1,mm5 ; mm1=tmp10L
+ paddd mm2,mm3 ; mm2=tmp10H
+ psubd mm4,mm5 ; mm4=tmp12L
+ psubd mm0,mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5,mm1
+ movq mm3,mm2
+ paddd mm1,mm6 ; mm1=data0L
+ paddd mm2,mm7 ; mm2=data0H
+ psubd mm5,mm6 ; mm5=data3L
+ psubd mm3,mm7 ; mm3=data3H
+ ; add the rounding bias, then shift right arithmetically by DESCALE_P2_4
+ movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
+
+ paddd mm1,mm6
+ paddd mm2,mm6
+ psrad mm1,DESCALE_P2_4
+ psrad mm2,DESCALE_P2_4
+ paddd mm5,mm6
+ paddd mm3,mm6
+ psrad mm5,DESCALE_P2_4
+ psrad mm3,DESCALE_P2_4
+
+ packssdw mm1,mm2 ; mm1=data0=(00 10 20 30)
+ packssdw mm5,mm3 ; mm5=data3=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2,mm4
+ movq mm3,mm0
+ paddd mm4,mm7 ; mm4=data1L
+ paddd mm0,mm6 ; mm0=data1H
+ psubd mm2,mm7 ; mm2=data2L
+ psubd mm3,mm6 ; mm3=data2H
+
+ movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
+
+ paddd mm4,mm7
+ paddd mm0,mm7
+ psrad mm4,DESCALE_P2_4
+ psrad mm0,DESCALE_P2_4
+ paddd mm2,mm7
+ paddd mm3,mm7
+ psrad mm2,DESCALE_P2_4
+ psrad mm3,DESCALE_P2_4
+
+ packssdw mm4,mm0 ; mm4=data1=(01 11 21 31)
+ packssdw mm2,mm3 ; mm2=data2=(02 12 22 32)
+ ; saturate to signed bytes, then add CENTERJSAMPLE to every byte
+ movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32)
+ packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33)
+ paddb mm1,mm6
+ paddb mm4,mm6
+
+ movq mm7,mm1 ; transpose coefficients(phase 1)
+ punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33)
+
+ movq mm0,mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13)
+ punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33)
+ ; low dwords of mm1/mm0 hold output rows 0 and 2
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+ psrlq mm1,4*BYTE_BIT ; mm1=(10 11 12 13 -- -- -- --)
+ psrlq mm0,4*BYTE_BIT ; mm0=(30 31 32 33 -- -- -- --)
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Only coefficient rows and columns 0,1,3,5,7 are used (see the diagrams below).
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores 2 samples into each of output_buf[0..1], starting at sample index output_col.
+; Argument offsets below are relative to ebp after the standard push ebp / mov ebp,esp.
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+ align 16
+ global EXTN(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+ push ebp
+ mov ebp,esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+ ; Results stay in MMX registers; this routine uses no stack workspace.
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+ ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+ pcmpeqd mm7,mm7 ; mm7 = all ones
+ pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+ ; columns 0 and 1: pair rows (1,3) and (5,7) as pmaddwd operands
+ movq mm4,mm0 ; mm4=(10 11 ** 13)
+ movq mm5,mm2 ; mm5=(50 51 ** 53)
+ punpcklwd mm4,mm1 ; mm4=(10 30 11 31)
+ punpcklwd mm5,mm3 ; mm5=(50 70 51 71)
+ pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
+ ; columns 1 and 3: form the same row pairs from the upper word of each dword
+ psrld mm0,WORD_BIT ; mm0=(11 -- 13 --)
+ pand mm1,mm7 ; mm1=(-- 31 -- 33)
+ psrld mm2,WORD_BIT ; mm2=(51 -- 53 --)
+ pand mm3,mm7 ; mm3=(-- 71 -- 73)
+ por mm0,mm1 ; mm0=(11 31 13 33)
+ por mm2,mm3 ; mm2=(51 71 53 73)
+ pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm4,mm5 ; mm4=tmp0[col0 col1]
+ ; columns 5 and 7: same steps on the second MMWORD of each row
+ movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+ pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+ pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+ ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+ psrld mm6,WORD_BIT ; mm6=(15 -- 17 --)
+ pand mm1,mm7 ; mm1=(-- 35 -- 37)
+ psrld mm3,WORD_BIT ; mm3=(55 -- 57 --)
+ pand mm5,mm7 ; mm5=(-- 75 -- 77)
+ por mm6,mm1 ; mm6=(15 35 17 37)
+ por mm3,mm5 ; mm3=(55 75 57 77)
+ pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm0,mm2 ; mm0=tmp0[col1 col3]
+ paddd mm6,mm3 ; mm6=tmp0[col5 col7]
+
+ ; -- Even part
+ ; tmp10 = in0 << (CONST_BITS+2), formed in the high word of a dword and shifted down
+ movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+ movq mm2,mm1 ; mm2=(00 01 ** 03)
+ pslld mm1,WORD_BIT ; mm1=(-- 00 -- **)
+ psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
+
+ pand mm2,mm7 ; mm2=(-- 01 -- 03)
+ pand mm5,mm7 ; mm5=(-- 05 -- 07)
+ psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
+ psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
+
+ ; -- Final output stage
+
+ movq mm3,mm1
+ paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **)
+ psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **)
+ punpckldq mm1,mm3 ; mm1=(A0 B0)
+
+ movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
+
+ movq mm4,mm2
+ movq mm3,mm5
+ paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3)
+ paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7)
+ psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3)
+ psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7)
+ ; round and descale: mm1 holds column 0 (A0 B0), the others hold two columns each
+ paddd mm1,mm7
+ psrad mm1,DESCALE_P1_2
+
+ paddd mm2,mm7
+ paddd mm5,mm7
+ psrad mm2,DESCALE_P1_2
+ psrad mm5,DESCALE_P1_2
+ paddd mm4,mm7
+ paddd mm3,mm7
+ psrad mm4,DESCALE_P1_2
+ psrad mm3,DESCALE_P1_2
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)]
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+ ; packing to words lines up (A1,A3)/(B1,B3) and (A5,A7)/(B5,B7) as pmaddwd pairs
+ packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3)
+ packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7)
+ pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm2,mm5 ; mm2=tmp0[row0 row1]
+
+ ; -- Even part
+
+ pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1]
+
+ ; -- Final output stage
+
+ movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
+
+ movq mm6,mm1
+ paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1)
+ psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1)
+
+ paddd mm1,mm0
+ paddd mm6,mm0
+ psrad mm1,DESCALE_P2_2
+ psrad mm6,DESCALE_P2_2
+
+ movq mm7,mm1 ; transpose coefficients
+ punpckldq mm1,mm6 ; mm1=(C0 D0)
+ punpckhdq mm7,mm6 ; mm7=(C1 D1)
+
+ packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1)
+ packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+ paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
+ ; The GOT pointer in ebx is not needed past this point, so ebx is reused as scratch.
+ movd ecx,mm1
+ movd ebx,mm1 ; ebx=(C0 D0 C1 D1)
+ shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov WORD [edx+eax*SIZEOF_JSAMPLE], bx ; output_buf[0] <- (C0 D0)
+ mov WORD [esi+eax*SIZEOF_JSAMPLE], cx ; output_buf[1] <- (C1 D1)
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2flt-64.asm b/simd/jiss2flt-64.asm
new file mode 100644
index 0000000..0e8522d
--- /dev/null
+++ b/simd/jiss2flt-64.asm
@@ -0,0 +1,483 @@
+;
+; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1,%2,0x44 ; 0x44: low two lanes of the destination, then low two lanes of the source
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1,%2,0xEE ; 0xEE: high two lanes of the destination, then high two lanes of the source
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+; Each PD_* entry is one single-precision value replicated into all four XMM lanes.
+PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2)
+PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8)
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; 16 bytes of CENTERJSAMPLE (one XMMWORD)
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp rbp+0
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 16
+ global EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push rbp
+ mov rax,rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],eax
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [workspace]
+ push rbx
+ collect_args
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+ lea rdi, [workspace] ; FAST_FLOAT * wsptr
+ mov rcx, DCTSIZE/4 ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1,xmm2
+ por xmm3,xmm4
+ por xmm5,xmm6
+ por xmm1,xmm3
+ por xmm5,xmm7
+ por xmm1,xmm5
+ packsswb xmm1,xmm1
+ movd eax,xmm1
+ test rax,rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1,xmm0
+ movaps xmm2,xmm0
+ movaps xmm3,xmm0
+
+ shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4,xmm0
+ movaps xmm5,xmm1
+ subps xmm0,xmm2 ; xmm0=tmp11
+ subps xmm1,xmm3
+ addps xmm4,xmm2 ; xmm4=tmp10
+ addps xmm5,xmm3 ; xmm5=tmp13
+
+ mulps xmm1,[rel PD_1_414]
+ subps xmm1,xmm5 ; xmm1=tmp12
+
+ movaps xmm6,xmm4
+ movaps xmm7,xmm0
+ subps xmm4,xmm5 ; xmm4=tmp3
+ subps xmm0,xmm1 ; xmm0=tmp2
+ addps xmm6,xmm5 ; xmm6=tmp0
+ addps xmm7,xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4,xmm2
+ movaps xmm0,xmm5
+ addps xmm2,xmm1 ; xmm2=z11
+ addps xmm5,xmm3 ; xmm5=z13
+ subps xmm4,xmm1 ; xmm4=z12
+ subps xmm0,xmm3 ; xmm0=z10
+
+ movaps xmm1,xmm2
+ subps xmm2,xmm5
+ addps xmm1,xmm5 ; xmm1=tmp7
+
+ mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3,xmm0
+ addps xmm0,xmm4
+ mulps xmm0,[rel PD_1_847] ; xmm0=z5
+ mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3,xmm0 ; xmm3=tmp12
+ subps xmm4,xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3,xmm1 ; xmm3=tmp6
+ movaps xmm5,xmm6
+ movaps xmm0,xmm7
+ addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2,xmm3 ; xmm2=tmp5
+
+ movaps xmm1,xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3,xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4,xmm2 ; xmm4=tmp4
+ movaps xmm0,xmm7
+ movaps xmm3,xmm5
+ addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2,xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4,xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3,xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0,xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6,xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3,xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add rsi, byte 4*SIZEOF_JCOEF ; coef_block
+ add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec rcx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ lea rsi, [workspace] ; FAST_FLOAT * wsptr
+ mov rdi, r12 ; (JSAMPROW *)
+ mov rax, r13
+ mov rcx, DCTSIZE/4 ; ctr
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4,xmm0
+ movaps xmm5,xmm1
+ subps xmm0,xmm2 ; xmm0=tmp11
+ subps xmm1,xmm3
+ addps xmm4,xmm2 ; xmm4=tmp10
+ addps xmm5,xmm3 ; xmm5=tmp13
+
+ mulps xmm1,[rel PD_1_414]
+ subps xmm1,xmm5 ; xmm1=tmp12
+
+ movaps xmm6,xmm4
+ movaps xmm7,xmm0
+ subps xmm4,xmm5 ; xmm4=tmp3
+ subps xmm0,xmm1 ; xmm0=tmp2
+ addps xmm6,xmm5 ; xmm6=tmp0
+ addps xmm7,xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4,xmm2
+ movaps xmm0,xmm5
+ addps xmm2,xmm1 ; xmm2=z11
+ addps xmm5,xmm3 ; xmm5=z13
+ subps xmm4,xmm1 ; xmm4=z12
+ subps xmm0,xmm3 ; xmm0=z10
+
+ movaps xmm1,xmm2
+ subps xmm2,xmm5
+ addps xmm1,xmm5 ; xmm1=tmp7
+
+ mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3,xmm0
+ addps xmm0,xmm4
+ mulps xmm0,[rel PD_1_847] ; xmm0=z5
+ mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3,xmm0 ; xmm3=tmp12
+ subps xmm4,xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3,xmm1 ; xmm3=tmp6
+ movaps xmm5,xmm6
+ movaps xmm0,xmm7
+ addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2,xmm3 ; xmm2=tmp5
+
+ movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm3,xmm3
+ psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4,xmm2 ; xmm4=tmp4
+ movaps xmm7,xmm1
+ movaps xmm5,xmm3
+ addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm4,xmm4
+ psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6,xmm2
+ paddb xmm1,xmm2
+
+ movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+ mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+ add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add rdi, byte 4*SIZEOF_JSAMPROW
+ dec rcx ; ctr
+ jnz near .rowloop
+
+ uncollect_args
+ pop rbx
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2flt.asm b/simd/jiss2flt.asm
new file mode 100644
index 0000000..17bc363
--- /dev/null
+++ b/simd/jiss2flt.asm
@@ -0,0 +1,498 @@
+;
+; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1,%2,0x44 ; 0x44: floats 0,1 of %1, then floats 0,1 of %2
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1,%2,0xEE ; 0xEE: floats 2,3 of %1, then floats 2,3 of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2)
+PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8)
+PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8)+cos(3*pi/8))
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) = 1.5*2^26, where one float ulp is 8
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; byte bias added to the packed output samples
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1: columns -> FAST_FLOAT workspace. Pass 2: rows -> output samples.
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Arguments are read from the caller's stack frame via the offsets below.
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 16
+ global EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push ebp
+ mov eax,esp ; eax = address of the saved (original) ebp
+ sub esp, byte 4 ; room for one pointer below the saved ebp
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; stash that address at [original_ebp]
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [workspace] ; esp = lowest address of the wk[]/workspace area
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp] ; (skipped: relies on eax surviving from the prologue)
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT * wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1,xmm2
+ por xmm3,xmm4
+ por xmm5,xmm6
+ por xmm1,xmm3
+ por xmm5,xmm7
+ por xmm1,xmm5
+ packsswb xmm1,xmm1 ; saturating pack: any nonzero word stays nonzero
+ movd eax,xmm1
+ test eax,eax
+ jnz short .columnDCT ; at least one of the OR-ed terms is nonzero
+
+ ; -- AC terms all zero
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1,xmm0
+ movaps xmm2,xmm0
+ movaps xmm3,xmm0
+
+ shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4,xmm0
+ movaps xmm5,xmm1
+ subps xmm0,xmm2 ; xmm0=tmp11
+ subps xmm1,xmm3 ; xmm1=in2-in6
+ addps xmm4,xmm2 ; xmm4=tmp10
+ addps xmm5,xmm3 ; xmm5=tmp13
+
+ mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; xmm1=(in2-in6)*1.414213562
+ subps xmm1,xmm5 ; xmm1=tmp12
+
+ movaps xmm6,xmm4
+ movaps xmm7,xmm0
+ subps xmm4,xmm5 ; xmm4=tmp3
+ subps xmm0,xmm1 ; xmm0=tmp2
+ addps xmm6,xmm5 ; xmm6=tmp0
+ addps xmm7,xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4,xmm2
+ movaps xmm0,xmm5
+ addps xmm2,xmm1 ; xmm2=z11
+ addps xmm5,xmm3 ; xmm5=z13
+ subps xmm4,xmm1 ; xmm4=z12
+ subps xmm0,xmm3 ; xmm0=z10
+
+ movaps xmm1,xmm2
+ subps xmm2,xmm5 ; xmm2=z11-z13
+ addps xmm1,xmm5 ; xmm1=tmp7
+
+ mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3,xmm0
+ addps xmm0,xmm4 ; xmm0=z10+z12
+ mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3,xmm0 ; xmm3=tmp12
+ subps xmm4,xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3,xmm1 ; xmm3=tmp6
+ movaps xmm5,xmm6
+ movaps xmm0,xmm7
+ addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2,xmm3 ; xmm2=tmp5
+
+ movaps xmm1,xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3,xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4,xmm2 ; xmm4=tmp4
+ movaps xmm0,xmm7
+ movaps xmm3,xmm5
+ addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2,xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4,xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3,xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0,xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6,xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3,xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT * wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16,7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4,xmm0
+ movaps xmm5,xmm1
+ subps xmm0,xmm2 ; xmm0=tmp11
+ subps xmm1,xmm3
+ addps xmm4,xmm2 ; xmm4=tmp10
+ addps xmm5,xmm3 ; xmm5=tmp13
+
+ mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; difference scaled by 1.414213562
+ subps xmm1,xmm5 ; xmm1=tmp12
+
+ movaps xmm6,xmm4
+ movaps xmm7,xmm0
+ subps xmm4,xmm5 ; xmm4=tmp3
+ subps xmm0,xmm1 ; xmm0=tmp2
+ addps xmm6,xmm5 ; xmm6=tmp0
+ addps xmm7,xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4,xmm2
+ movaps xmm0,xmm5
+ addps xmm2,xmm1 ; xmm2=z11
+ addps xmm5,xmm3 ; xmm5=z13
+ subps xmm4,xmm1 ; xmm4=z12
+ subps xmm0,xmm3 ; xmm0=z10
+
+ movaps xmm1,xmm2
+ subps xmm2,xmm5 ; xmm2=z11-z13
+ addps xmm1,xmm5 ; xmm1=tmp7
+
+ mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3,xmm0
+ addps xmm0,xmm4 ; xmm0=z10+z12
+ mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3,xmm0 ; xmm3=tmp12
+ subps xmm4,xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3,xmm1 ; xmm3=tmp6
+ movaps xmm5,xmm6
+ movaps xmm0,xmm7
+ addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2,xmm3 ; xmm2=tmp5
+
+ movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm3,xmm3
+ psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4,xmm2 ; xmm4=tmp4
+ movaps xmm7,xmm1
+ movaps xmm5,xmm3
+ addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm4,xmm4
+ psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6,xmm2 ; add CENTERJSAMPLE to every byte
+ paddb xmm1,xmm2 ; add CENTERJSAMPLE to every byte
+
+ movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; samples 00-07 -> row 0
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 ; samples 20-27 -> row 2
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 ; samples 10-17 -> row 1
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 ; samples 30-37 -> row 3
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2fst-64.asm b/simd/jiss2fst-64.asm
new file mode 100644
index 0000000..8b664a6
--- /dev/null
+++ b/simd/jiss2fst-64.asm
@@ -0,0 +1,492 @@
+;
+; jiss2fst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2 ; must equal IFAST_SCALE_BITS (checked below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time FP arithmetic: use 2^30-scaled integers + DESCALE.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 16
+ global EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT ; FIX(1.414213562), pre-shifted for pmulhw
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT ; FIX(1.847759065), pre-shifted for pmulhw
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT ; -(FIX(2.613125930) - FIX(1)), pre-shifted for pmulhw
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT ; FIX(1.082392200), pre-shifted for pmulhw
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp rbp+0
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 16
+ global EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push rbp
+ mov rax,rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],eax
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1,xmm0
+ packsswb xmm1,xmm1
+ packsswb xmm1,xmm1
+ movd eax,xmm1
+ test rax,rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm1
+ psubw xmm0,xmm2 ; xmm0=tmp11
+ psubw xmm1,xmm3
+ paddw xmm4,xmm2 ; xmm4=tmp10
+ paddw xmm5,xmm3 ; xmm5=tmp13
+
+ psllw xmm1,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1,[rel PW_F1414]
+ psubw xmm1,xmm5 ; xmm1=tmp12
+
+ movdqa xmm6,xmm4
+ movdqa xmm7,xmm0
+ psubw xmm4,xmm5 ; xmm4=tmp3
+ psubw xmm0,xmm1 ; xmm0=tmp2
+ paddw xmm6,xmm5 ; xmm6=tmp0
+ paddw xmm7,xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4,xmm2
+ movdqa xmm0,xmm5
+ psubw xmm2,xmm1 ; xmm2=z12
+ psubw xmm5,xmm3 ; xmm5=z10
+ paddw xmm4,xmm1 ; xmm4=z11
+ paddw xmm0,xmm3 ; xmm0=z13
+
+ movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3,xmm4
+ psubw xmm4,xmm0
+ paddw xmm3,xmm0 ; xmm3=tmp7
+
+ psllw xmm4,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0,xmm5
+ paddw xmm5,xmm2
+ pmulhw xmm5,[rel PW_F1847] ; xmm5=z5
+ pmulhw xmm0,[rel PW_MF1613]
+ pmulhw xmm2,[rel PW_F1082]
+ psubw xmm0,xmm1
+ psubw xmm2,xmm5 ; xmm2=tmp10
+ paddw xmm0,xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0,xmm3 ; xmm0=tmp6
+ movdqa xmm1,xmm6
+ movdqa xmm5,xmm7
+ paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4,xmm0 ; xmm4=tmp5
+
+ movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2,xmm4 ; xmm2=tmp4
+ movdqa xmm5,xmm7
+ movdqa xmm0,xmm1
+ paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov rax, r13
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2,xmm6
+ movdqa xmm0,xmm5
+ psubw xmm6,xmm1 ; xmm6=tmp11
+ psubw xmm5,xmm3
+ paddw xmm2,xmm1 ; xmm2=tmp10
+ paddw xmm0,xmm3 ; xmm0=tmp13
+
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5,[rel PW_F1414]
+ psubw xmm5,xmm0 ; xmm5=tmp12
+
+ movdqa xmm1,xmm2
+ movdqa xmm3,xmm6
+ psubw xmm2,xmm0 ; xmm2=tmp3
+ psubw xmm6,xmm5 ; xmm6=tmp2
+ paddw xmm1,xmm0 ; xmm1=tmp0
+ paddw xmm3,xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm4
+ psubw xmm0,xmm7 ; xmm0=z12
+ psubw xmm4,xmm5 ; xmm4=z10
+ paddw xmm2,xmm7 ; xmm2=z11
+ paddw xmm6,xmm5 ; xmm6=z13
+
+ movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4,PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5,xmm2
+ psubw xmm2,xmm6
+ paddw xmm5,xmm6 ; xmm5=tmp7
+
+ psllw xmm2,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6,xmm4
+ paddw xmm4,xmm0
+ pmulhw xmm4,[rel PW_F1847] ; xmm4=z5
+ pmulhw xmm6,[rel PW_MF1613]
+ pmulhw xmm0,[rel PW_F1082]
+ psubw xmm6,xmm7
+ psubw xmm0,xmm4 ; xmm0=tmp10
+ paddw xmm6,xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6,xmm5 ; xmm6=tmp6
+ movdqa xmm7,xmm1
+ movdqa xmm4,xmm3
+ paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1,(PASS1_BITS+3) ; descale
+ psraw xmm3,(PASS1_BITS+3) ; descale
+ psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7,(PASS1_BITS+3) ; descale
+ psraw xmm4,(PASS1_BITS+3) ; descale
+ psubw xmm2,xmm6 ; xmm2=tmp5
+
+ packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0,xmm2 ; xmm0=tmp4
+ movdqa xmm4,xmm5
+ movdqa xmm7,xmm6
+ paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5,(PASS1_BITS+3) ; descale
+ psraw xmm6,(PASS1_BITS+3) ; descale
+ psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4,(PASS1_BITS+3) ; descale
+ psraw xmm7,(PASS1_BITS+3) ; descale
+
+ movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1,xmm2
+ paddb xmm3,xmm2
+ paddb xmm5,xmm2
+ paddb xmm7,xmm2
+
+ movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+ mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+ uncollect_args
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2fst.asm b/simd/jiss2fst.asm
new file mode 100644
index 0000000..b53664d
--- /dev/null
+++ b/simd/jiss2fst.asm
@@ -0,0 +1,502 @@
+;
+; jiss2fst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; fraction bits of the F_* constants below (14 is also OK)
+%define PASS1_BITS 2 ; must equal IFAST_SCALE_BITS (enforced just below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time floating-point arithmetic; literals below are x * 2^30.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n, rounding to nearest
+F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow in the psllw before each pmulhw)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (pmulhw keeps the high word)
+
+%define PRE_MULTIPLY_SCALE_BITS 2 ; left shift applied to the data operand
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ; left shift applied to each F_* constant
+
+ alignz 16
+ global EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT ; 8 words: 1.414213562
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT ; 8 words: 1.847759065
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT ; 8 words: -(2.613125930 - 1)
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT ; 8 words: 1.082392200
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; 16 bytes: bias added to the packed output samples
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0 ; slot at the aligned ebp holding the pre-alignment frame pointer
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2 ; number of xmmword scratch slots below the aligned ebp
+
+ align 16
+ global EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; reserve a dword for the original-ebp slot
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; [original_ebp] = eax
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; esp = &wk[0] (work area sits just below ebp)
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp] ; (skipped: eax is expected to still hold it from the prologue)
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT ; cheap early check before the full AC test below
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1,xmm0 ; xmm1 = bitwise OR of rows 1..7
+ packsswb xmm1,xmm1 ; 8 words -> 8 bytes (saturation keeps nonzero values nonzero)
+ packsswb xmm1,xmm1 ; 8 bytes -> low 4 bytes
+ movd eax,xmm1
+ test eax,eax
+ jnz short .columnDCT ; at least one AC term is nonzero
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm1
+ psubw xmm0,xmm2 ; xmm0=tmp11
+ psubw xmm1,xmm3
+ paddw xmm4,xmm2 ; xmm4=tmp10
+ paddw xmm5,xmm3 ; xmm5=tmp13
+
+ psllw xmm1,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1,[GOTOFF(ebx,PW_F1414)]
+ psubw xmm1,xmm5 ; xmm1=tmp12
+
+ movdqa xmm6,xmm4
+ movdqa xmm7,xmm0
+ psubw xmm4,xmm5 ; xmm4=tmp3
+ psubw xmm0,xmm1 ; xmm0=tmp2
+ paddw xmm6,xmm5 ; xmm6=tmp0
+ paddw xmm7,xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4,xmm2
+ movdqa xmm0,xmm5
+ psubw xmm2,xmm1 ; xmm2=z12
+ psubw xmm5,xmm3 ; xmm5=z10
+ paddw xmm4,xmm1 ; xmm4=z11
+ paddw xmm0,xmm3 ; xmm0=z13
+
+ movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3,xmm4
+ psubw xmm4,xmm0
+ paddw xmm3,xmm0 ; xmm3=tmp7
+
+ psllw xmm4,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0,xmm5
+ paddw xmm5,xmm2
+ pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5
+ pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm2,[GOTOFF(ebx,PW_F1082)]
+ psubw xmm0,xmm1
+ psubw xmm2,xmm5 ; xmm2=tmp10
+ paddw xmm0,xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0,xmm3 ; xmm0=tmp6
+ movdqa xmm1,xmm6
+ movdqa xmm5,xmm7
+ paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4,xmm0 ; xmm4=tmp5
+
+ movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2,xmm4 ; xmm2=tmp4
+ movdqa xmm5,xmm7
+ movdqa xmm0,xmm1
+ paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2,xmm6
+ movdqa xmm0,xmm5
+ psubw xmm6,xmm1 ; xmm6=tmp11
+ psubw xmm5,xmm3
+ paddw xmm2,xmm1 ; xmm2=tmp10
+ paddw xmm0,xmm3 ; xmm0=tmp13
+
+ psllw xmm5,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5,[GOTOFF(ebx,PW_F1414)]
+ psubw xmm5,xmm0 ; xmm5=tmp12
+
+ movdqa xmm1,xmm2
+ movdqa xmm3,xmm6
+ psubw xmm2,xmm0 ; xmm2=tmp3
+ psubw xmm6,xmm5 ; xmm6=tmp2
+ paddw xmm1,xmm0 ; xmm1=tmp0
+ paddw xmm3,xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm4
+ psubw xmm0,xmm7 ; xmm0=z12
+ psubw xmm4,xmm5 ; xmm4=z10
+ paddw xmm2,xmm7 ; xmm2=z11
+ paddw xmm6,xmm5 ; xmm6=z13
+
+ movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0,PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4,PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5,xmm2
+ psubw xmm2,xmm6
+ paddw xmm5,xmm6 ; xmm5=tmp7
+
+ psllw xmm2,PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6,xmm4
+ paddw xmm4,xmm0
+ pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5
+ pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm0,[GOTOFF(ebx,PW_F1082)]
+ psubw xmm6,xmm7
+ psubw xmm0,xmm4 ; xmm0=tmp10
+ paddw xmm6,xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6,xmm5 ; xmm6=tmp6
+ movdqa xmm7,xmm1
+ movdqa xmm4,xmm3
+ paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1,(PASS1_BITS+3) ; descale
+ psraw xmm3,(PASS1_BITS+3) ; descale
+ psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7,(PASS1_BITS+3) ; descale
+ psraw xmm4,(PASS1_BITS+3) ; descale
+ psubw xmm2,xmm6 ; xmm2=tmp5
+
+ packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0,xmm2 ; xmm0=tmp4
+ movdqa xmm4,xmm5
+ movdqa xmm7,xmm6
+ paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5,(PASS1_BITS+3) ; descale
+ psraw xmm6,(PASS1_BITS+3) ; descale
+ psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4,(PASS1_BITS+3) ; descale
+ psraw xmm7,(PASS1_BITS+3) ; descale
+
+ movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1,xmm2 ; add CENTERJSAMPLE to every packed sample byte
+ paddb xmm3,xmm2
+ paddb xmm5,xmm2
+ paddb xmm7,xmm2
+
+ movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 ; output row 0
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ; output row 2
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ; output row 4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 ; output row 6
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 ; output row 1
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 ; output row 3
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; output row 5
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 ; output row 7
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2int-64.asm b/simd/jiss2int-64.asm
new file mode 100644
index 0000000..82da0a7
--- /dev/null
+++ b/simd/jiss2int-64.asm
@@ -0,0 +1,848 @@
+;
+; jiss2int-64.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the FIX() constants below (value * 2^13)
+%define PASS1_BITS 2 ; extra fraction bits kept in pass 1 results and removed in pass 2
+
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; right-shift count applied to pass 1 results
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) ; right-shift count applied to pass 2 results
+; FIX(x) is x scaled by 2^CONST_BITS and rounded; the common case is precomputed.
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is given pre-scaled by 2^30 and shifted down to CONST_BITS.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n with round-to-nearest
+F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+; Constant tables read by jsimd_idct_islow_sse2 via [rel ...] operands.
+ alignz 16
+ global EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+; Each PW_* row is one (c0,c1) word pair repeated 4 times: pmaddwd on interleaved (a,b) words yields a*c0 + b*c1.
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 ; even part: tmp3 = z2*(0.541+0.765) + z3*0.541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) ; even part: tmp2 = z2*0.541 + z3*(0.541-1.847)
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 ; odd part: z3 = z3*(1.175-1.961) + z4*1.175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) ; odd part: z4 = z3*1.175 + z4*(1.175-0.390)
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 ; odd part: tmp0 = tmp0*(0.298-0.899) + tmp3*-0.899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) ; odd part: tmp3 = tmp0*-0.899 + tmp3*(1.501-0.899)
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 ; odd part: tmp1 = tmp1*(2.053-2.562) + tmp2*-2.562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) ; odd part: tmp2 = tmp1*-2.562 + tmp2*(3.072-2.562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; rounding bias added before psrad DESCALE_P1 (pass 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; rounding bias added before psrad DESCALE_P2 (pass 2)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; added to every packed output byte (paddb) before the stores
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp rbp+0 ; slot holding the pre-alignment rsp (which points at the saved rbp)
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 12
+; Frame: [rbp] = pre-alignment rsp; wk(0)..wk(WK_NUM-1) = aligned xmmword scratch directly below rbp.
+ align 16
+ global EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push rbp
+ mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
+ sub rsp, byte 8 ; reserve a full 8-byte slot: the value stored below is a 64-bit pointer
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; [original_rbp] <- pre-alignment rsp, restored by "pop rsp" on exit
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; move rsp below the wk[] scratch area
+ collect_args
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr (dct_table: the multipliers applied with pmullw)
+ mov rsi, r11 ; inptr (coef_block)
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; quick pre-check on one dword from each of blocks 1 and 2
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] ; (presumably the leading coefficients of rows 1 and 2 - confirm against DWBLOCK)
+ jnz near .columnDCT ; something nonzero already: skip the full scan, do the real IDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] ; OR blocks 1..7 (in1..in7) together,
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] ; using two accumulators
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1,xmm0 ; xmm1 = OR of in1..in7
+ packsswb xmm1,xmm1 ; signed saturation keeps every nonzero word nonzero ...
+ packsswb xmm1,xmm1 ; ... so two packs fold all 8 words into the low 32 bits
+ movd eax,xmm1 ; movd to eax also clears the upper half of rax
+ test rax,rax
+ jnz short .columnDCT ; at least one of in1..in7 is nonzero
+
+ ; -- AC terms all zero
+ ; Only in0 contributes, so each column of the pass 1 result is a single repeated value.
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize in0
+
+ psllw xmm5,PASS1_BITS ; scale up by PASS1_BITS
+
+ movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+ ; Odd columns go to wk(8..11); even columns stay in xmm7/xmm1/xmm0/xmm2, as pass 2 expects.
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4,xmm1 ; xmm1=in2=z2
+ movdqa xmm5,xmm1
+ punpcklwd xmm4,xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5,xmm3
+ movdqa xmm1,xmm4
+ movdqa xmm3,xmm5
+ pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L
+ pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H
+
+ movdqa xmm6,xmm0
+ paddw xmm0,xmm2 ; xmm0=in0+in4
+ psubw xmm6,xmm2 ; xmm6=in0-in4
+
+ pxor xmm7,xmm7
+ pxor xmm2,xmm2
+ punpcklwd xmm7,xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2,xmm0 ; xmm2=tmp0H
+ psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0,xmm7
+ paddd xmm7,xmm4 ; xmm7=tmp10L
+ psubd xmm0,xmm4 ; xmm0=tmp13L
+ movdqa xmm4,xmm2
+ paddd xmm2,xmm5 ; xmm2=tmp10H
+ psubd xmm4,xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5,xmm5
+ pxor xmm7,xmm7
+ punpcklwd xmm5,xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7,xmm6 ; xmm7=tmp1H
+ psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2,xmm5
+ paddd xmm5,xmm1 ; xmm5=tmp11L
+ psubd xmm2,xmm1 ; xmm2=tmp12L
+ movdqa xmm0,xmm7
+ paddd xmm7,xmm3 ; xmm7=tmp11H
+ psubd xmm0,xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5,xmm6
+ movdqa xmm7,xmm4
+ paddw xmm5,xmm3 ; xmm5=z3
+ paddw xmm7,xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2,xmm5
+ movdqa xmm0,xmm5
+ punpcklwd xmm2,xmm7
+ punpckhwd xmm0,xmm7
+ movdqa xmm5,xmm2
+ movdqa xmm7,xmm0
+ pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L
+ pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H
+ pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2,xmm3
+ movdqa xmm0,xmm3
+ punpcklwd xmm2,xmm4
+ punpckhwd xmm0,xmm4
+ movdqa xmm3,xmm2
+ movdqa xmm4,xmm0
+ pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L
+ pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H
+ pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L
+ pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3,xmm5 ; xmm3=tmp3L
+ paddd xmm4,xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2,xmm1
+ movdqa xmm0,xmm1
+ punpcklwd xmm2,xmm6
+ punpckhwd xmm0,xmm6
+ movdqa xmm1,xmm2
+ movdqa xmm6,xmm0
+ pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L
+ pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H
+ pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L
+ pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm2,xmm5 ; xmm2=tmp1L
+ paddd xmm0,xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2,xmm5
+ movdqa xmm0,xmm7
+ paddd xmm5,xmm3 ; xmm5=data0L
+ paddd xmm7,xmm4 ; xmm7=data0H
+ psubd xmm2,xmm3 ; xmm2=data7L
+ psubd xmm0,xmm4 ; xmm0=data7H
+
+ movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
+
+ paddd xmm5,xmm3
+ paddd xmm7,xmm3
+ psrad xmm5,DESCALE_P1
+ psrad xmm7,DESCALE_P1
+ paddd xmm2,xmm3
+ paddd xmm0,xmm3
+ psrad xmm2,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+
+ packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7,xmm4
+ movdqa xmm0,xmm3
+ paddd xmm4,xmm1 ; xmm4=data1L
+ paddd xmm3,xmm6 ; xmm3=data1H
+ psubd xmm7,xmm1 ; xmm7=data6L
+ psubd xmm0,xmm6 ; xmm0=data6H
+
+ movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
+
+ paddd xmm4,xmm1
+ paddd xmm3,xmm1
+ psrad xmm4,DESCALE_P1
+ psrad xmm3,DESCALE_P1
+ paddd xmm7,xmm1
+ paddd xmm0,xmm1
+ psrad xmm7,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+
+ packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5,xmm3
+ movdqa xmm6,xmm0
+ paddd xmm3,xmm4 ; xmm3=data2L
+ paddd xmm0,xmm2 ; xmm0=data2H
+ psubd xmm5,xmm4 ; xmm5=data5L
+ psubd xmm6,xmm2 ; xmm6=data5H
+
+ movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
+
+ paddd xmm3,xmm7
+ paddd xmm0,xmm7
+ psrad xmm3,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+ paddd xmm5,xmm7
+ paddd xmm6,xmm7
+ psrad xmm5,DESCALE_P1
+ psrad xmm6,DESCALE_P1
+
+ packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0,xmm1
+ movdqa xmm6,xmm4
+ paddd xmm1,xmm2 ; xmm1=data3L
+ paddd xmm4,xmm7 ; xmm4=data3H
+ psubd xmm0,xmm2 ; xmm0=data4L
+ psubd xmm6,xmm7 ; xmm6=data4H
+
+ movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
+
+ paddd xmm1,xmm2
+ paddd xmm4,xmm2
+ psrad xmm1,DESCALE_P1
+ psrad xmm4,DESCALE_P1
+ paddd xmm0,xmm2
+ paddd xmm6,xmm2
+ psrad xmm0,DESCALE_P1
+ psrad xmm6,DESCALE_P1
+
+ packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ ; output_col is a 32-bit JDIMENSION whose upper register half is undefined: read only r13d.
+ mov rdi, r12 ; rdi = output_buf (JSAMPROW *)
+ mov eax, r13d ; rax = output_col, zero-extended (indexes the output stores)
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6,xmm1 ; xmm1=in2=z2
+ movdqa xmm5,xmm1
+ punpcklwd xmm6,xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5,xmm2
+ movdqa xmm1,xmm6
+ movdqa xmm2,xmm5
+ pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L
+ pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H
+
+ movdqa xmm3,xmm7
+ paddw xmm7,xmm0 ; xmm7=in0+in4
+ psubw xmm3,xmm0 ; xmm3=in0-in4
+
+ pxor xmm4,xmm4
+ pxor xmm0,xmm0
+ punpcklwd xmm4,xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0,xmm7 ; xmm0=tmp0H
+ psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7,xmm4
+ paddd xmm4,xmm6 ; xmm4=tmp10L
+ psubd xmm7,xmm6 ; xmm7=tmp13L
+ movdqa xmm6,xmm0
+ paddd xmm0,xmm5 ; xmm0=tmp10H
+ psubd xmm6,xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5,xmm5
+ pxor xmm4,xmm4
+ punpcklwd xmm5,xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4,xmm3 ; xmm4=tmp1H
+ psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0,xmm5
+ paddd xmm5,xmm1 ; xmm5=tmp11L
+ psubd xmm0,xmm1 ; xmm0=tmp12L
+ movdqa xmm7,xmm4
+ paddd xmm4,xmm2 ; xmm4=tmp11H
+ psubd xmm7,xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5,xmm6
+ movdqa xmm4,xmm3
+ paddw xmm5,xmm1 ; xmm5=z3
+ paddw xmm4,xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0,xmm5
+ movdqa xmm7,xmm5
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm7,xmm4
+ movdqa xmm5,xmm0
+ movdqa xmm4,xmm7
+ pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L
+ pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H
+ pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0,xmm1
+ movdqa xmm7,xmm1
+ punpcklwd xmm0,xmm3
+ punpckhwd xmm7,xmm3
+ movdqa xmm1,xmm0
+ movdqa xmm3,xmm7
+ pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L
+ pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H
+ pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L
+ pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1,xmm5 ; xmm1=tmp3L
+ paddd xmm3,xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0,xmm2
+ movdqa xmm7,xmm2
+ punpcklwd xmm0,xmm6
+ punpckhwd xmm7,xmm6
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm7
+ pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L
+ pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H
+ pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L
+ pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm0,xmm5 ; xmm0=tmp1L
+ paddd xmm7,xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0,xmm5
+ movdqa xmm7,xmm4
+ paddd xmm5,xmm1 ; xmm5=data0L
+ paddd xmm4,xmm3 ; xmm4=data0H
+ psubd xmm0,xmm1 ; xmm0=data7L
+ psubd xmm7,xmm3 ; xmm7=data7H
+
+ movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
+
+ paddd xmm5,xmm1
+ paddd xmm4,xmm1
+ psrad xmm5,DESCALE_P2
+ psrad xmm4,DESCALE_P2
+ paddd xmm0,xmm1
+ paddd xmm7,xmm1
+ psrad xmm0,DESCALE_P2
+ psrad xmm7,DESCALE_P2
+
+ packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4,xmm3
+ movdqa xmm7,xmm1
+ paddd xmm3,xmm2 ; xmm3=data1L
+ paddd xmm1,xmm6 ; xmm1=data1H
+ psubd xmm4,xmm2 ; xmm4=data6L
+ psubd xmm7,xmm6 ; xmm7=data6H
+
+ movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
+
+ paddd xmm3,xmm2
+ paddd xmm1,xmm2
+ psrad xmm3,DESCALE_P2
+ psrad xmm1,DESCALE_P2
+ paddd xmm4,xmm2
+ paddd xmm7,xmm2
+ psrad xmm4,DESCALE_P2
+ psrad xmm7,DESCALE_P2
+
+ packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4,xmm6
+ movdqa xmm0,xmm2
+ paddd xmm6,xmm1 ; xmm6=data2L
+ paddd xmm2,xmm7 ; xmm2=data2H
+ psubd xmm4,xmm1 ; xmm4=data5L
+ psubd xmm0,xmm7 ; xmm0=data5H
+
+ movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
+
+ paddd xmm6,xmm5
+ paddd xmm2,xmm5
+ psrad xmm6,DESCALE_P2
+ psrad xmm2,DESCALE_P2
+ paddd xmm4,xmm5
+ paddd xmm0,xmm5
+ psrad xmm4,DESCALE_P2
+ psrad xmm0,DESCALE_P2
+
+ packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2,xmm3
+ movdqa xmm0,xmm1
+ paddd xmm3,xmm7 ; xmm3=data3L
+ paddd xmm1,xmm5 ; xmm1=data3H
+ psubd xmm2,xmm7 ; xmm2=data4L
+ psubd xmm0,xmm5 ; xmm0=data4H
+
+ movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
+
+ paddd xmm3,xmm7
+ paddd xmm1,xmm7
+ psrad xmm3,DESCALE_P2
+ psrad xmm1,DESCALE_P2
+ paddd xmm2,xmm7
+ paddd xmm0,xmm7
+ psrad xmm2,DESCALE_P2
+ psrad xmm0,DESCALE_P2
+
+ movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
+
+ packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7,xmm5
+ paddb xmm1,xmm5
+ paddb xmm6,xmm5
+ paddb xmm3,xmm5
+
+ movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; rdi = output_buf, rax = output_col
+ mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7 ; row 0 = (00 .. 07)
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 ; row 2 = (20 .. 27)
+ mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 ; row 4 = (40 .. 47)
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 ; row 6 = (60 .. 67)
+ ; Odd rows come from the pshufd copies, whose low halves hold rows 1/3/5/7.
+ mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 ; row 1 = (10 .. 17)
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 ; row 3 = (30 .. 37)
+ mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 ; row 5 = (50 .. 57)
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 ; row 7 = (70 .. 77)
+
+ uncollect_args
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- value saved at [rbp] by the prologue (points at the saved rbp)
+ pop rbp ; restore the caller's rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2int.asm b/simd/jiss2int.asm
new file mode 100644
index 0000000..adf39fb
--- /dev/null
+++ b/simd/jiss2int.asm
@@ -0,0 +1,859 @@
+;
+; jiss2int.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13 ; fraction bits of the F_x_xxx fixed-point constants
+%define PASS1_BITS 2 ; extra fraction bits carried from pass 1 into pass 2
+
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; right-shift count that ends pass 1
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) ; right-shift count that ends pass 2 (see jidctint.c for the +3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and reduced to CONST_BITS fraction bits by DESCALE.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; shift right by n, rounding to nearest
+F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 ; even part: pmaddwd on (z2,z3) pairs -> tmp3
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) ; even part: pmaddwd on (z2,z3) pairs -> tmp2
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 ; odd part: pmaddwd on (z3,z4) pairs -> z3
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) ; odd part: pmaddwd on (z3,z4) pairs -> z4
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 ; odd part: pmaddwd on (tmp0,tmp3) pairs -> tmp0
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) ; odd part: pmaddwd on (tmp0,tmp3) pairs -> tmp3
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 ; odd part: pmaddwd on (tmp1,tmp2) pairs -> tmp1
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) ; odd part: pmaddwd on (tmp1,tmp2) pairs -> tmp2
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; rounding bias added before the pass 1 shift
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; rounding bias added before the pass 2 shift
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; per-byte bias added to the packed output samples
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0 ; slot where the prologue stores the unaligned frame pointer
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 12 ; number of xmmword scratch slots below the aligned ebp
+
+ align 16
+ global EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push ebp
+ mov eax,esp ; eax = original ebp (stack address of the saved caller ebp)
+ sub esp, byte 4 ; leave room for one dword before aligning
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; [original_ebp] = original ebp
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below the aligned ebp
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp] ; left disabled: the code below uses eax as set in the prologue
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap pre-test: one dword from block row 1
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ... ORed with one dword from block row 2
+ jnz near .columnDCT ; nonzero AC term found: run the full column pass
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; OR together input rows 1..7
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1,xmm0 ; xmm1 = OR of rows 1..7
+ packsswb xmm1,xmm1 ; 8 words -> 8 bytes; saturation keeps nonzero values nonzero
+ packsswb xmm1,xmm1 ; 8 bytes -> 4 bytes in the low dword
+ movd eax,xmm1 ; eax = summary of all eight columns
+ test eax,eax
+ jnz short .columnDCT ; some AC term is nonzero
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0
+
+ psllw xmm5,PASS1_BITS ; scale like a pass 1 result
+
+ movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end ; even cols stay in xmm7/xmm1/xmm0/xmm2, as pass 2 expects
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4,xmm1 ; xmm1=in2=z2
+ movdqa xmm5,xmm1
+ punpcklwd xmm4,xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5,xmm3
+ movdqa xmm1,xmm4
+ movdqa xmm3,xmm5
+ pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
+
+ movdqa xmm6,xmm0
+ paddw xmm0,xmm2 ; xmm0=in0+in4
+ psubw xmm6,xmm2 ; xmm6=in0-in4
+
+ pxor xmm7,xmm7
+ pxor xmm2,xmm2
+ punpcklwd xmm7,xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2,xmm0 ; xmm2=tmp0H
+ psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0,xmm7
+ paddd xmm7,xmm4 ; xmm7=tmp10L
+ psubd xmm0,xmm4 ; xmm0=tmp13L
+ movdqa xmm4,xmm2
+ paddd xmm2,xmm5 ; xmm2=tmp10H
+ psubd xmm4,xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5,xmm5
+ pxor xmm7,xmm7
+ punpcklwd xmm5,xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7,xmm6 ; xmm7=tmp1H
+ psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2,xmm5
+ paddd xmm5,xmm1 ; xmm5=tmp11L
+ psubd xmm2,xmm1 ; xmm2=tmp12L
+ movdqa xmm0,xmm7
+ paddd xmm7,xmm3 ; xmm7=tmp11H
+ psubd xmm0,xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5,xmm6
+ movdqa xmm7,xmm4
+ paddw xmm5,xmm3 ; xmm5=z3
+ paddw xmm7,xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2,xmm5
+ movdqa xmm0,xmm5
+ punpcklwd xmm2,xmm7
+ punpckhwd xmm0,xmm7
+ movdqa xmm5,xmm2
+ movdqa xmm7,xmm0
+ pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
+ pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2,xmm3
+ movdqa xmm0,xmm3
+ punpcklwd xmm2,xmm4
+ punpckhwd xmm0,xmm4
+ movdqa xmm3,xmm2
+ movdqa xmm4,xmm0
+ pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
+ pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
+ pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
+ pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3,xmm5 ; xmm3=tmp3L
+ paddd xmm4,xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2,xmm1
+ movdqa xmm0,xmm1
+ punpcklwd xmm2,xmm6
+ punpckhwd xmm0,xmm6
+ movdqa xmm1,xmm2
+ movdqa xmm6,xmm0
+ pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
+ pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
+ pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
+ pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm2,xmm5 ; xmm2=tmp1L
+ paddd xmm0,xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2,xmm5
+ movdqa xmm0,xmm7
+ paddd xmm5,xmm3 ; xmm5=data0L
+ paddd xmm7,xmm4 ; xmm7=data0H
+ psubd xmm2,xmm3 ; xmm2=data7L
+ psubd xmm0,xmm4 ; xmm0=data7H
+
+ movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
+
+ paddd xmm5,xmm3
+ paddd xmm7,xmm3
+ psrad xmm5,DESCALE_P1
+ psrad xmm7,DESCALE_P1
+ paddd xmm2,xmm3
+ paddd xmm0,xmm3
+ psrad xmm2,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+
+ packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7,xmm4
+ movdqa xmm0,xmm3
+ paddd xmm4,xmm1 ; xmm4=data1L
+ paddd xmm3,xmm6 ; xmm3=data1H
+ psubd xmm7,xmm1 ; xmm7=data6L
+ psubd xmm0,xmm6 ; xmm0=data6H
+
+ movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
+
+ paddd xmm4,xmm1
+ paddd xmm3,xmm1
+ psrad xmm4,DESCALE_P1
+ psrad xmm3,DESCALE_P1
+ paddd xmm7,xmm1
+ paddd xmm0,xmm1
+ psrad xmm7,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+
+ packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5,xmm3
+ movdqa xmm6,xmm0
+ paddd xmm3,xmm4 ; xmm3=data2L
+ paddd xmm0,xmm2 ; xmm0=data2H
+ psubd xmm5,xmm4 ; xmm5=data5L
+ psubd xmm6,xmm2 ; xmm6=data5H
+
+ movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
+
+ paddd xmm3,xmm7
+ paddd xmm0,xmm7
+ psrad xmm3,DESCALE_P1
+ psrad xmm0,DESCALE_P1
+ paddd xmm5,xmm7
+ paddd xmm6,xmm7
+ psrad xmm5,DESCALE_P1
+ psrad xmm6,DESCALE_P1
+
+ packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0,xmm1
+ movdqa xmm6,xmm4
+ paddd xmm1,xmm2 ; xmm1=data3L
+ paddd xmm4,xmm7 ; xmm4=data3H
+ psubd xmm0,xmm2 ; xmm0=data4L
+ psubd xmm6,xmm7 ; xmm6=data4H
+
+ movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
+
+ paddd xmm1,xmm2
+ paddd xmm4,xmm2
+ psrad xmm1,DESCALE_P1
+ psrad xmm4,DESCALE_P1
+ paddd xmm0,xmm2
+ paddd xmm6,xmm2
+ psrad xmm0,DESCALE_P1
+ psrad xmm6,DESCALE_P1
+
+ packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] ; four hints at 32-byte steps,
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] ; starting one block past inptr
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; reload the frame pointer saved by the prologue
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6,xmm1 ; xmm1=in2=z2
+ movdqa xmm5,xmm1
+ punpcklwd xmm6,xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5,xmm2
+ movdqa xmm1,xmm6
+ movdqa xmm2,xmm5
+ pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
+
+ movdqa xmm3,xmm7
+ paddw xmm7,xmm0 ; xmm7=in0+in4
+ psubw xmm3,xmm0 ; xmm3=in0-in4
+
+ pxor xmm4,xmm4
+ pxor xmm0,xmm0
+ punpcklwd xmm4,xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0,xmm7 ; xmm0=tmp0H
+ psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7,xmm4
+ paddd xmm4,xmm6 ; xmm4=tmp10L
+ psubd xmm7,xmm6 ; xmm7=tmp13L
+ movdqa xmm6,xmm0
+ paddd xmm0,xmm5 ; xmm0=tmp10H
+ psubd xmm6,xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5,xmm5
+ pxor xmm4,xmm4
+ punpcklwd xmm5,xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4,xmm3 ; xmm4=tmp1H
+ psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0,xmm5
+ paddd xmm5,xmm1 ; xmm5=tmp11L
+ psubd xmm0,xmm1 ; xmm0=tmp12L
+ movdqa xmm7,xmm4
+ paddd xmm4,xmm2 ; xmm4=tmp11H
+ psubd xmm7,xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5,xmm6
+ movdqa xmm4,xmm3
+ paddw xmm5,xmm1 ; xmm5=z3
+ paddw xmm4,xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0,xmm5
+ movdqa xmm7,xmm5
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm7,xmm4
+ movdqa xmm5,xmm0
+ movdqa xmm4,xmm7
+ pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
+ pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0,xmm1
+ movdqa xmm7,xmm1
+ punpcklwd xmm0,xmm3
+ punpckhwd xmm7,xmm3
+ movdqa xmm1,xmm0
+ movdqa xmm3,xmm7
+ pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
+ pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
+ pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
+ pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1,xmm5 ; xmm1=tmp3L
+ paddd xmm3,xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0,xmm2
+ movdqa xmm7,xmm2
+ punpcklwd xmm0,xmm6
+ punpckhwd xmm7,xmm6
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm7
+ pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
+ pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
+ pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
+ pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm0,xmm5 ; xmm0=tmp1L
+ paddd xmm7,xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0,xmm5
+ movdqa xmm7,xmm4
+ paddd xmm5,xmm1 ; xmm5=data0L
+ paddd xmm4,xmm3 ; xmm4=data0H
+ psubd xmm0,xmm1 ; xmm0=data7L
+ psubd xmm7,xmm3 ; xmm7=data7H
+
+ movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
+
+ paddd xmm5,xmm1
+ paddd xmm4,xmm1
+ psrad xmm5,DESCALE_P2
+ psrad xmm4,DESCALE_P2
+ paddd xmm0,xmm1
+ paddd xmm7,xmm1
+ psrad xmm0,DESCALE_P2
+ psrad xmm7,DESCALE_P2
+
+ packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4,xmm3
+ movdqa xmm7,xmm1
+ paddd xmm3,xmm2 ; xmm3=data1L
+ paddd xmm1,xmm6 ; xmm1=data1H
+ psubd xmm4,xmm2 ; xmm4=data6L
+ psubd xmm7,xmm6 ; xmm7=data6H
+
+ movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
+
+ paddd xmm3,xmm2
+ paddd xmm1,xmm2
+ psrad xmm3,DESCALE_P2
+ psrad xmm1,DESCALE_P2
+ paddd xmm4,xmm2
+ paddd xmm7,xmm2
+ psrad xmm4,DESCALE_P2
+ psrad xmm7,DESCALE_P2
+
+ packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4,xmm6
+ movdqa xmm0,xmm2
+ paddd xmm6,xmm1 ; xmm6=data2L
+ paddd xmm2,xmm7 ; xmm2=data2H
+ psubd xmm4,xmm1 ; xmm4=data5L
+ psubd xmm0,xmm7 ; xmm0=data5H
+
+ movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
+
+ paddd xmm6,xmm5
+ paddd xmm2,xmm5
+ psrad xmm6,DESCALE_P2
+ psrad xmm2,DESCALE_P2
+ paddd xmm4,xmm5
+ paddd xmm0,xmm5
+ psrad xmm4,DESCALE_P2
+ psrad xmm0,DESCALE_P2
+
+ packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2,xmm3
+ movdqa xmm0,xmm1
+ paddd xmm3,xmm7 ; xmm3=data3L
+ paddd xmm1,xmm5 ; xmm1=data3H
+ psubd xmm2,xmm7 ; xmm2=data4L
+ psubd xmm0,xmm5 ; xmm0=data4H
+
+ movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
+
+ paddd xmm3,xmm7
+ paddd xmm1,xmm7
+ psrad xmm3,DESCALE_P2
+ psrad xmm1,DESCALE_P2
+ paddd xmm2,xmm7
+ paddd xmm0,xmm7
+ psrad xmm2,DESCALE_P2
+ psrad xmm0,DESCALE_P2
+
+ movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
+
+ packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7,xmm5
+ paddb xmm1,xmm5
+ paddb xmm6,xmm5
+ paddb xmm3,xmm5
+
+ movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = output_buf[0]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; esi = output_buf[2]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 ; row 0 <- (00 .. 07)
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 ; row 2 <- (20 .. 27)
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; edx = output_buf[4]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; esi = output_buf[6]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ; row 4 <- (40 .. 47)
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ; row 6 <- (60 .. 67)
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; edx = output_buf[1]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; esi = output_buf[3]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; row 1 <- (10 .. 17)
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 ; row 3 <- (30 .. 37)
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; edx = output_buf[5]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] ; esi = output_buf[7]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 ; row 5 <- (50 .. 57)
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 ; row 7 <- (70 .. 77)
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp (drops the wk[] area)
+ pop esp ; esp <- original ebp
+ pop ebp ; restore the caller's ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2red-64.asm b/simd/jiss2red-64.asm
new file mode 100644
index 0000000..85ba941
--- /dev/null
+++ b/simd/jiss2red-64.asm
@@ -0,0 +1,575 @@
+;
+; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+; Shift counts used with psrad in pass 1 (P1) / pass 2 (P2) of the 4x4 (_4) and 2x2 (_2) IDCTs.
+%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and DESCALE() rounds it to CONST_BITS fractional bits.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+; Each PW_* entry is one (a,b) word pair repeated four times, i.e. one xmmword of pmaddwd multipliers.
+PW_F184_MF076 times 4 dw F_1_847,-F_0_765 ; ( 1.847759065, -0.765366865)
+PW_F256_F089 times 4 dw F_2_562, F_0_899 ; ( 2.562915447,  0.899976223)
+PW_F106_MF217 times 4 dw F_1_061,-F_2_172 ; ( 1.061594337, -2.172734803)
+PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 ; (-0.601344887, -0.509795579)
+PW_F145_MF021 times 4 dw F_1_451,-F_0_211 ; ( 1.451774981, -0.211164243)
+PW_F362_MF127 times 4 dw F_3_624,-F_1_272 ; ( 3.624509785, -1.272758580)
+PW_F085_MF072 times 4 dw F_0_850,-F_0_720 ; ( 0.850430095, -0.720959822)
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) ; rounding bias added before psrad DESCALE_P1_4
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) ; rounding bias added before psrad DESCALE_P2_4
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) ; rounding bias added before psrad DESCALE_P1_2
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) ; rounding bias added before psrad DESCALE_P2_2
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; added with paddb to the packed output bytes
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp rbp+0
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 16
+ global EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push rbp
+ mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp],rax ; save all 64 bits: "pop rsp" in the epilogue reloads 8 bytes
+ mov rbp,rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)] ; reserve wk[WK_NUM] below the aligned frame base
+ collect_args
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz short .columnDCT ; quick reject: something nonzero in rows 1/2
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm0,xmm1 ; xmm0 = OR of rows 1,2,3,5,6,7 (row 4 is not used by this IDCT)
+ packsswb xmm0,xmm0
+ packsswb xmm0,xmm0
+ movd eax,xmm0 ; 32-bit write also clears the upper half of rax
+ test rax,rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0,PASS1_BITS
+
+ movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm0
+ punpcklwd xmm4,xmm1
+ punpckhwd xmm5,xmm1
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm5
+ pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L)
+ pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H)
+ pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L)
+ pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H)
+
+ movdqa xmm6,xmm2
+ movdqa xmm7,xmm2
+ punpcklwd xmm6,xmm3
+ punpckhwd xmm7,xmm3
+ movdqa xmm2,xmm6
+ movdqa xmm3,xmm7
+ pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L)
+ pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H)
+ pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L)
+ pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H)
+
+ paddd xmm6,xmm4 ; xmm6=tmp2L
+ paddd xmm7,xmm5 ; xmm7=tmp2H
+ paddd xmm2,xmm0 ; xmm2=tmp0L
+ paddd xmm3,xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ punpcklwd xmm1,xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2,xmm4 ; xmm2=tmp0H
+ psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3,xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5,xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3,xmm0
+ pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L
+ pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H
+
+ movdqa xmm4,xmm1
+ movdqa xmm0,xmm2
+ paddd xmm1,xmm5 ; xmm1=tmp10L
+ paddd xmm2,xmm3 ; xmm2=tmp10H
+ psubd xmm4,xmm5 ; xmm4=tmp12L
+ psubd xmm0,xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5,xmm1
+ movdqa xmm3,xmm2
+ paddd xmm1,xmm6 ; xmm1=data0L
+ paddd xmm2,xmm7 ; xmm2=data0H
+ psubd xmm5,xmm6 ; xmm5=data3L
+ psubd xmm3,xmm7 ; xmm3=data3H
+
+ movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
+
+ paddd xmm1,xmm6
+ paddd xmm2,xmm6
+ psrad xmm1,DESCALE_P1_4
+ psrad xmm2,DESCALE_P1_4
+ paddd xmm5,xmm6
+ paddd xmm3,xmm6
+ psrad xmm5,DESCALE_P1_4
+ psrad xmm3,DESCALE_P1_4
+
+ packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2,xmm4
+ movdqa xmm3,xmm0
+ paddd xmm4,xmm7 ; xmm4=data1L
+ paddd xmm0,xmm6 ; xmm0=data1H
+ psubd xmm2,xmm7 ; xmm2=data2L
+ psubd xmm3,xmm6 ; xmm3=data2H
+
+ movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
+
+ paddd xmm4,xmm7
+ paddd xmm0,xmm7
+ psrad xmm4,DESCALE_P1_4
+ psrad xmm0,DESCALE_P1_4
+ paddd xmm2,xmm7
+ paddd xmm3,xmm7
+ psrad xmm2,DESCALE_P1_4
+ psrad xmm3,DESCALE_P1_4
+
+ packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rax, [original_rbp] ; value unused: rax is overwritten two lines below
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d ; output_col is a 32-bit JDIMENSION: zero-extend, do not rely on r13[63:32]
+
+ ; -- Even part
+
+ pxor xmm4,xmm4
+ punpcklwd xmm4,xmm1 ; xmm4=tmp0
+ psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1,xmm0
+ punpckhwd xmm6,xmm3
+ movdqa xmm5,xmm1
+ movdqa xmm2,xmm6
+ pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2)
+ pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2)
+ pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0)
+ pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0)
+
+ paddd xmm6,xmm1 ; xmm6=tmp2
+ paddd xmm2,xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0,xmm3
+ pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2
+
+ movdqa xmm7,xmm4
+ paddd xmm4,xmm0 ; xmm4=tmp10
+ psubd xmm7,xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
+
+ movdqa xmm5,xmm4
+ movdqa xmm3,xmm7
+ paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4,xmm1
+ paddd xmm7,xmm1
+ psrad xmm4,DESCALE_P2_4
+ psrad xmm7,DESCALE_P2_4
+ paddd xmm5,xmm1
+ paddd xmm3,xmm1
+ psrad xmm5,DESCALE_P2_4
+ psrad xmm3,DESCALE_P2_4
+
+ packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4,[rel PB_CENTERJSAMP]
+
+ pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 ; output row 0, 4 samples
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 ; output row 1, 4 samples
+ mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 ; output row 2, 4 samples
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 ; output row 3, 4 samples
+
+ uncollect_args
+ mov rsp,rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp (the 64-bit value stored in the prologue)
+ pop rbp
+ ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+ align 16
+ global EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push rbp
+ mov rbp,rsp
+ push rbx ; rbx is used below as the pextrw destination
+ collect_args
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7,xmm7 ; all ones
+ pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4,[rel PW_F362_MF127]
+ pmaddwd xmm5,[rel PW_F085_MF072]
+
+ psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0,[rel PW_F362_MF127]
+ pmaddwd xmm2,[rel PW_F085_MF072]
+
+ paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+ ; -- Final output stage
+
+ movdqa xmm3,xmm6
+ movdqa xmm5,xmm1
+ paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
+
+ punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7,xmm1
+ punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6,xmm2
+ psrad xmm6,DESCALE_P1_2
+
+ paddd xmm1,xmm2
+ paddd xmm7,xmm2
+ psrad xmm1,DESCALE_P1_2
+ psrad xmm7,DESCALE_P1_2
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d ; output_col is a 32-bit JDIMENSION: zero-extend, do not rely on r13[63:32]
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1,[rel PW_F362_MF127]
+ pmaddwd xmm7,[rel PW_F085_MF072]
+
+ paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4,xmm6
+ paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6,[rel PD_DESCALE_P2_2]
+ psrad xmm6,DESCALE_P2_2
+
+ packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6,[rel PB_CENTERJSAMP]
+
+ pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx ; output row 0, 2 samples
+ mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx ; output row 1, 2 samples
+
+ uncollect_args
+ pop rbx
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jiss2red.asm b/simd/jiss2red.asm
new file mode 100644
index 0000000..238c61d
--- /dev/null
+++ b/simd/jiss2red.asm
@@ -0,0 +1,594 @@
+;
+; jiss2red.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+; Shift counts used with psrad: DESCALE_P1_4 / DESCALE_P2_4 in pass 1 / pass 2 of the 4x4 IDCT; the _2 pair is presumably the 2x2 counterpart.
+%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and DESCALE() rounds it to CONST_BITS fractional bits.
+%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+; Each PW_* entry is one (a,b) word pair repeated four times, filling one xmmword.
+PW_F184_MF076 times 4 dw F_1_847,-F_0_765 ; ( 1.847759065, -0.765366865)
+PW_F256_F089 times 4 dw F_2_562, F_0_899 ; ( 2.562915447,  0.899976223)
+PW_F106_MF217 times 4 dw F_1_061,-F_2_172 ; ( 1.061594337, -2.172734803)
+PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 ; (-0.601344887, -0.509795579)
+PW_F145_MF021 times 4 dw F_1_451,-F_0_211 ; ( 1.451774981, -0.211164243)
+PW_F362_MF127 times 4 dw F_3_624,-F_1_272 ; ( 3.624509785, -1.272758580)
+PW_F085_MF072 times 4 dw F_0_850,-F_0_720 ; ( 0.850430095, -0.720959822)
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) ; rounding bias added before psrad DESCALE_P1_4
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) ; rounding bias added before psrad DESCALE_P2_4
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) ; half of 1 << DESCALE_P1_2
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) ; half of 1 << DESCALE_P2_2
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; added with paddb to the packed output bytes
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 16
+ global EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4 ; leave room for the saved pointer
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; stored at [original_ebp]; reloaded by "pop esp" in the epilogue
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [wk(0)] ; reserve wk[WK_NUM] below the aligned frame base
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp] ; disabled: eax is used below as set in the prologue
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT ; quick reject: something nonzero in rows 1/2
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm0,xmm1 ; xmm0 = OR of rows 1,2,3,5,6,7 (row 4 is not read by this routine)
+ packsswb xmm0,xmm0
+ packsswb xmm0,xmm0
+ movd eax,xmm0
+ test eax,eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0,PASS1_BITS
+
+ movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm0
+ punpcklwd xmm4,xmm1
+ punpckhwd xmm5,xmm1
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm5
+ pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
+ pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
+ pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
+
+ movdqa xmm6,xmm2
+ movdqa xmm7,xmm2
+ punpcklwd xmm6,xmm3
+ punpckhwd xmm7,xmm3
+ movdqa xmm2,xmm6
+ movdqa xmm3,xmm7
+ pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
+ pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
+ pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
+ pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
+
+ paddd xmm6,xmm4 ; xmm6=tmp2L
+ paddd xmm7,xmm5 ; xmm7=tmp2H
+ paddd xmm2,xmm0 ; xmm2=tmp0L
+ paddd xmm3,xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ punpcklwd xmm1,xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2,xmm4 ; xmm2=tmp0H
+ psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3,xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5,xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3,xmm0
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
+ pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
+
+ movdqa xmm4,xmm1
+ movdqa xmm0,xmm2
+ paddd xmm1,xmm5 ; xmm1=tmp10L
+ paddd xmm2,xmm3 ; xmm2=tmp10H
+ psubd xmm4,xmm5 ; xmm4=tmp12L
+ psubd xmm0,xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5,xmm1
+ movdqa xmm3,xmm2
+ paddd xmm1,xmm6 ; xmm1=data0L
+ paddd xmm2,xmm7 ; xmm2=data0H
+ psubd xmm5,xmm6 ; xmm5=data3L
+ psubd xmm3,xmm7 ; xmm3=data3H
+
+ movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
+
+ paddd xmm1,xmm6
+ paddd xmm2,xmm6
+ psrad xmm1,DESCALE_P1_4
+ psrad xmm2,DESCALE_P1_4
+ paddd xmm5,xmm6
+ paddd xmm3,xmm6
+ psrad xmm5,DESCALE_P1_4
+ psrad xmm3,DESCALE_P1_4
+
+ packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2,xmm4
+ movdqa xmm3,xmm0
+ paddd xmm4,xmm7 ; xmm4=data1L
+ paddd xmm0,xmm6 ; xmm0=data1H
+ psubd xmm2,xmm7 ; xmm2=data2L
+ psubd xmm3,xmm6 ; xmm3=data2H
+
+ movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
+
+ paddd xmm4,xmm7
+ paddd xmm0,xmm7
+ psrad xmm4,DESCALE_P1_4
+ psrad xmm0,DESCALE_P1_4
+ paddd xmm2,xmm7
+ paddd xmm3,xmm7
+ psrad xmm2,DESCALE_P1_4
+ psrad xmm3,DESCALE_P1_4
+
+ packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov eax, [original_ebp] ; reload the pointer saved in the prologue to reach the stack arguments
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on
+
+ ; -- Even part
+
+ pxor xmm4,xmm4
+ punpcklwd xmm4,xmm1 ; xmm4=tmp0
+ psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1,xmm0
+ punpckhwd xmm6,xmm3
+ movdqa xmm5,xmm1
+ movdqa xmm2,xmm6
+ pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
+ pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
+ pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
+
+ paddd xmm6,xmm1 ; xmm6=tmp2
+ paddd xmm2,xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0,xmm3
+ pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
+
+ movdqa xmm7,xmm4
+ paddd xmm4,xmm0 ; xmm4=tmp10
+ psubd xmm7,xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
+
+ movdqa xmm5,xmm4
+ movdqa xmm3,xmm7
+ paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4,xmm1
+ paddd xmm7,xmm1
+ psrad xmm4,DESCALE_P2_4
+ psrad xmm7,DESCALE_P2_4
+ paddd xmm5,xmm1
+ paddd xmm3,xmm1
+ psrad xmm5,DESCALE_P2_4
+ psrad xmm3,DESCALE_P2_4
+
+ packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ; output row 0, 4 samples
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 ; output row 1, 4 samples
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 ; output row 2, 4 samples
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ; output row 3, 4 samples
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Only coefficient rows 0, 1, 3, 5 and 7 are loaded (see the Pass 1 table).
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Arguments are read from the stack relative to ebp via the macros below.
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+ align 16
+ global EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push ebp ; save caller's frame pointer
+ mov ebp,esp ; ebp = frame base; first argument is at ebp+8
+ push ebx ; saved: holds the GOT address, later reused as scratch
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi ; saved: used as inptr, later as an output row pointer
+ push edi ; saved: used as the output_buf pointer
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; row 1
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] ; row 3
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 1 (multiply by dct_table row 1)
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 3
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] ; row 5
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] ; row 7
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 5
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 7
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7,xmm7 ; xmm7=all ones
+ pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] ; each dword = in1*k_even + in3*k_odd (k = word pairs of the constant)
+ pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] ; each dword = in5*k_even + in7*k_odd
+
+ psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] ; same weights as above, applied to the odd columns
+ pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] ; same weights as above, applied to the odd columns
+
+ paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; row 0
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize row 0
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] (sign-extended in0 << (CONST_BITS+2))
+ psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] (sign-extended in0 << (CONST_BITS+2))
+
+ ; -- Final output stage
+
+ movdqa xmm3,xmm6 ; xmm3=tmp10[col0 ...] (copy for the subtraction)
+ movdqa xmm5,xmm1 ; xmm5=tmp10[col1 col3 col5 col7] (copy for the subtraction)
+ paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
+
+ punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7,xmm1 ; xmm7=(A1 A3 A5 A7); the word mask in xmm7 is no longer needed
+ punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6,xmm2 ; add PD_DESCALE_P1_2 (presumably the rounding bias; confirm at its definition)
+ psrad xmm6,DESCALE_P1_2 ; descale (A0 B0)
+
+ paddd xmm1,xmm2 ; same bias for (A1 A3 B1 B3)
+ paddd xmm7,xmm2 ; same bias for (A5 A7 B5 B7)
+ psrad xmm1,DESCALE_P1_2 ; descale (A1 A3 B1 B3)
+ psrad xmm7,DESCALE_P1_2 ; descale (A5 A7 B5 B7)
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] ; esi is unchanged since it was loaded with coef_block
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)] ; eax = output_col
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] ; same weights as pass 1, on inputs 1 and 3
+ pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] ; same weights as pass 1, on inputs 5 and 7
+
+ paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4,xmm6 ; xmm4=tmp10 (copy for the subtraction)
+ paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; add PD_DESCALE_P2_2 (presumably the rounding bias; confirm at its definition)
+ psrad xmm6,DESCALE_P2_2 ; descale
+
+ packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; add the PB_CENTERJSAMP byte constant to every sample (last GOT-relative access)
+
+ pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = output_buf[0]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; esi = output_buf[1]
+ mov WORD [edx+eax*SIZEOF_JSAMPLE], bx ; output_buf[0] at output_col <- (C0 D0)
+ mov WORD [esi+eax*SIZEOF_JSAMPLE], cx ; output_buf[1] at output_col <- (C1 D1)
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx ; restore caller's ebx
+ pop ebp ; restore caller's frame pointer
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jisseflt.asm b/simd/jisseflt.asm
new file mode 100644
index 0000000..d6147c1
--- /dev/null
+++ b/simd/jisseflt.asm
@@ -0,0 +1,572 @@
+;
+; jisseflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1,%2,0x44 ; imm 01_00_01_00b: dwords 0,1 of %1, then dwords 0,1 of %2
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1,%2,0xEE ; imm 11_10_11_10b: dwords 2,3 of %1, then dwords 2,3 of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 16
+ global EXTN(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2) = 2*cos(4*pi/16), replicated to 4 float lanes
+PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(2*pi/16)
+PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(2*pi/16) - cos(6*pi/16))
+PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(2*pi/16) + cos(6*pi/16))
+PD_0_125 times 4 dd 0.125 ; 1/8
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; 8 bytes only: it is loaded with movq into an MMX register
+
+ alignz 16
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+; Two passes: columns -> transposed float workspace, then rows -> output samples.
+
+%define dct_table(b) (b)+8 ; void * dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
+
+%define original_ebp ebp+0 ; slot at the aligned ebp that holds the pre-alignment frame pointer
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2 ; two xmmword spill slots (wk(0) and wk(1) are the only ones used)
+%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 16
+ global EXTN(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+ push ebp
+ mov eax,esp ; eax = original ebp (frame base: [eax]=saved ebp, args at eax+8)
+ sub esp, byte 4 ; make room for the slot that will hold eax
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax ; store original ebp at what becomes [original_ebp]
+ mov ebp,esp ; ebp = aligned ebp
+ lea esp, [workspace] ; esp = &workspace[0]: reserves wk[] and workspace[] below ebp
+ push ebx ; saved: holds the GOT address, temporarily reused in the row loop
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp] ; not needed: eax is assumed to still hold it from the prologue
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT * wsptr
+ mov ecx, DCTSIZE/4 ; ctr (4 columns are processed per iteration)
+ alignx 16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap pre-test on one dword each of blocks 1 and 2
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT ; something nonzero already: do the full DCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; OR together rows 1..7 of the current 4 columns
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1,mm0 ; mm1 = OR of all AC rows
+ packsswb mm1,mm1 ; 4 words -> 4 bytes; signed saturation keeps nonzero words nonzero
+ movd eax,mm1 ; eax = the 4 packed bytes
+ test eax,eax
+ jnz short .columnDCT ; at least one AC term is nonzero
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; row 0 (DC terms) of the 4 columns
+
+ punpckhwd mm1,mm0 ; mm1=(** 02 ** 03)
+ punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
+ psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03)
+ psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **)
+ cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **)
+ movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize: in0 *= dct_table row 0
+
+ movaps xmm1,xmm0
+ movaps xmm2,xmm0
+ movaps xmm3,xmm0
+
+ shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; DC only: every output of a column equals its dequantized in0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn ; skip the full DCT for these 4 columns
+ alignx 16,7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; rows 0,2,4,6 of the current 4 columns
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm4,mm0 ; mm4=(** 02 ** 03)
+ punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm5,mm1 ; mm5=(** 22 ** 23)
+ punpcklwd mm1,mm1 ; mm1=(20 20 21 21)
+
+ psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03)
+ psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **)
+ cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **)
+ psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23)
+ psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21)
+ cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **)
+ cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **)
+
+ punpckhwd mm6,mm2 ; mm6=(** 42 ** 43)
+ punpcklwd mm2,mm2 ; mm2=(40 40 41 41)
+ punpckhwd mm7,mm3 ; mm7=(** 62 ** 63)
+ punpcklwd mm3,mm3 ; mm3=(60 60 61 61)
+
+ psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43)
+ psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41)
+ cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **)
+ cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **)
+ psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63)
+ psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61)
+ cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **)
+ cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **)
+
+ movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03)
+ movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23)
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in0
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in2
+
+ movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43)
+ movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63)
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in4
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in6
+
+ movaps xmm4,xmm0
+ movaps xmm5,xmm1
+ subps xmm0,xmm2 ; xmm0=tmp11
+ subps xmm1,xmm3 ; xmm1=in2-in6
+ addps xmm4,xmm2 ; xmm4=tmp10
+ addps xmm5,xmm3 ; xmm5=tmp13
+
+ mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; xmm1=(in2-in6)*1.414213562
+ subps xmm1,xmm5 ; xmm1=tmp12
+
+ movaps xmm6,xmm4
+ movaps xmm7,xmm0
+ subps xmm4,xmm5 ; xmm4=tmp3
+ subps xmm0,xmm1 ; xmm0=tmp2
+ addps xmm6,xmm5 ; xmm6=tmp0
+ addps xmm7,xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; rows 1,3,5,7 of the current 4 columns
+ movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm6,mm4 ; mm6=(** 12 ** 13)
+ punpcklwd mm4,mm4 ; mm4=(10 10 11 11)
+ punpckhwd mm2,mm0 ; mm2=(** 32 ** 33)
+ punpcklwd mm0,mm0 ; mm0=(30 30 31 31)
+
+ psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13)
+ psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11)
+ cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **)
+ cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **)
+ psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33)
+ psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31)
+ cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **)
+ cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **)
+
+ punpckhwd mm7,mm5 ; mm7=(** 52 ** 53)
+ punpcklwd mm5,mm5 ; mm5=(50 50 51 51)
+ punpckhwd mm3,mm1 ; mm3=(** 72 ** 73)
+ punpcklwd mm1,mm1 ; mm1=(70 70 71 71)
+
+ movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13)
+ movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33)
+
+ psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53)
+ psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51)
+ cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **)
+ cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **)
+ psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73)
+ psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71)
+ cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **)
+ cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in1
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in3
+
+ movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53)
+ movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73)
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in5
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize in7
+
+ movaps xmm4,xmm2
+ movaps xmm0,xmm5
+ addps xmm2,xmm1 ; xmm2=z11
+ addps xmm5,xmm3 ; xmm5=z13
+ subps xmm4,xmm1 ; xmm4=z12
+ subps xmm0,xmm3 ; xmm0=z10
+
+ movaps xmm1,xmm2
+ subps xmm2,xmm5 ; xmm2=z11-z13
+ addps xmm1,xmm5 ; xmm1=tmp7
+
+ mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3,xmm0
+ addps xmm0,xmm4 ; xmm0=z10+z12
+ mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3,xmm0 ; xmm3=tmp12
+ subps xmm4,xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3,xmm1 ; xmm3=tmp6
+ movaps xmm5,xmm6
+ movaps xmm0,xmm7
+ addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2,xmm3 ; xmm2=tmp5
+
+ movaps xmm1,xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3,xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4,xmm2 ; xmm4=tmp4
+ movaps xmm0,xmm7
+ movaps xmm3,xmm5
+ addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2,xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4,xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3,xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0,xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 ; store transposed: outputs 0-3 of each of the 4 columns
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6,xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3,xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 ; store transposed: outputs 4-7 of each of the 4 columns
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] ; the -8 compensates for esi having advanced in the loop (8 coefs, assuming DCTSIZE=8)
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp] ; eax = pre-alignment frame pointer (eax was clobbered in pass 1)
+ lea esi, [workspace] ; FAST_FLOAT * wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)] ; eax = output_col (replaces the frame pointer copy)
+ mov ecx, DCTSIZE/4 ; ctr (4 output rows are produced per iteration)
+ alignx 16,7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; workspace blocks 0,2,4,6
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4,xmm0
+ movaps xmm5,xmm1
+ subps xmm0,xmm2 ; xmm0=tmp11
+ subps xmm1,xmm3 ; xmm1=in2-in6
+ addps xmm4,xmm2 ; xmm4=tmp10
+ addps xmm5,xmm3 ; xmm5=tmp13
+
+ mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; xmm1=(in2-in6)*1.414213562
+ subps xmm1,xmm5 ; xmm1=tmp12
+
+ movaps xmm6,xmm4
+ movaps xmm7,xmm0
+ subps xmm4,xmm5 ; xmm4=tmp3
+ subps xmm0,xmm1 ; xmm0=tmp2
+ addps xmm6,xmm5 ; xmm6=tmp0
+ addps xmm7,xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] ; workspace blocks 1,3,5,7
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4,xmm2
+ movaps xmm0,xmm5
+ addps xmm2,xmm1 ; xmm2=z11
+ addps xmm5,xmm3 ; xmm5=z13
+ subps xmm4,xmm1 ; xmm4=z12
+ subps xmm0,xmm3 ; xmm0=z10
+
+ movaps xmm1,xmm2
+ subps xmm2,xmm5 ; xmm2=z11-z13
+ addps xmm1,xmm5 ; xmm1=tmp7
+
+ mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3,xmm0
+ addps xmm0,xmm4 ; xmm0=z10+z12
+ mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3,xmm0 ; xmm3=tmp12
+ subps xmm4,xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3,xmm1 ; xmm3=tmp6
+ movaps xmm5,xmm6
+ movaps xmm0,xmm7
+ addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2,xmm3 ; xmm2=tmp5
+
+ movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125]
+
+ mulps xmm6,xmm1 ; descale(1/8)
+ mulps xmm7,xmm1 ; descale(1/8)
+ mulps xmm5,xmm1 ; descale(1/8)
+ mulps xmm0,xmm1 ; descale(1/8)
+
+ movhlps xmm3,xmm6 ; xmm3=(20 30 ** **)
+ movhlps xmm1,xmm7 ; xmm1=(21 31 ** **)
+ cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10)
+ cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11)
+ cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30)
+ cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31)
+ packssdw mm0,mm2 ; mm0=data0=(00 10 20 30)
+ packssdw mm1,mm3 ; mm1=data1=(01 11 21 31)
+
+ movhlps xmm6,xmm5 ; xmm6=(27 37 ** **)
+ movhlps xmm7,xmm0 ; xmm7=(26 36 ** **)
+ cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17)
+ cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16)
+ cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37)
+ cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36)
+ packssdw mm4,mm6 ; mm4=data7=(07 17 27 37)
+ packssdw mm5,mm7 ; mm5=data6=(06 16 26 36)
+
+ packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36)
+ packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37)
+
+ movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2
+ movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125]
+
+ addps xmm4,xmm2 ; xmm4=tmp4
+ movaps xmm5,xmm3
+ movaps xmm0,xmm1
+ addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
+ addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34)
+ subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35)
+ subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33)
+
+ mulps xmm3,xmm6 ; descale(1/8)
+ mulps xmm1,xmm6 ; descale(1/8)
+ mulps xmm5,xmm6 ; descale(1/8)
+ mulps xmm0,xmm6 ; descale(1/8)
+
+ movhlps xmm7,xmm3 ; xmm7=(22 32 ** **)
+ movhlps xmm2,xmm1 ; xmm2=(24 34 ** **)
+ cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12)
+ cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14)
+ cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32)
+ cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34)
+ packssdw mm2,mm6 ; mm2=data2=(02 12 22 32)
+ packssdw mm3,mm7 ; mm3=data4=(04 14 24 34)
+
+ movhlps xmm4,xmm5 ; xmm4=(25 35 ** **)
+ movhlps xmm6,xmm0 ; xmm6=(23 33 ** **)
+ cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15)
+ cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13)
+ cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35)
+ cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33)
+ packssdw mm5,mm6 ; mm5=data5=(05 15 25 35)
+ packssdw mm4,mm7 ; mm4=data3=(03 13 23 33)
+
+ movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34)
+ packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0,mm6 ; add CENTERJSAMPLE to every sample byte
+ paddb mm1,mm6
+ paddb mm2,mm6
+ paddb mm4,mm6
+
+ movq mm7,mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37)
+ movq mm3,mm2 ; transpose coefficients(phase 1)
+ punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33)
+ punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35)
+
+ movq mm5,mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm6,mm3 ; transpose coefficients(phase 2)
+ punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17)
+ punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37)
+
+ movq mm1,mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17)
+ movq mm4,mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = first output row pointer of this group
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; ebx = second output row pointer (ebx reused as scratch)
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 ; 8 samples (00..07) at output_col
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 ; 8 samples (10..17) at output_col
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; edx = third output row pointer
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; ebx = fourth output row pointer
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 ; 8 samples (20..27) at output_col
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 ; 8 samples (30..37) at output_col
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW ; advance to the next 4 output row pointers
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx ; restore caller's ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jsimd.h b/simd/jsimd.h
new file mode 100644
index 0000000..c21cf29
--- /dev/null
+++ b/simd/jsimd.h
@@ -0,0 +1,503 @@
+/*
+ * simd/jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ */
+
+/* Bitmask for supported acceleration methods */
+
+#define JSIMD_NONE 0x00
+#define JSIMD_MMX 0x01
+#define JSIMD_3DNOW 0x02
+#define JSIMD_SSE 0x04
+#define JSIMD_SSE2 0x08
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_simd_cpu_support jSiCpuSupport
+#define jsimd_rgb_ycc_convert_mmx jSRGBYCCM
+#define jsimd_extrgb_ycc_convert_mmx jSEXTRGBYCCM
+#define jsimd_extrgbx_ycc_convert_mmx jSEXTRGBXYCCM
+#define jsimd_extbgr_ycc_convert_mmx jSEXTBGRYCCM
+#define jsimd_extbgrx_ycc_convert_mmx jSEXTBGRXYCCM
+#define jsimd_extxbgr_ycc_convert_mmx jSEXTXBGRYCCM
+#define jsimd_extxrgb_ycc_convert_mmx jSEXTXRGBYCCM
+#define jsimd_ycc_rgb_convert_mmx jSYCCRGBM
+#define jsimd_ycc_extrgb_convert_mmx jSYCCEXTRGBM
+#define jsimd_ycc_extrgbx_convert_mmx jSYCCEXTRGBXM
+#define jsimd_ycc_extbgr_convert_mmx jSYCCEXTBGRM
+#define jsimd_ycc_extbgrx_convert_mmx jSYCCEXTBGRXM
+#define jsimd_ycc_extxbgr_convert_mmx jSYCCEXTXBGRM
+#define jsimd_ycc_extxrgb_convert_mmx jSYCCEXTXRGBM
+#define jconst_rgb_ycc_convert_sse2 jSCRGBYCCS2
+#define jsimd_rgb_ycc_convert_sse2 jSRGBYCCS2
+#define jsimd_extrgb_ycc_convert_sse2 jSEXTRGBYCCS2
+#define jsimd_extrgbx_ycc_convert_sse2 jSEXTRGBXYCCS2
+#define jsimd_extbgr_ycc_convert_sse2 jSEXTBGRYCCS2
+#define jsimd_extbgrx_ycc_convert_sse2 jSEXTBGRXYCCS2
+#define jsimd_extxbgr_ycc_convert_sse2 jSEXTXBGRYCCS2
+#define jsimd_extxrgb_ycc_convert_sse2 jSEXTXRGBYCCS2
+#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2
+#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2
+#define jsimd_ycc_extrgb_convert_sse2 jSYCCEXTRGBS2
+#define jsimd_ycc_extrgbx_convert_sse2 jSYCCEXTRGBXS2
+#define jsimd_ycc_extbgr_convert_sse2 jSYCCEXTBGRS2
+#define jsimd_ycc_extbgrx_convert_sse2 jSYCCEXTBGRXS2
+#define jsimd_ycc_extxbgr_convert_sse2 jSYCCEXTXBGRS2
+#define jsimd_ycc_extxrgb_convert_sse2 jSYCCEXTXRGBS2
+#define jsimd_h2v2_downsample_mmx jSDnH2V2M
+#define jsimd_h2v1_downsample_mmx jSDnH2V1M
+#define jsimd_h2v2_downsample_sse2 jSDnH2V2S2
+#define jsimd_h2v1_downsample_sse2 jSDnH2V1S2
+#define jsimd_h2v2_upsample_mmx jSUpH2V2M
+#define jsimd_h2v1_upsample_mmx jSUpH2V1M
+#define jsimd_h2v2_fancy_upsample_mmx jSFUpH2V2M
+#define jsimd_h2v1_fancy_upsample_mmx jSFUpH2V1M
+#define jsimd_h2v2_merged_upsample_mmx jSMUpH2V2M
+#define jsimd_h2v2_extrgb_merged_upsample_mmx jSMUpH2V2EXTRGBM
+#define jsimd_h2v2_extrgbx_merged_upsample_mmx jSMUpH2V2EXTRGBXM
+#define jsimd_h2v2_extbgr_merged_upsample_mmx jSMUpH2V2EXTBGRM
+#define jsimd_h2v2_extbgrx_merged_upsample_mmx jSMUpH2V2EXTBGRXM
+#define jsimd_h2v2_extxbgr_merged_upsample_mmx jSMUpH2V2EXTXBGRM
+#define jsimd_h2v2_extxrgb_merged_upsample_mmx jSMUpH2V2EXTXRGBM
+#define jsimd_h2v1_merged_upsample_mmx jSMUpH2V1M
+#define jsimd_h2v1_extrgb_merged_upsample_mmx jSMUpH2V1EXTRGBM
+#define jsimd_h2v1_extrgbx_merged_upsample_mmx jSMUpH2V1EXTRGBXM
+#define jsimd_h2v1_extbgr_merged_upsample_mmx jSMUpH2V1EXTBGRM
+#define jsimd_h2v1_extbgrx_merged_upsample_mmx jSMUpH2V1EXTBGRXM
+#define jsimd_h2v1_extxbgr_merged_upsample_mmx jSMUpH2V1EXTXBGRM
+#define jsimd_h2v1_extxrgb_merged_upsample_mmx jSMUpH2V1EXTXRGBM
+#define jsimd_h2v2_upsample_sse2 jSUpH2V2S2
+#define jsimd_h2v1_upsample_sse2 jSUpH2V1S2
+#define jconst_fancy_upsample_sse2 jSCFUpS2
+#define jsimd_h2v2_fancy_upsample_sse2 jSFUpH2V2S2
+#define jsimd_h2v1_fancy_upsample_sse2 jSFUpH2V1S2
+#define jconst_merged_upsample_sse2 jSCMUpS2
+#define jsimd_h2v2_merged_upsample_sse2 jSMUpH2V2S2
+#define jsimd_h2v2_extrgb_merged_upsample_sse2 jSMUpH2V2EXTRGBS2
+#define jsimd_h2v2_extrgbx_merged_upsample_sse2 jSMUpH2V2EXTRGBXS2
+#define jsimd_h2v2_extbgr_merged_upsample_sse2 jSMUpH2V2EXTBGRS2
+#define jsimd_h2v2_extbgrx_merged_upsample_sse2 jSMUpH2V2EXTBGRXS2
+#define jsimd_h2v2_extxbgr_merged_upsample_sse2 jSMUpH2V2EXTXBGRS2
+#define jsimd_h2v2_extxrgb_merged_upsample_sse2 jSMUpH2V2EXTXRGBS2
+#define jsimd_h2v1_merged_upsample_sse2 jSMUpH2V1S2
+#define jsimd_h2v1_extrgb_merged_upsample_sse2 jSMUpH2V1EXTRGBS2
+#define jsimd_h2v1_extrgbx_merged_upsample_sse2 jSMUpH2V1EXTRGBXS2
+#define jsimd_h2v1_extbgr_merged_upsample_sse2 jSMUpH2V1EXTBGRS2
+#define jsimd_h2v1_extbgrx_merged_upsample_sse2 jSMUpH2V1EXTBGRXS2
+#define jsimd_h2v1_extxbgr_merged_upsample_sse2 jSMUpH2V1EXTXBGRS2
+#define jsimd_h2v1_extxrgb_merged_upsample_sse2 jSMUpH2V1EXTXRGBS2
+#define jsimd_convsamp_mmx jSConvM
+#define jsimd_convsamp_sse2 jSConvS2
+#define jsimd_convsamp_float_3dnow jSConvF3D
+#define jsimd_convsamp_float_sse jSConvFS
+#define jsimd_convsamp_float_sse2 jSConvFS2
+#define jsimd_fdct_islow_mmx jSFDMIS
+#define jsimd_fdct_ifast_mmx jSFDMIF
+#define jconst_fdct_islow_sse2 jSCFDS2IS
+#define jsimd_fdct_islow_sse2 jSFDS2IS
+#define jconst_fdct_ifast_sse2 jSCFDS2IF
+#define jsimd_fdct_ifast_sse2 jSFDS2IF
+#define jsimd_fdct_float_3dnow jSFD3DF
+#define jconst_fdct_float_sse jSCFDSF
+#define jsimd_fdct_float_sse jSFDSF
+#define jsimd_quantize_mmx jSQuantM
+#define jsimd_quantize_sse2 jSQuantS2
+#define jsimd_quantize_float_3dnow jSQuantF3D
+#define jsimd_quantize_float_sse jSQuantFS
+#define jsimd_quantize_float_sse2 jSQuantFS2
+#define jsimd_idct_2x2_mmx jSIDM22
+#define jsimd_idct_4x4_mmx jSIDM44
+#define jconst_idct_red_sse2 jSCIDS2R
+#define jsimd_idct_2x2_sse2 jSIDS222
+#define jsimd_idct_4x4_sse2 jSIDS244
+#define jsimd_idct_islow_mmx jSIDMIS
+#define jsimd_idct_ifast_mmx jSIDMIF
+#define jconst_idct_islow_sse2 jSCIDS2IS
+#define jsimd_idct_islow_sse2 jSIDS2IS
+#define jconst_idct_ifast_sse2 jSCIDS2IF
+#define jsimd_idct_ifast_sse2 jSIDS2IF
+#define jsimd_idct_float_3dnow jSID3DF
+#define jconst_idct_float_sse jSCIDSF
+#define jsimd_idct_float_sse jSIDSF
+#define jconst_idct_float_sse2 jSCIDS2F
+#define jsimd_idct_float_sse2 jSIDS2F
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+
+/* SIMD Color Space Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+ JPP((JDIMENSION img_width,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows));
+
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+ JPP((JDIMENSION out_width,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows));
+
+/* SIMD Downsample */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+ JPP((JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_mmx
+ JPP((JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(void) jsimd_h2v2_downsample_sse2
+ JPP((JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_sse2
+ JPP((JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+/* SIMD Upsample */
+EXTERN(void) jsimd_h2v2_upsample_mmx
+ JPP((int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_mmx
+ JPP((int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+ JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+ JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+EXTERN(void) jsimd_h2v2_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+ JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+/* SIMD Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ FAST_FLOAT * workspace));
+
+/* SIMD Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+
+extern const int jconst_fdct_islow_sse2[];
+EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data));
+extern const int jconst_fdct_ifast_sse2[];
+EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
+
+EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * data));
+
+/* SIMD Quantization */
+EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
+ DCTELEM * divisors,
+ DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
+ DCTELEM * divisors,
+ DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
+ FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block,
+ FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
+ FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace));
+
+/* SIMD Reduced Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
+/* SIMD Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col));
+
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
new file mode 100644
index 0000000..f5aec18
--- /dev/null
+++ b/simd/jsimd_i386.c
@@ -0,0 +1,956 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+
+/*
+ * PIC builds may not keep constants aligned, so verify it at runtime:
+ * true when the low `order` address bits of ptr are all zero.
+ */
+#define IS_ALIGNED(ptr, order) ((((unsigned)(ptr)) & ((1 << (order)) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED((ptr), 4)) /* 16 byte alignment */
+
+static unsigned int simd_support = ~0;
+
+/*
+ * Probe the CPU once and cache the supported-SIMD bitmask in simd_support.
+ * Calls made after simd_support has left its initial ~0 value return early.
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd (void)
+{
+ char *env = NULL;
+
+ if (simd_support != ~0) /* already initialised by an earlier call */
+ return;
+
+ simd_support = jpeg_simd_cpu_support();
+
+ /* JSIMD_FORCE<x>=1 masks the detected set down to that single bit */
+ env = getenv("JSIMD_FORCEMMX");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support &= JSIMD_MMX;
+ env = getenv("JSIMD_FORCE3DNOW");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support &= JSIMD_3DNOW;
+ env = getenv("JSIMD_FORCESSE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support &= JSIMD_SSE;
+ env = getenv("JSIMD_FORCESSE2");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support &= JSIMD_SSE2;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples, a 4-byte JDIMENSION
+   * and 3- or 4-byte RGB pixels; reject every other configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* MMX needs nothing more; SSE2 also needs its constants aligned. */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples, a 4-byte JDIMENSION
+   * and 3- or 4-byte RGB pixels; reject every other configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* MMX needs nothing more; SSE2 also needs its constants aligned. */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
+{
+ void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ /* Pick the SSE2/MMX kernel pair matching cinfo->in_color_space. */
+ switch(cinfo->in_color_space)
+ {
+ case JCS_EXT_RGB:
+ sse2fct=jsimd_extrgb_ycc_convert_sse2;
+ mmxfct=jsimd_extrgb_ycc_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ sse2fct=jsimd_extrgbx_ycc_convert_sse2;
+ mmxfct=jsimd_extrgbx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ sse2fct=jsimd_extbgr_ycc_convert_sse2;
+ mmxfct=jsimd_extbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ sse2fct=jsimd_extbgrx_ycc_convert_sse2;
+ mmxfct=jsimd_extbgrx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ sse2fct=jsimd_extxbgr_ycc_convert_sse2;
+ mmxfct=jsimd_extxbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ sse2fct=jsimd_extxrgb_ycc_convert_sse2;
+ mmxfct=jsimd_extxrgb_ycc_convert_mmx;
+ break;
+ default: /* every other colour space uses the plain RGB kernels */
+ sse2fct=jsimd_rgb_ycc_convert_sse2;
+ mmxfct=jsimd_rgb_ycc_convert_mmx;
+ break;
+ }
+ /* SSE2 also needs aligned constants; else try MMX; else do nothing. */
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+ sse2fct(cinfo->image_width, input_buf,
+ output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_MMX)
+ mmxfct(cinfo->image_width, input_buf,
+ output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+ void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ /* Pick the SSE2/MMX kernel pair matching cinfo->out_color_space. */
+ switch(cinfo->out_color_space)
+ {
+ case JCS_EXT_RGB:
+ sse2fct=jsimd_ycc_extrgb_convert_sse2;
+ mmxfct=jsimd_ycc_extrgb_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ sse2fct=jsimd_ycc_extrgbx_convert_sse2;
+ mmxfct=jsimd_ycc_extrgbx_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ sse2fct=jsimd_ycc_extbgr_convert_sse2;
+ mmxfct=jsimd_ycc_extbgr_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ sse2fct=jsimd_ycc_extbgrx_convert_sse2;
+ mmxfct=jsimd_ycc_extbgrx_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ sse2fct=jsimd_ycc_extxbgr_convert_sse2;
+ mmxfct=jsimd_ycc_extxbgr_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ sse2fct=jsimd_ycc_extxrgb_convert_sse2;
+ mmxfct=jsimd_ycc_extxrgb_convert_mmx;
+ break;
+ default: /* every other colour space uses the plain RGB kernels */
+ sse2fct=jsimd_ycc_rgb_convert_sse2;
+ mmxfct=jsimd_ycc_rgb_convert_mmx;
+ break;
+ }
+ /* SSE2 also needs aligned constants; else try MMX; else do nothing. */
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+ sse2fct(cinfo->output_width, input_buf,
+ input_row, output_buf, num_rows);
+ else if (simd_support & JSIMD_MMX)
+ mmxfct(cinfo->output_width, input_buf,
+ input_row, output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Both an SSE2 and an MMX implementation exist, so having either
+   * capability bit set is sufficient. */
+  if (simd_support & (JSIMD_SSE2 | JSIMD_MMX))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Both an SSE2 and an MMX implementation exist, so having either
+   * capability bit set is sufficient. */
+  if (simd_support & (JSIMD_SSE2 | JSIMD_MMX))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  void (*kernel)(JDIMENSION, int, JDIMENSION, JDIMENSION,
+                 JSAMPARRAY, JSAMPARRAY);
+  /* Prefer SSE2, fall back to MMX, otherwise leave the output untouched. */
+  if (simd_support & JSIMD_SSE2) kernel = jsimd_h2v2_downsample_sse2;
+  else if (simd_support & JSIMD_MMX) kernel = jsimd_h2v2_downsample_mmx;
+  else return;
+  kernel(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor,
+         compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  void (*kernel)(JDIMENSION, int, JDIMENSION, JDIMENSION,
+                 JSAMPARRAY, JSAMPARRAY);
+  /* Prefer SSE2, fall back to MMX, otherwise leave the output untouched. */
+  if (simd_support & JSIMD_SSE2) kernel = jsimd_h2v1_downsample_sse2;
+  else if (simd_support & JSIMD_MMX) kernel = jsimd_h2v1_downsample_mmx;
+  else return;
+  kernel(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor,
+         compptr->width_in_blocks, input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Both an SSE2 and an MMX implementation exist, so having either
+   * capability bit set is sufficient. */
+  if (simd_support & (JSIMD_SSE2 | JSIMD_MMX))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Both an SSE2 and an MMX implementation exist, so having either
+   * capability bit set is sufficient. */
+  if (simd_support & (JSIMD_SSE2 | JSIMD_MMX))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+{
+  void (*kernel)(int, JDIMENSION, JSAMPARRAY, JSAMPARRAY *);
+  /* Prefer SSE2, fall back to MMX, otherwise leave the output untouched. */
+  if (simd_support & JSIMD_SSE2) kernel = jsimd_h2v2_upsample_sse2;
+  else if (simd_support & JSIMD_MMX) kernel = jsimd_h2v2_upsample_mmx;
+  else return;
+  kernel(cinfo->max_v_samp_factor, cinfo->output_width,
+         input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+{
+  void (*kernel)(int, JDIMENSION, JSAMPARRAY, JSAMPARRAY *);
+  /* Prefer SSE2, fall back to MMX, otherwise leave the output untouched. */
+  if (simd_support & JSIMD_SSE2) kernel = jsimd_h2v1_upsample_sse2;
+  else if (simd_support & JSIMD_MMX) kernel = jsimd_h2v1_upsample_mmx;
+  else return;
+  kernel(cinfo->max_v_samp_factor, cinfo->output_width,
+         input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* MMX needs nothing more; SSE2 also needs its constants aligned. */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* MMX needs nothing more; SSE2 also needs its constants aligned. */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+{
+  /* Prefer SSE2 (needs aligned constants), else MMX; always an h2v2 kernel. */
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+ if ((simd_support & JSIMD_SSE2) && /* SSE2 also needs aligned constants */
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data, output_data_ptr);
+ else if (simd_support & JSIMD_MMX) /* MMX fallback; else nothing is done */
+ jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* MMX needs nothing more; SSE2 also needs its constants aligned. */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  init_simd();
+
+  /* The SIMD kernels only handle 8-bit samples and a 4-byte
+   * JDIMENSION; reject every other build configuration.
+   */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* MMX needs nothing more; SSE2 also needs its constants aligned. */
+  if (simd_support & JSIMD_MMX)
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  return 0;
+}
+
+/*
+ * Dispatch jsimd_h2v2_merged_upsample to a SIMD routine.
+ *
+ * cinfo->out_color_space selects the routine variant.  The SSE2 variant
+ * runs when SSE2 is flagged in simd_support and
+ * jconst_merged_upsample_sse2 passes IS_ALIGNED_SSE; otherwise the MMX
+ * variant runs when MMX is flagged.  If neither applies, nothing is called.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*upsample)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  int use_sse2 = (simd_support & JSIMD_SSE2) &&
+                 IS_ALIGNED_SSE(jconst_merged_upsample_sse2);
+
+  /* No usable instruction set: no routine is called. */
+  if (!use_sse2 && !(simd_support & JSIMD_MMX))
+    return;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      upsample = use_sse2 ? jsimd_h2v2_extrgb_merged_upsample_sse2
+                          : jsimd_h2v2_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      upsample = use_sse2 ? jsimd_h2v2_extrgbx_merged_upsample_sse2
+                          : jsimd_h2v2_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      upsample = use_sse2 ? jsimd_h2v2_extbgr_merged_upsample_sse2
+                          : jsimd_h2v2_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      upsample = use_sse2 ? jsimd_h2v2_extbgrx_merged_upsample_sse2
+                          : jsimd_h2v2_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      upsample = use_sse2 ? jsimd_h2v2_extxbgr_merged_upsample_sse2
+                          : jsimd_h2v2_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      upsample = use_sse2 ? jsimd_h2v2_extxrgb_merged_upsample_sse2
+                          : jsimd_h2v2_extxrgb_merged_upsample_mmx;
+      break;
+    default:
+      upsample = use_sse2 ? jsimd_h2v2_merged_upsample_sse2
+                          : jsimd_h2v2_merged_upsample_mmx;
+      break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Dispatch jsimd_h2v1_merged_upsample to a SIMD routine.
+ *
+ * cinfo->out_color_space selects the routine variant.  The SSE2 variant
+ * runs when SSE2 is flagged in simd_support and
+ * jconst_merged_upsample_sse2 passes IS_ALIGNED_SSE; otherwise the MMX
+ * variant runs when MMX is flagged.  If neither applies, nothing is called.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  void (*upsample)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  int use_sse2 = (simd_support & JSIMD_SSE2) &&
+                 IS_ALIGNED_SSE(jconst_merged_upsample_sse2);
+
+  /* No usable instruction set: no routine is called. */
+  if (!use_sse2 && !(simd_support & JSIMD_MMX))
+    return;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      upsample = use_sse2 ? jsimd_h2v1_extrgb_merged_upsample_sse2
+                          : jsimd_h2v1_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      upsample = use_sse2 ? jsimd_h2v1_extrgbx_merged_upsample_sse2
+                          : jsimd_h2v1_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      upsample = use_sse2 ? jsimd_h2v1_extbgr_merged_upsample_sse2
+                          : jsimd_h2v1_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      upsample = use_sse2 ? jsimd_h2v1_extbgrx_merged_upsample_sse2
+                          : jsimd_h2v1_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      upsample = use_sse2 ? jsimd_h2v1_extxbgr_merged_upsample_sse2
+                          : jsimd_h2v1_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      upsample = use_sse2 ? jsimd_h2v1_extxrgb_merged_upsample_sse2
+                          : jsimd_h2v1_extxrgb_merged_upsample_mmx;
+      break;
+    default:
+      upsample = use_sse2 ? jsimd_h2v1_merged_upsample_sse2
+                          : jsimd_h2v1_merged_upsample_mmx;
+      break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Returns 1 if jsimd_convsamp() has a usable SIMD routine (the build
+ * parameters match and SSE2 or MMX is flagged in simd_support), else 0.
+ */
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & (JSIMD_SSE2 | JSIMD_MMX)) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_convsamp_float() has a usable SIMD routine (the build
+ * parameters match and SSE2, SSE or 3DNow! is flagged in simd_support),
+ * else 0.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return (simd_support & (JSIMD_SSE2 | JSIMD_SSE | JSIMD_3DNOW)) ? 1 : 0;
+}
+
+/*
+ * Dispatch jsimd_convsamp: SSE2 routine if SSE2 is flagged in simd_support,
+ * otherwise the MMX routine if MMX is flagged.
+ */
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_MMX)
+    jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+/*
+ * Dispatch jsimd_convsamp_float to the first flagged instruction set in
+ * the order SSE2, SSE, 3DNow!.
+ */
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+    return;
+  }
+  if (simd_support & JSIMD_SSE) {
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_3DNOW)
+    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+/*
+ * Returns 1 if jsimd_fdct_islow() has a usable SIMD routine: the build
+ * parameters match and either SSE2 (with jconst_fdct_islow_sse2 passing
+ * IS_ALIGNED_SSE) or MMX is flagged in simd_support.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Returns 1 if jsimd_fdct_ifast() has a usable SIMD routine: the build
+ * parameters match and either SSE2 (with jconst_fdct_ifast_sse2 passing
+ * IS_ALIGNED_SSE) or MMX is flagged in simd_support.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Returns 1 if jsimd_fdct_float() has a usable SIMD routine: the build
+ * parameters match and either SSE (with jconst_fdct_float_sse passing
+ * IS_ALIGNED_SSE) or 3DNow! is flagged in simd_support.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE) &&
+          IS_ALIGNED_SSE(jconst_fdct_float_sse)) ||
+         (simd_support & JSIMD_3DNOW);
+}
+
+/*
+ * Dispatch jsimd_fdct_islow: SSE2 routine when SSE2 is flagged and
+ * jconst_fdct_islow_sse2 passes IS_ALIGNED_SSE, otherwise the MMX routine
+ * when MMX is flagged.
+ */
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) {
+    jsimd_fdct_islow_sse2(data);
+    return;
+  }
+
+  if (simd_support & JSIMD_MMX)
+    jsimd_fdct_islow_mmx(data);
+}
+
+/*
+ * Dispatch jsimd_fdct_ifast: SSE2 routine when SSE2 is flagged in
+ * simd_support and jconst_fdct_ifast_sse2 passes IS_ALIGNED_SSE, otherwise
+ * the MMX routine when MMX is flagged.  If neither applies, nothing is
+ * called.
+ */
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+  /* Check the ifast constant table, the same one jsimd_can_fdct_ifast()
+   * checks; the islow table's alignment says nothing about this routine. */
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else if (simd_support & JSIMD_MMX)
+    jsimd_fdct_ifast_mmx(data);
+}
+
+/*
+ * Dispatch jsimd_fdct_float: SSE routine when SSE is flagged and
+ * jconst_fdct_float_sse passes IS_ALIGNED_SSE, otherwise the 3DNow!
+ * routine when 3DNow! is flagged.
+ */
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+  if ((simd_support & JSIMD_SSE) &&
+      IS_ALIGNED_SSE(jconst_fdct_float_sse)) {
+    jsimd_fdct_float_sse(data);
+    return;
+  }
+
+  if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+/*
+ * Returns 1 if jsimd_quantize() has a usable SIMD routine (the build
+ * parameters match and SSE2 or MMX is flagged in simd_support), else 0.
+ */
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & (JSIMD_SSE2 | JSIMD_MMX)) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_quantize_float() has a usable SIMD routine (the build
+ * parameters match and SSE2, SSE or 3DNow! is flagged in simd_support),
+ * else 0.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return (simd_support & (JSIMD_SSE2 | JSIMD_SSE | JSIMD_3DNOW)) ? 1 : 0;
+}
+
+/*
+ * Dispatch jsimd_quantize: SSE2 routine if SSE2 is flagged in simd_support,
+ * otherwise the MMX routine if MMX is flagged.
+ */
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_MMX)
+    jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+/*
+ * Dispatch jsimd_quantize_float to the first flagged instruction set in
+ * the order SSE2, SSE, 3DNow!.
+ */
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+    return;
+  }
+  if (simd_support & JSIMD_SSE) {
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_3DNOW)
+    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+/*
+ * Returns 1 if jsimd_idct_2x2() has a usable SIMD routine: the build
+ * parameters match and either SSE2 (with jconst_idct_red_sse2 passing
+ * IS_ALIGNED_SSE) or MMX is flagged in simd_support.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_red_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Returns 1 if jsimd_idct_4x4() has a usable SIMD routine: the build
+ * parameters match and either SSE2 (with jconst_idct_red_sse2 passing
+ * IS_ALIGNED_SSE) or MMX is flagged in simd_support.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_red_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Dispatch jsimd_idct_2x2: SSE2 routine when SSE2 is flagged and
+ * jconst_idct_red_sse2 passes IS_ALIGNED_SSE, otherwise the MMX routine
+ * when MMX is flagged.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_idct_red_sse2)) {
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block,
+                        output_buf, output_col);
+    return;
+  }
+
+  if (simd_support & JSIMD_MMX)
+    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block,
+                       output_buf, output_col);
+}
+
+/*
+ * Dispatch jsimd_idct_4x4: SSE2 routine when SSE2 is flagged and
+ * jconst_idct_red_sse2 passes IS_ALIGNED_SSE, otherwise the MMX routine
+ * when MMX is flagged.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_idct_red_sse2)) {
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block,
+                        output_buf, output_col);
+    return;
+  }
+
+  if (simd_support & JSIMD_MMX)
+    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block,
+                       output_buf, output_col);
+}
+
+/*
+ * Returns 1 if jsimd_idct_islow() has a usable SIMD routine: the build
+ * parameters match and either SSE2 (with jconst_idct_islow_sse2 passing
+ * IS_ALIGNED_SSE) or MMX is flagged in simd_support.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_islow_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Returns 1 if jsimd_idct_ifast() has a usable SIMD routine: the build
+ * parameters match and either SSE2 (with jconst_idct_ifast_sse2 passing
+ * IS_ALIGNED_SSE) or MMX is flagged in simd_support.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(IFAST_MULT_TYPE) != 2 ||
+      IFAST_SCALE_BITS != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Returns 1 if jsimd_idct_float() has a usable SIMD routine: the build
+ * parameters match and one of the following holds in simd_support --
+ * SSE2 with jconst_idct_float_sse2 passing IS_ALIGNED_SSE, SSE with
+ * jconst_idct_float_sse passing IS_ALIGNED_SSE, or 3DNow!.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  init_simd();
+
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4 ||
+      sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_float_sse2)) ||
+         ((simd_support & JSIMD_SSE) &&
+          IS_ALIGNED_SSE(jconst_idct_float_sse)) ||
+         (simd_support & JSIMD_3DNOW);
+}
+
+/*
+ * Dispatch jsimd_idct_islow: SSE2 routine when SSE2 is flagged and
+ * jconst_idct_islow_sse2 passes IS_ALIGNED_SSE, otherwise the MMX routine
+ * when MMX is flagged.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_idct_islow_sse2)) {
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block,
+                          output_buf, output_col);
+    return;
+  }
+
+  if (simd_support & JSIMD_MMX)
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block,
+                         output_buf, output_col);
+}
+
+/*
+ * Dispatch jsimd_idct_ifast: SSE2 routine when SSE2 is flagged and
+ * jconst_idct_ifast_sse2 passes IS_ALIGNED_SSE, otherwise the MMX routine
+ * when MMX is flagged.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) {
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block,
+                          output_buf, output_col);
+    return;
+  }
+
+  if (simd_support & JSIMD_MMX)
+    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block,
+                         output_buf, output_col);
+}
+
+/*
+ * Dispatch jsimd_idct_float to the first usable routine, in this order:
+ * SSE2 (flagged, jconst_idct_float_sse2 passes IS_ALIGNED_SSE), SSE
+ * (flagged, jconst_idct_float_sse passes IS_ALIGNED_SSE), 3DNow! (flagged).
+ * cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_idct_float_sse2)) {
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block,
+                          output_buf, output_col);
+    return;
+  }
+  if ((simd_support & JSIMD_SSE) &&
+      IS_ALIGNED_SSE(jconst_idct_float_sse)) {
+    jsimd_idct_float_sse(compptr->dct_table, coef_block,
+                         output_buf, output_col);
+    return;
+  }
+
+  if (simd_support & JSIMD_3DNOW)
+    jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
+                           output_buf, output_col);
+}
+
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
new file mode 100644
index 0000000..0da564c
--- /dev/null
+++ b/simd/jsimd_x86_64.c
@@ -0,0 +1,680 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * x86_64 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "simd/jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ *
+ * IS_ALIGNED(ptr, order) is nonzero when the address ptr is a multiple of
+ * 2^order bytes.  Both arguments are parenthesized so that an argument
+ * such as "base + n" is evaluated as a whole before the cast, and the
+ * address is converted through size_t rather than unsigned long, which is
+ * narrower than a pointer on LLP64 platforms.
+ */
+#define IS_ALIGNED(ptr, order) \
+  ((((size_t)(ptr)) & (((size_t)1 << (order)) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED((ptr), 4)) /* 16 byte alignment */
+
+/*
+ * Returns 1 if jsimd_rgb_ycc_convert() may be used: the build parameters
+ * match and jconst_rgb_ycc_convert_sse2 is 16-byte aligned.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_ycc_rgb_convert() may be used: the build parameters
+ * match and jconst_ycc_rgb_convert_sse2 is 16-byte aligned.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2) ? 1 : 0;
+}
+
+/*
+ * Call the SSE2 conversion routine that matches cinfo->in_color_space,
+ * passing cinfo->image_width and the remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+  /* Routine for every color space that has no dedicated case below. */
+  void (*convert)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_rgb_ycc_convert_sse2;
+
+  switch (cinfo->in_color_space) {
+    case JCS_EXT_RGB:   convert = jsimd_extrgb_ycc_convert_sse2;   break;
+    case JCS_EXT_RGBX:  convert = jsimd_extrgbx_ycc_convert_sse2;  break;
+    case JCS_EXT_BGR:   convert = jsimd_extbgr_ycc_convert_sse2;   break;
+    case JCS_EXT_BGRX:  convert = jsimd_extbgrx_ycc_convert_sse2;  break;
+    case JCS_EXT_XBGR:  convert = jsimd_extxbgr_ycc_convert_sse2;  break;
+    case JCS_EXT_XRGB:  convert = jsimd_extxrgb_ycc_convert_sse2;  break;
+    default:            break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/*
+ * Call the SSE2 conversion routine that matches cinfo->out_color_space,
+ * passing cinfo->output_width and the remaining arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+  /* Routine for every color space that has no dedicated case below. */
+  void (*convert)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
+    jsimd_ycc_rgb_convert_sse2;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:   convert = jsimd_ycc_extrgb_convert_sse2;   break;
+    case JCS_EXT_RGBX:  convert = jsimd_ycc_extrgbx_convert_sse2;  break;
+    case JCS_EXT_BGR:   convert = jsimd_ycc_extbgr_convert_sse2;   break;
+    case JCS_EXT_BGRX:  convert = jsimd_ycc_extbgrx_convert_sse2;  break;
+    case JCS_EXT_XBGR:  convert = jsimd_ycc_extxbgr_convert_sse2;  break;
+    case JCS_EXT_XRGB:  convert = jsimd_ycc_extxrgb_convert_sse2;  break;
+    default:            break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+/*
+ * Returns 1 if jsimd_h2v2_downsample() may be used, i.e. the build has
+ * 8-bit samples and a 4-byte JDIMENSION; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4);
+}
+
+/*
+ * Returns 1 if jsimd_h2v1_downsample() may be used, i.e. the build has
+ * 8-bit samples and a 4-byte JDIMENSION; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4);
+}
+
+/*
+ * Forward to jsimd_h2v2_downsample_sse2 with the image width, the maximum
+ * and per-component vertical sampling factors and the component's width
+ * in blocks taken from cinfo and compptr.
+ */
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor,
+                             compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+/*
+ * Forward to jsimd_h2v1_downsample_sse2 with the image width, the maximum
+ * and per-component vertical sampling factors and the component's width
+ * in blocks taken from cinfo and compptr.
+ */
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor,
+                             compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+/*
+ * Returns 1 if jsimd_h2v2_upsample() may be used, i.e. the build has
+ * 8-bit samples and a 4-byte JDIMENSION; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4);
+}
+
+/*
+ * Returns 1 if jsimd_h2v1_upsample() may be used, i.e. the build has
+ * 8-bit samples and a 4-byte JDIMENSION; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4);
+}
+
+/*
+ * Forward to jsimd_h2v2_upsample_sse2 with cinfo->max_v_samp_factor and
+ * cinfo->output_width.  compptr is not used here.
+ */
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+/*
+ * Forward to jsimd_h2v1_upsample_sse2 with cinfo->max_v_samp_factor and
+ * cinfo->output_width.  compptr is not used here.
+ */
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr,
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+/*
+ * Returns 1 if jsimd_h2v2_fancy_upsample() may be used: the build has
+ * 8-bit samples and a 4-byte JDIMENSION, and jconst_fancy_upsample_sse2
+ * is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_fancy_upsample_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_h2v1_fancy_upsample() may be used: the build has
+ * 8-bit samples and a 4-byte JDIMENSION, and jconst_fancy_upsample_sse2
+ * is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_fancy_upsample_sse2) ? 1 : 0;
+}
+
+/*
+ * Forward to the SSE2 h2v2 fancy upsampling routine, passing
+ * cinfo->max_v_samp_factor and compptr->downsampled_width together with
+ * the input and output arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  /* Must be the h2v2 routine; the h2v1 routine belongs to
+   * jsimd_h2v1_fancy_upsample(). */
+  jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width,
+                                 input_data, output_data_ptr);
+}
+
+/*
+ * Forward to jsimd_h2v1_fancy_upsample_sse2 with
+ * cinfo->max_v_samp_factor and compptr->downsampled_width.
+ */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr,
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width,
+                                 input_data, output_data_ptr);
+}
+
+/*
+ * Returns 1 if jsimd_h2v2_merged_upsample() may be used: the build has
+ * 8-bit samples and a 4-byte JDIMENSION, and jconst_merged_upsample_sse2
+ * is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_merged_upsample_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_h2v1_merged_upsample() may be used: the build has
+ * 8-bit samples and a 4-byte JDIMENSION, and jconst_merged_upsample_sse2
+ * is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_merged_upsample_sse2) ? 1 : 0;
+}
+
+/*
+ * Call the SSE2 h2v2 merged-upsample routine that matches
+ * cinfo->out_color_space, passing cinfo->output_width and the remaining
+ * arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  /* Routine for every color space that has no dedicated case below. */
+  void (*upsample)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v2_merged_upsample_sse2;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      upsample = jsimd_h2v2_extrgb_merged_upsample_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      upsample = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGR:
+      upsample = jsimd_h2v2_extbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      upsample = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      upsample = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      upsample = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+      break;
+    default:
+      break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Call the SSE2 h2v1 merged-upsample routine that matches
+ * cinfo->out_color_space, passing cinfo->output_width and the remaining
+ * arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+  /* Routine for every color space that has no dedicated case below. */
+  void (*upsample)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY) =
+    jsimd_h2v1_merged_upsample_sse2;
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      upsample = jsimd_h2v1_extrgb_merged_upsample_sse2;
+      break;
+    case JCS_EXT_RGBX:
+      upsample = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGR:
+      upsample = jsimd_h2v1_extbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_BGRX:
+      upsample = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XBGR:
+      upsample = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+      break;
+    case JCS_EXT_XRGB:
+      upsample = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+      break;
+    default:
+      break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Returns 1 if jsimd_convsamp() may be used, i.e. DCTSIZE is 8, samples
+ * are 8-bit, JDIMENSION is 4 bytes and DCTELEM is 2 bytes; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (DCTSIZE == 8 && BITS_IN_JSAMPLE == 8 &&
+          sizeof(JDIMENSION) == 4 && sizeof(DCTELEM) == 2);
+}
+
+/*
+ * Returns 1 if jsimd_convsamp_float() may be used, i.e. DCTSIZE is 8,
+ * samples are 8-bit, JDIMENSION is 4 bytes and FAST_FLOAT is 4 bytes;
+ * 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (DCTSIZE == 8 && BITS_IN_JSAMPLE == 8 &&
+          sizeof(JDIMENSION) == 4 && sizeof(FAST_FLOAT) == 4);
+}
+
+/* Forward all arguments unchanged to jsimd_convsamp_sse2. */
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data,
+                JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+  jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+/* Forward all arguments unchanged to jsimd_convsamp_float_sse2. */
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data,
+                      JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+/*
+ * Returns 1 if jsimd_fdct_islow() may be used: DCTSIZE is 8, DCTELEM is
+ * 2 bytes and jconst_fdct_islow_sse2 is 16-byte aligned.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_fdct_islow_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_fdct_ifast() may be used: DCTSIZE is 8, DCTELEM is
+ * 2 bytes and jconst_fdct_ifast_sse2 is 16-byte aligned.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_fdct_ifast_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_fdct_float() may be used: DCTSIZE is 8, FAST_FLOAT
+ * is 4 bytes and jconst_fdct_float_sse is 16-byte aligned.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_fdct_float_sse) ? 1 : 0;
+}
+
+/* Forward the data pointer unchanged to jsimd_fdct_islow_sse2. */
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+  jsimd_fdct_islow_sse2(data);
+}
+
+/* Forward the data pointer unchanged to jsimd_fdct_ifast_sse2. */
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+  jsimd_fdct_ifast_sse2(data);
+}
+
+/* Forward the data pointer unchanged to jsimd_fdct_float_sse (the SSE,
+ * not SSE2, routine). */
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+  jsimd_fdct_float_sse(data);
+}
+
+/*
+ * Returns 1 if jsimd_quantize() may be used, i.e. DCTSIZE is 8 and both
+ * JCOEF and DCTELEM are 2 bytes; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (DCTSIZE == 8 && sizeof(JCOEF) == 2 && sizeof(DCTELEM) == 2);
+}
+
+/*
+ * Returns 1 if jsimd_quantize_float() may be used, i.e. DCTSIZE is 8,
+ * JCOEF is 2 bytes and FAST_FLOAT is 4 bytes; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  return (DCTSIZE == 8 && sizeof(JCOEF) == 2 && sizeof(FAST_FLOAT) == 4);
+}
+
+/* Forward all arguments unchanged to jsimd_quantize_sse2. */
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block,
+                DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+  jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+/* Forward all arguments unchanged to jsimd_quantize_float_sse2. */
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block,
+                      FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+/*
+ * Returns 1 if jsimd_idct_2x2() may be used: the build parameters match
+ * and jconst_idct_red_sse2 is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_idct_red_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_idct_4x4() may be used: the build parameters match
+ * and jconst_idct_red_sse2 is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_idct_red_sse2) ? 1 : 0;
+}
+
+/*
+ * Forward to jsimd_idct_2x2_sse2 with compptr->dct_table as the first
+ * argument.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block,
+                      output_buf, output_col);
+}
+
+/*
+ * Forward to jsimd_idct_4x4_sse2 with compptr->dct_table as the first
+ * argument.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block,
+                      output_buf, output_col);
+}
+
+/*
+ * Returns 1 if jsimd_idct_islow() may be used: the build parameters match
+ * and jconst_idct_islow_sse2 is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_idct_islow_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_idct_ifast() may be used: the build parameters match
+ * and jconst_idct_ifast_sse2 is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(IFAST_MULT_TYPE) != 2 ||
+      IFAST_SCALE_BITS != 2)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_idct_ifast_sse2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 if jsimd_idct_float() may be used: the build parameters match
+ * and jconst_idct_float_sse2 is 16-byte aligned.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  /* The SIMD code is written for this exact configuration. */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4 ||
+      sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_idct_float_sse2) ? 1 : 0;
+}
+
+/*
+ * Forward to jsimd_idct_islow_sse2 with compptr->dct_table as the first
+ * argument.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_islow_sse2(compptr->dct_table, coef_block,
+                        output_buf, output_col);
+}
+
+/*
+ * Forward to jsimd_idct_ifast_sse2 with compptr->dct_table as the first
+ * argument.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block,
+                        output_buf, output_col);
+}
+
+/*
+ * Forward to jsimd_idct_float_sse2 with compptr->dct_table as the first
+ * argument.  cinfo is not used here.
+ */
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo,
+                  jpeg_component_info * compptr,
+                  JCOEFPTR coef_block,
+                  JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
+{
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
new file mode 100644
index 0000000..4876038
--- /dev/null
+++ b/simd/jsimdcfg.inc.h
@@ -0,0 +1,168 @@
+// This file generates the include file for the assembly
+// implementations by abusing the C preprocessor.
+//
+// Note: Some things are manually defined as they need to
+// be mapped to NASM types.
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+// define(var) expands to "%define _cpp_protection_<var>"; definev(var)
+// expands to "%define _cpp_protection_<var> <value of var>".  Because the
+// name is an operand of ##, the preprocessor pastes it onto the prefix
+// without expanding it, while the trailing bare "var" in definev is
+// expanded to the macro's value.
+// NOTE(review): the "_cpp_protection_" prefix is presumably stripped by a
+// later build step before NASM reads the output -- confirm in the makefile.
+#define define(var) %define _cpp_protection_##var
+#define definev(var) %define _cpp_protection_##var var
+
+;
+; -- jpeglib.h
+;
+
+definev(DCTSIZE)
+definev(DCTSIZE2)
+
+;
+; -- jmorecfg.h
+;
+
+definev(RGB_RED)
+definev(RGB_GREEN)
+definev(RGB_BLUE)
+
+definev(RGB_PIXELSIZE)
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+
+definev(CENTERJSAMPLE)
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+
+%define JSAMPROW POINTER ; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF FAR * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+
+%define FAST_FLOAT FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+definev(JSIMD_NONE)
+definev(JSIMD_MMX)
+definev(JSIMD_3DNOW)
+definev(JSIMD_SSE)
+definev(JSIMD_SSE2)
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+definev(jpeg_simd_cpu_support)
+definev(jsimd_rgb_ycc_convert_mmx)
+definev(jsimd_ycc_rgb_convert_mmx)
+definev(jconst_rgb_ycc_convert_sse2)
+definev(jsimd_rgb_ycc_convert_sse2)
+definev(jconst_ycc_rgb_convert_sse2)
+definev(jsimd_ycc_rgb_convert_sse2)
+definev(jsimd_h2v2_downsample_mmx)
+definev(jsimd_h2v1_downsample_mmx)
+definev(jsimd_h2v2_downsample_sse2)
+definev(jsimd_h2v1_downsample_sse2)
+definev(jsimd_h2v2_upsample_mmx)
+definev(jsimd_h2v1_upsample_mmx)
+definev(jsimd_h2v1_fancy_upsample_mmx)
+definev(jsimd_h2v2_fancy_upsample_mmx)
+definev(jsimd_h2v1_merged_upsample_mmx)
+definev(jsimd_h2v2_merged_upsample_mmx)
+definev(jsimd_h2v2_upsample_sse2)
+definev(jsimd_h2v1_upsample_sse2)
+definev(jconst_fancy_upsample_sse2)
+definev(jsimd_h2v1_fancy_upsample_sse2)
+definev(jsimd_h2v2_fancy_upsample_sse2)
+definev(jconst_merged_upsample_sse2)
+definev(jsimd_h2v1_merged_upsample_sse2)
+definev(jsimd_h2v2_merged_upsample_sse2)
+definev(jsimd_convsamp_mmx)
+definev(jsimd_convsamp_sse2)
+definev(jsimd_convsamp_float_3dnow)
+definev(jsimd_convsamp_float_sse)
+definev(jsimd_convsamp_float_sse2)
+definev(jsimd_fdct_islow_mmx)
+definev(jsimd_fdct_ifast_mmx)
+definev(jconst_fdct_islow_sse2)
+definev(jsimd_fdct_islow_sse2)
+definev(jconst_fdct_ifast_sse2)
+definev(jsimd_fdct_ifast_sse2)
+definev(jsimd_fdct_float_3dnow)
+definev(jconst_fdct_float_sse)
+definev(jsimd_fdct_float_sse)
+definev(jsimd_quantize_mmx)
+definev(jsimd_quantize_sse2)
+definev(jsimd_quantize_float_3dnow)
+definev(jsimd_quantize_float_sse)
+definev(jsimd_quantize_float_sse2)
+definev(jsimd_idct_2x2_mmx)
+definev(jsimd_idct_4x4_mmx)
+definev(jconst_idct_red_sse2)
+definev(jsimd_idct_2x2_sse2)
+definev(jsimd_idct_4x4_sse2)
+definev(jsimd_idct_islow_mmx)
+definev(jsimd_idct_ifast_mmx)
+definev(jconst_idct_islow_sse2)
+definev(jsimd_idct_islow_sse2)
+definev(jconst_idct_ifast_sse2)
+definev(jsimd_idct_ifast_sse2)
+definev(jsimd_idct_float_3dnow)
+definev(jconst_idct_float_sse)
+definev(jsimd_idct_float_sse)
+definev(jconst_idct_float_sse2)
+definev(jsimd_idct_float_sse2)
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm
new file mode 100644
index 0000000..bdbcc23
--- /dev/null
+++ b/simd/jsimdcpu.asm
@@ -0,0 +1,105 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+
+ align 16
+ global EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+ push ebx ; ebx is restored before returning
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+ push edi
+
+ xor edi,edi ; simd support flag
+
+ pushfd
+ pop eax ; eax = current EFLAGS
+ mov edx,eax ; edx = copy of the original EFLAGS for comparison
+ xor eax, 1<<21 ; flip ID bit in EFLAGS
+ push eax
+ popfd ; try to write the modified value back
+ pushfd
+ pop eax ; eax = EFLAGS as actually stored
+ xor eax,edx ; nonzero only if the ID bit really changed
+ jz short .return ; CPUID is not supported
+
+ ; Check for MMX/SSE/SSE2 instruction support
+ xor eax,eax ; CPUID function 0
+ cpuid
+ test eax,eax ; eax = 0 means function 1 cannot be queried
+ jz short .return
+
+ xor eax,eax
+ inc eax ; CPUID function 1
+ cpuid
+ mov eax,edx ; eax = Standard feature flags
+
+ test eax, 1<<23 ; bit23:MMX
+ jz short .no_mmx
+ or edi, byte JSIMD_MMX
+.no_mmx:
+ test eax, 1<<25 ; bit25:SSE
+ jz short .no_sse
+ or edi, byte JSIMD_SSE
+.no_sse:
+ test eax, 1<<26 ; bit26:SSE2
+ jz short .no_sse2
+ or edi, byte JSIMD_SSE2
+.no_sse2:
+
+ ; Check for 3DNow! instruction support
+ mov eax, 0x80000000 ; query the extended CPUID range
+ cpuid
+ cmp eax, 0x80000000
+ jbe short .return ; function 0x80000001 is not available
+
+ mov eax, 0x80000001
+ cpuid
+ mov eax,edx ; eax = Extended feature flags
+
+ test eax, 1<<31 ; bit31:3DNow!(vendor independent)
+ jz short .no_3dnow
+ or edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+ mov eax,edi ; return value = accumulated JSIMD_* flags
+
+ pop edi
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 16
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
new file mode 100644
index 0000000..4695360
--- /dev/null
+++ b/simd/jsimdext.inc
@@ -0,0 +1,317 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+;
+; [TAB8]
+
+; ==========================================================================
+; System-dependent configurations
+
+%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%define SEG_TEXT .text align=16 public use32 class=CODE
+%define SEG_CONST .rdata align=16 public use32 class=CONST
+
+%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text align=16 public use32 class=CODE
+%define SEG_CONST .data align=16 public use32 class=DATA
+
+%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT .text progbits align=16
+%define SEG_CONST .rodata progbits align=16
+%else
+%define SEG_TEXT .text progbits alloc exec nowrite align=16
+%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
+
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
+%define SEG_CONST .rodata align=16
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
+
+%else ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+%endif ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+; Common types
+;
+%ifdef __x86_64__
+%define POINTER qword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
+%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%else
+%define POINTER dword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
+%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%endif
+
+%define INT dword ; signed integer type
+%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
+%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
+
+%define FP32 dword ; IEEE754 single
+%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
+%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD qword ; int64 (MMX register)
+%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
+%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD ; int128 (SSE register)
+%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
+%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE 1 ; sizeof(BYTE)
+%define SIZEOF_WORD 2 ; sizeof(WORD)
+%define SIZEOF_DWORD 4 ; sizeof(DWORD)
+%define SIZEOF_QWORD 8 ; sizeof(QWORD)
+%define SIZEOF_OWORD 16 ; sizeof(OWORD)
+
+%define BYTE_BIT 8 ; CHAR_BIT in C
+%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
+%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+; External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name) _ %+ name ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+; Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+ SECTION SEG_CONST
+const_base:
+
+%define GOTOFF(got,sym) (got) + (sym) - const_base
+
+%imacro get_GOT 1
+ ; NOTE: this macro destroys the ecx register.
+ call %%geteip ; ecx = address of the instruction after this call
+ add ecx, byte (%%ref - $) ; ecx = run-time address of %%ref
+ jmp short %%adjust
+%%geteip:
+ mov ecx, POINTER [esp] ; load the return address pushed by the call
+ ret
+%%adjust:
+ push ebp ; ebp is borrowed as a zero index for the hand-encoded lea
+ xor ebp,ebp ; ebp = 0
+%ifidni %1,ebx ; (%1 == ebx)
+ ; db 0x8D,0x9C + jmp near const_base =
+ ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+ db 0x8D,0x9C ; 8D,9C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+%else ; (%1 != ebx)
+ ; db 0x8D,0x8C + jmp near const_base =
+ ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+ db 0x8D,0x8C ; 8D,8C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref: mov %1, ecx ; hand the computed const_base address to the requested register
+%endif ; (%1 == ebx)
+ pop ebp
+%endmacro
+
+%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+ extern GOT_SYMBOL
+ call %%geteip
+ add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+ jmp short %%done
+%%geteip:
+ mov %1, POINTER [esp]
+ ret
+%%done:
+%endmacro
+
+%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+ push %1
+%endmacro
+%imacro poppic 1.nolist
+ pop %1
+%endmacro
+%imacro movpic 2.nolist
+ mov %1,%2
+%endmacro
+
+%else ; !PIC -----------------------------------------
+
+%define GOTOFF(got,sym) (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic 1.nolist
+%endmacro
+%imacro movpic 2.nolist
+%endmacro
+
+%endif ; PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+; Align the next instruction on {2,4,8,16,..}-byte boundary.
+; ".balign n,,m" in GNU as
+;
+%define MSKLE(x,y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b,n) (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
+ db 0x90 ; nop
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
+ db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
+ db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
+ db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
+ db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
+ db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
+ db 0x8B,0xED ; mov ebp,ebp
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
+ db 0x90 ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+ align %1, db 0 ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+%imacro collect_args 0
+ push r10 ; save r10-r15 so uncollect_args can restore them
+ push r11
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r10, rdi ; copy rdi, rsi, rdx, rcx, r8, r9 into r10..r15, in that order
+ mov r11, rsi ; (presumably the six integer argument registers; confirm against the ABI in use)
+ mov r12, rdx
+ mov r13, rcx
+ mov r14, r8
+ mov r15, r9
+%endmacro
+
+%imacro uncollect_args 0
+ pop r15 ; restore r15..r10 in the reverse order of the pushes in collect_args
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+%endmacro
+
+%endif
+
+; --------------------------------------------------------------------------
+; Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/simd/nasm_lt.sh b/simd/nasm_lt.sh
new file mode 100755
index 0000000..b112862
--- /dev/null
+++ b/simd/nasm_lt.sh
@@ -0,0 +1,57 @@
+#! /bin/sh
+command="" # translated command line, rebuilt one argument at a time
+infile="" # last *.asm argument seen; used to derive a default output name
+o_opt=no # becomes yes once an -o option has been seen
+pic=no # becomes yes once -DPIC has been added to the command
+while [ $# -gt 0 ]; do
+ case "$1" in
+ -DPIC|-fPIC|-fpic) # any PIC request becomes a single -DPIC
+ if [ "$pic" != "yes" ] ; then
+ command="$command -DPIC"
+ pic=yes
+ fi
+ ;;
+ -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
+ -fobj|-fwin32|-frdf|-fieee|-fmacho|-fmacho64)
+ # it's a file format specifier for nasm.
+ command="$command $1"
+ ;;
+ -f*)
+ # any other -f option is assumed to be a gcc code-generation flag and is dropped.
+ ;;
+ -[Ii]*) # include directory, given as -Idir or as -I dir
+ incdir=`echo "$1" | sed 's/^-[Ii]//'`
+ if [ "x$incdir" = x -a "x$2" != x ] ; then
+ case "$2" in
+ -*) ;;
+ *) incdir="$2"; shift;;
+ esac
+ fi
+ if [ "x$incdir" != x ] ; then
+ # In the case of NASM, the trailing slash is necessary.
+ incdir=`echo "$incdir" | sed 's%/*$%/%'`
+ command="$command -I$incdir"
+ fi
+ ;;
+ -o*) # explicit output option; a separate file name argument is passed on by the default branch
+ o_opt=yes
+ command="$command $1"
+ ;;
+ *.asm)
+ infile=$1
+ command="$command $1"
+ ;;
+ *) # everything else is passed through unchanged
+ command="$command $1"
+ ;;
+ esac
+ shift
+done
+if [ "$o_opt" != yes ] ; then
+ # By default, NASM creates an output file in the same directory as the
+ # input file, so name <basename>.o explicitly, without the directory part.
+ outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
+ command="$command $outfile"
+fi
+echo $command
+exec $command
diff --git a/testimg.bmp b/testimg.bmp
index 8603d15..d67e025 100644
--- a/testimg.bmp
+++ b/testimg.bmp
Binary files differ
diff --git a/testimg.jpg b/testimg.jpg
index b34ca5d..76d223f 100644
--- a/testimg.jpg
+++ b/testimg.jpg
Binary files differ
diff --git a/testimg.ppm b/testimg.ppm
index 9d81ce2..b391684 100644
--- a/testimg.ppm
+++ b/testimg.ppm
Binary files differ
diff --git a/testimgp.jpg b/testimgp.jpg
index 8cbb658..b78f6d0 100644
--- a/testimgp.jpg
+++ b/testimgp.jpg
Binary files differ
diff --git a/turbojpeg.h b/turbojpeg.h
new file mode 100644
index 0000000..808e2f3
--- /dev/null
+++ b/turbojpeg.h
@@ -0,0 +1,231 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005, 2006 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
+#if (defined(_MSC_VER) || defined(__CYGWIN__) || defined(__MINGW32__)) && defined(_WIN32) && defined(DLLDEFINE)
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
+#define DLLCALL
+
+/* Subsampling */
+#define NUMSUBOPT 4
+
+enum {TJ_444=0, TJ_422, TJ_420, TJ_GRAYSCALE};
+
+/* Flags */
+#define TJ_BGR 1
+#define TJ_BOTTOMUP 2
+#define TJ_FORCEMMX 8 /* Force IPP to use MMX code even if SSE available */
+#define TJ_FORCESSE 16 /* Force IPP to use SSE1 code even if SSE2 available */
+#define TJ_FORCESSE2 32 /* Force IPP to use SSE2 code (useful if auto-detect is not working properly) */
+#define TJ_ALPHAFIRST 64 /* BGR buffer is ABGR and RGB buffer is ARGB */
+#define TJ_FORCESSE3 128 /* Force IPP to use SSE3 code (useful if auto-detect is not working properly) */
+#define TJ_FASTUPSAMPLE 256 /* Use fast, inaccurate 4:2:2 and 4:2:0 YUV upsampling routines in libjpeg decompressor */
+
+typedef void* tjhandle;
+
+#define TJPAD(p) (((p)+3)&(~3))
+#ifndef max
+ #define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* API follows */
+
+
+/*
+ tjhandle tjInitCompress(void)
+
+ Creates a new JPEG compressor instance, allocates memory for the structures,
+ and returns a handle to the instance. Most applications will only
+ need to call this once at the beginning of the program or once for each
+ concurrent thread. Don't try to create a new instance every time you
+ compress an image, because this will cause performance to suffer.
+
+ RETURNS: NULL on error
+*/
+DLLEXPORT tjhandle DLLCALL tjInitCompress(void);
+
+
+/*
+ int tjCompress(tjhandle j,
+ unsigned char *srcbuf, int width, int pitch, int height, int pixelsize,
+ unsigned char *dstbuf, unsigned long *size,
+ int jpegsubsamp, int jpegqual, int flags)
+
+ [INPUT] j = instance handle previously returned from a call to
+ tjInitCompress()
+ [INPUT] srcbuf = pointer to user-allocated image buffer containing pixels in
+ RGB(A) or BGR(A) form
+ [INPUT] width = width (in pixels) of the source image
+ [INPUT] pitch = bytes per line of the source image (width*pixelsize if the
+ bitmap is unpadded, else TJPAD(width*pixelsize) if each line of the bitmap
+ is padded to the nearest 32-bit boundary, such as is the case for Windows
+ bitmaps. You can also be clever and use this parameter to skip lines, etc.,
+ as long as the pitch is greater than 0.)
+ [INPUT] height = height (in pixels) of the source image
+ [INPUT] pixelsize = size (in bytes) of each pixel in the source image
+ RGBA and BGRA: 4, RGB and BGR: 3
+ [INPUT] dstbuf = pointer to user-allocated image buffer which will receive
+ the JPEG image. Use the function TJBUFSIZE(width, height) to determine
+ the appropriate size for this buffer based on the image width and height.
+ [OUTPUT] size = pointer to unsigned long which receives the size (in bytes)
+ of the compressed image
+ [INPUT] jpegsubsamp = Specifies either 4:2:0, 4:2:2, or 4:4:4 subsampling.
+ When the image is converted from the RGB to YCbCr colorspace as part of the
+ JPEG compression process, every other Cb and Cr (chrominance) pixel can be
+ discarded to produce a smaller image with little perceptible loss of
+ image clarity (the human eye is more sensitive to small changes in
+ brightness than small changes in color.)
+
+ TJ_420: 4:2:0 subsampling. Discards every other Cb, Cr pixel in both
+ horizontal and vertical directions.
+ TJ_422: 4:2:2 subsampling. Discards every other Cb, Cr pixel only in
+ the horizontal direction.
+ TJ_444: no subsampling.
+ TJ_GRAYSCALE: Generate grayscale JPEG image
+
+ [INPUT] jpegqual = JPEG quality (an integer between 0 and 100 inclusive.)
+ [INPUT] flags = the bitwise OR of one or more of the following
+
+ TJ_BGR: The components of each pixel in the source image are stored in
+ B,G,R order, not R,G,B
+ TJ_BOTTOMUP: The source image is stored in bottom-up (Windows) order,
+ not top-down
+ TJ_FORCEMMX: Valid only for the Intel Performance Primitives implementation
+ of this codec-- force IPP to use MMX code (bypass CPU auto-detection)
+ TJ_FORCESSE: Valid only for the Intel Performance Primitives implementation
+ of this codec-- force IPP to use SSE code (bypass CPU auto-detection)
+ TJ_FORCESSE2: Valid only for the Intel Performance Primitives implementation
+ of this codec-- force IPP to use SSE2 code (bypass CPU auto-detection)
+ TJ_FORCESSE3: Valid only for the Intel Performance Primitives implementation
+ of this codec-- force IPP to use SSE3 code (bypass CPU auto-detection)
+
+ RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjCompress(tjhandle j,
+ unsigned char *srcbuf, int width, int pitch, int height, int pixelsize,
+ unsigned char *dstbuf, unsigned long *size,
+ int jpegsubsamp, int jpegqual, int flags);
+
+DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height);
+
+/*
+ tjhandle tjInitDecompress(void)
+
+ Creates a new JPEG decompressor instance, allocates memory for the
+ structures, and returns a handle to the instance. Most applications will
+ only need to call this once at the beginning of the program or once for each
+ concurrent thread. Don't try to create a new instance every time you
+ decompress an image, because this will cause performance to suffer.
+
+ RETURNS: NULL on error
+*/
+DLLEXPORT tjhandle DLLCALL tjInitDecompress(void);
+
+
+/*
+ int tjDecompressHeader(tjhandle j,
+ unsigned char *srcbuf, unsigned long size,
+ int *width, int *height)
+
+ [INPUT] j = instance handle previously returned from a call to
+ tjInitDecompress()
+ [INPUT] srcbuf = pointer to a user-allocated buffer containing the JPEG image
+ to decompress
+ [INPUT] size = size of the JPEG image buffer (in bytes)
+ [OUTPUT] width = width (in pixels) of the JPEG image
+ [OUTPUT] height = height (in pixels) of the JPEG image
+
+ RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle j,
+ unsigned char *srcbuf, unsigned long size,
+ int *width, int *height);
+
+
+/*
+ int tjDecompress(tjhandle j,
+ unsigned char *srcbuf, unsigned long size,
+ unsigned char *dstbuf, int width, int pitch, int height, int pixelsize,
+ int flags)
+
+ [INPUT] j = instance handle previously returned from a call to
+ tjInitDecompress()
+ [INPUT] srcbuf = pointer to a user-allocated buffer containing the JPEG image
+ to decompress
+ [INPUT] size = size of the JPEG image buffer (in bytes)
+ [INPUT] dstbuf = pointer to user-allocated image buffer which will receive
+ the bitmap image. This buffer should normally be pitch*height
+ bytes in size, although this pointer may also be used to decompress into
+ a specific region of a larger buffer.
+ [INPUT] width = width (in pixels) of the destination image
+ [INPUT] pitch = bytes per line of the destination image (width*pixelsize if the
+ bitmap is unpadded, else TJPAD(width*pixelsize) if each line of the bitmap
+ is padded to the nearest 32-bit boundary, such as is the case for Windows
+ bitmaps. You can also be clever and use this parameter to skip lines, etc.,
+ as long as the pitch is greater than 0.)
+ [INPUT] height = height (in pixels) of the destination image
+ [INPUT] pixelsize = size (in bytes) of each pixel in the destination image
+ RGBA/RGBx and BGRA/BGRx: 4, RGB and BGR: 3
+ [INPUT] flags = the bitwise OR of one or more of the following
+
+ TJ_BGR: The components of each pixel in the destination image should be
+ written in B,G,R order, not R,G,B
+ TJ_BOTTOMUP: The destination image should be stored in bottom-up
+ (Windows) order, not top-down
+ TJ_FORCEMMX: Valid only for the Intel Performance Primitives implementation
+ of this codec-- force IPP to use MMX code (bypass CPU auto-detection)
+ TJ_FORCESSE: Valid only for the Intel Performance Primitives implementation
+ of this codec-- force IPP to use SSE code (bypass CPU auto-detection)
+ TJ_FORCESSE2: Valid only for the Intel Performance Primitives implementation
+ of this codec-- force IPP to use SSE2 code (bypass CPU auto-detection)
+
+ RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjDecompress(tjhandle j,
+ unsigned char *srcbuf, unsigned long size,
+ unsigned char *dstbuf, int width, int pitch, int height, int pixelsize,
+ int flags);
+
+
+/*
+ int tjDestroy(tjhandle h)
+
+ Frees structures associated with a compression or decompression instance
+
+ [INPUT] h = instance handle (returned from a previous call to
+ tjInitCompress() or tjInitDecompress())
+
+ RETURNS: 0 on success, -1 on error
+*/
+DLLEXPORT int DLLCALL tjDestroy(tjhandle h);
+
+
+/*
+ char *tjGetErrorStr(void)
+
+ Returns a descriptive error message explaining why the last command failed
+*/
+DLLEXPORT char* DLLCALL tjGetErrorStr(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/turbojpegl.c b/turbojpegl.c
new file mode 100644
index 0000000..82acf23
--- /dev/null
+++ b/turbojpegl.c
@@ -0,0 +1,354 @@
+/* Copyright (C)2004 Landmark Graphics Corporation
+ * Copyright (C)2005 Sun Microsystems, Inc.
+ * Copyright (C)2009 D. R. Commander
+ *
+ * This library is free software and may be redistributed and/or modified under
+ * the terms of the wxWindows Library License, Version 3.1 or (at your option)
+ * any later version. The full license is in the LICENSE.txt file included
+ * with this distribution.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * wxWindows Library License for more details.
+ */
+
+// This implements a JPEG compressor/decompressor using the libjpeg API
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <jpeglib.h>
+#include <jerror.h>
+#include <setjmp.h>
+#include "./turbojpeg.h"
+
+
+// Error handling
+
+static char lasterror[JMSG_LENGTH_MAX]="No error";
+
+typedef struct _error_mgr
+{
+ struct jpeg_error_mgr pub; // my_error_exit() casts cinfo->err to error_mgr *, so pub has to stay the first member
+ jmp_buf jb; // setjmp() context that my_error_exit() longjmp()s back to
+} error_mgr;
+
+static void my_error_exit(j_common_ptr cinfo)
+{
+	// Record the message through output_message, then unwind to the setjmp() armed by the API call
+	(*cinfo->err->output_message)(cinfo);
+	longjmp(((error_mgr *)cinfo->err)->jb, 1);
+}
+
+static void my_output_message(j_common_ptr cinfo)
+{
+ (*cinfo->err->format_message)(cinfo, lasterror); // format the pending libjpeg message into lasterror
+}
+
+
+// Global structures, macros, etc.
+
+typedef struct _jpgstruct
+{
+ struct jpeg_compress_struct cinfo; // libjpeg compressor object, created by tjInitCompress()
+ struct jpeg_decompress_struct dinfo; // libjpeg decompressor object, created by tjInitDecompress()
+ struct jpeg_destination_mgr jdms; // destination manager installed as cinfo.dest
+ struct jpeg_source_mgr jsms; // source manager installed as dinfo.src
+ error_mgr jerr; // error manager plus the jmp_buf used to recover from libjpeg errors
+ int initc, initd; // set to 1 once the compressor / decompressor has been created
+} jpgstruct;
+
+static const int hsampfactor[NUMSUBOPT]={1, 2, 2, 1};
+static const int vsampfactor[NUMSUBOPT]={1, 1, 2, 1};
+
+#define _throw(c) {sprintf(lasterror, "%s", c); return -1;} /* store message c in lasterror and return -1 from the enclosing function */
+#define _catch(f) {if((f)==-1) return -1;} /* propagate a -1 result of f to the caller */
+#define checkhandle(h) jpgstruct *j=(jpgstruct *)h; /* declares the local j from handle h */ \
+ if(!j) _throw("Invalid handle");
+
+
+// CO
+
+static boolean empty_output_buffer(struct jpeg_compress_struct *cinfo)
+{ // installed as jdms.empty_output_buffer: raises an error instead of supplying more output space
+ ERREXIT(cinfo, JERR_BUFFER_SIZE);
+ return TRUE; // not reached when error_exit is my_error_exit, which longjmp()s
+}
+
+static void destination_noop(struct jpeg_compress_struct *cinfo)
+{ // installed as both init_destination and term_destination; intentionally does nothing
+}
+
+DLLEXPORT tjhandle DLLCALL tjInitCompress(void)
+{
+	// Zero-filled instance, so initc and initd start out as 0
+	jpgstruct *inst=(jpgstruct *)calloc(1, sizeof(jpgstruct));
+	if(inst==NULL) {sprintf(lasterror, "Memory allocation failure");  return NULL;}
+
+	// Route libjpeg errors through my_error_exit(), which longjmp()s to the setjmp() below
+	inst->cinfo.err=jpeg_std_error(&inst->jerr.pub);
+	inst->jerr.pub.error_exit=my_error_exit;
+	inst->jerr.pub.output_message=my_output_message;
+	if(setjmp(inst->jerr.jb))
+	{  // reached when jpeg_create_compress() reports an error
+		free(inst);  return NULL;
+	}
+
+	// Destination manager: no-op init/term, and running out of space is an error
+	jpeg_create_compress(&inst->cinfo);
+	inst->cinfo.dest=&inst->jdms;
+	inst->jdms.init_destination=destination_noop;
+	inst->jdms.empty_output_buffer=empty_output_buffer;
+	inst->jdms.term_destination=destination_noop;
+	inst->initc=1;
+	return (tjhandle)inst;
+}
+
+DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height)
+{
+	// Room for an image that doesn't compress; widen before multiplying so large images don't overflow int
+	return (unsigned long)((width+15)&(~15)) * (unsigned long)((height+15)&(~15)) * 6 + 2048;
+}
+
+DLLEXPORT int DLLCALL tjCompress(tjhandle h,
+	unsigned char *srcbuf, int width, int pitch, int height, int ps,
+	unsigned char *dstbuf, unsigned long *size,
+	int jpegsub, int qual, int flags)
+{
+	int i;  JSAMPROW *row_pointer=NULL;
+	// Compress the packed RGB(A)/BGR(A) image in srcbuf into dstbuf; parameters are documented in turbojpeg.h.
+	// Returns 0 and stores the JPEG size in *size, or returns -1 with the message left in lasterror.
+	checkhandle(h);
+	if(srcbuf==NULL || width<=0 || pitch<0 || height<=0
+		|| dstbuf==NULL || size==NULL
+		|| jpegsub<0 || jpegsub>=NUMSUBOPT || qual<0 || qual>100)
+		_throw("Invalid argument in tjCompress()");
+	if(ps!=3 && ps!=4) _throw("This compressor can only take 24-bit or 32-bit RGB input");
+	if(!j->initc) _throw("Instance has not been initialized for compression");
+
+	if(pitch==0) pitch=width*ps;
+
+	j->cinfo.image_width = width;
+	j->cinfo.image_height = height;
+	j->cinfo.input_components = ps;
+
+	#if JCS_EXTENSIONS==1
+	j->cinfo.in_color_space = JCS_EXT_RGB;
+	if(ps==3 && (flags&TJ_BGR))
+		j->cinfo.in_color_space = JCS_EXT_BGR;
+	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_RGBX;
+	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_BGRX;
+	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_XBGR;
+	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->cinfo.in_color_space = JCS_EXT_XRGB;
+	#else
+	#error "TurboJPEG requires JPEG colorspace extensions"
+	#endif
+
+	// Build the row table before setjmp(): a non-volatile local that is modified after
+	// setjmp() has an indeterminate value once longjmp() transfers control to the handler.
+	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
+		_throw("Memory allocation failed in tjCompress()");
+	for(i=0; i<height; i++)
+	{
+		if(flags&TJ_BOTTOMUP) row_pointer[i]= &srcbuf[(height-i-1)*pitch];
+		else row_pointer[i]= &srcbuf[i*pitch];
+	}
+
+	if(setjmp(j->jerr.jb))
+	{  // libjpeg reported an error: reset the compressor so the handle can be used again
+		jpeg_abort_compress(&j->cinfo);
+		free(row_pointer);  return -1;
+	}
+
+	jpeg_set_defaults(&j->cinfo);
+	jpeg_set_quality(&j->cinfo, qual, TRUE);
+	if(jpegsub==TJ_GRAYSCALE)
+		jpeg_set_colorspace(&j->cinfo, JCS_GRAYSCALE);
+	else
+		jpeg_set_colorspace(&j->cinfo, JCS_YCbCr);
+	j->cinfo.dct_method = JDCT_FASTEST;
+
+	j->cinfo.comp_info[0].h_samp_factor=hsampfactor[jpegsub];
+	j->cinfo.comp_info[1].h_samp_factor=1;
+	j->cinfo.comp_info[2].h_samp_factor=1;
+	j->cinfo.comp_info[0].v_samp_factor=vsampfactor[jpegsub];
+	j->cinfo.comp_info[1].v_samp_factor=1;
+	j->cinfo.comp_info[2].v_samp_factor=1;
+
+	j->jdms.next_output_byte = dstbuf;
+	j->jdms.free_in_buffer = TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height);
+	jpeg_start_compress(&j->cinfo, TRUE);
+	while(j->cinfo.next_scanline<j->cinfo.image_height)
+	{
+		jpeg_write_scanlines(&j->cinfo, &row_pointer[j->cinfo.next_scanline],
+			j->cinfo.image_height-j->cinfo.next_scanline);
+	}
+	jpeg_finish_compress(&j->cinfo);
+	*size=TJBUFSIZE(j->cinfo.image_width, j->cinfo.image_height)-(j->jdms.free_in_buffer);
+	free(row_pointer);
+	return 0;
+}
+
+
+// DEC
+
+static boolean fill_input_buffer (struct jpeg_decompress_struct *dinfo)
+{ // installed as jsms.fill_input_buffer: raises an error instead of supplying more input
+ ERREXIT(dinfo, JERR_BUFFER_SIZE);
+ return TRUE; // not reached when error_exit is my_error_exit, which longjmp()s
+}
+
+static void skip_input_data (struct jpeg_decompress_struct *dinfo, long num_bytes)
+{
+	if(num_bytes<=0) return;  // nothing to skip; also keeps a negative count out of the size_t casts
+	if((size_t)num_bytes>dinfo->src->bytes_in_buffer) ERREXIT(dinfo, JERR_BUFFER_SIZE);  // skip runs past the input
+	dinfo->src->next_input_byte += (size_t) num_bytes;
+	dinfo->src->bytes_in_buffer -= (size_t) num_bytes;
+}
+
+static void source_noop (struct jpeg_decompress_struct *dinfo) {}  // init_source/term_source: nothing to do
+
+DLLEXPORT tjhandle DLLCALL tjInitDecompress(void)
+{
+	// Zero-filled instance, so initc and initd start out as 0
+	jpgstruct *inst=(jpgstruct *)calloc(1, sizeof(jpgstruct));
+	if(inst==NULL) {sprintf(lasterror, "Memory allocation failure");  return NULL;}
+
+	// Route libjpeg errors through my_error_exit(), which longjmp()s to the setjmp() below
+	inst->dinfo.err=jpeg_std_error(&inst->jerr.pub);
+	inst->jerr.pub.error_exit=my_error_exit;
+	inst->jerr.pub.output_message=my_output_message;
+	if(setjmp(inst->jerr.jb))
+	{  // reached when jpeg_create_decompress() reports an error
+		free(inst);  return NULL;
+	}
+
+	// Source manager: no-op init/term, and running out of input is an error
+	jpeg_create_decompress(&inst->dinfo);
+	inst->dinfo.src=&inst->jsms;
+	inst->jsms.init_source=source_noop;
+	inst->jsms.fill_input_buffer = fill_input_buffer;
+	inst->jsms.skip_input_data = skip_input_data;
+	inst->jsms.resync_to_restart = jpeg_resync_to_restart;
+	inst->jsms.term_source = source_noop;
+	inst->initd=1;
+	return (tjhandle)inst;
+}
+
+
+DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle h,
+	unsigned char *srcbuf, unsigned long size,
+	int *width, int *height)
+{
+	checkhandle(h);
+	// Read only the JPEG header in srcbuf and report the image dimensions; returns 0, or -1 with lasterror set
+	if(srcbuf==NULL || size==0 || width==NULL || height==NULL)
+		_throw("Invalid argument in tjDecompressHeader()");
+	if(!j->initd) _throw("Instance has not been initialized for decompression");
+
+	if(setjmp(j->jerr.jb))
+	{  // libjpeg reported an error: reset the decompressor so the next call starts from a clean state
+		jpeg_abort_decompress(&j->dinfo);
+		return -1;
+	}
+
+	j->jsms.bytes_in_buffer = size;
+	j->jsms.next_input_byte = srcbuf;
+	jpeg_read_header(&j->dinfo, TRUE);
+
+	*width=j->dinfo.image_width;  *height=j->dinfo.image_height;
+
+	jpeg_abort_decompress(&j->dinfo);
+
+	if(*width<1 || *height<1) _throw("Invalid data returned in header");
+	return 0;
+}
+
+
+DLLEXPORT int DLLCALL tjDecompress(tjhandle h,
+	unsigned char *srcbuf, unsigned long size,
+	unsigned char *dstbuf, int width, int pitch, int height, int ps,
+	int flags)
+{
+	int i;  JSAMPROW *row_pointer=NULL;
+	// Decode the JPEG in srcbuf into the caller's width x height pixel buffer; returns 0, or -1 with lasterror set
+	checkhandle(h);
+
+	if(srcbuf==NULL || size<=0
+		|| dstbuf==NULL || width<=0 || pitch<0 || height<=0)
+		_throw("Invalid argument in tjDecompress()");
+	if(ps!=3 && ps!=4) _throw("This decompressor can only generate 24-bit or 32-bit RGB output");
+	if(!j->initd) _throw("Instance has not been initialized for decompression");
+	if(pitch==0) pitch=width*ps;
+
+	// Build the row table before setjmp(): a local modified after setjmp() is indeterminate after longjmp()
+	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
+		_throw("Memory allocation failed in tjDecompress()");
+	for(i=0; i<height; i++)
+	{
+		if(flags&TJ_BOTTOMUP) row_pointer[i]= &dstbuf[(height-i-1)*pitch];
+		else row_pointer[i]= &dstbuf[i*pitch];
+	}
+	if(setjmp(j->jerr.jb))
+	{  // libjpeg reported an error: reset the decompressor so the handle can be used again
+		jpeg_abort_decompress(&j->dinfo);
+		free(row_pointer);  return -1;
+	}
+
+	j->jsms.bytes_in_buffer = size;
+	j->jsms.next_input_byte = srcbuf;
+	jpeg_read_header(&j->dinfo, TRUE);
+	if(j->dinfo.image_width>(JDIMENSION)width || j->dinfo.image_height>(JDIMENSION)height)
+	{  // the scanline loop stores every image row, so the image must fit the height-entry row table
+		jpeg_abort_decompress(&j->dinfo);  free(row_pointer);
+		_throw("JPEG image is larger than the destination buffer in tjDecompress()");
+	}
+	#if JCS_EXTENSIONS==1
+	j->dinfo.out_color_space = JCS_EXT_RGB;
+	if(ps==3 && (flags&TJ_BGR))
+		j->dinfo.out_color_space = JCS_EXT_BGR;
+	else if(ps==4 && !(flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_RGBX;
+	else if(ps==4 && (flags&TJ_BGR) && !(flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_BGRX;
+	else if(ps==4 && (flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_XBGR;
+	else if(ps==4 && !(flags&TJ_BGR) && (flags&TJ_ALPHAFIRST))
+		j->dinfo.out_color_space = JCS_EXT_XRGB;
+	#else
+	#error "TurboJPEG requires JPEG colorspace extensions"
+	#endif
+	if(flags&TJ_FASTUPSAMPLE) j->dinfo.do_fancy_upsampling=FALSE;
+	jpeg_start_decompress(&j->dinfo);
+	while(j->dinfo.output_scanline<j->dinfo.output_height)
+	{
+		jpeg_read_scanlines(&j->dinfo, &row_pointer[j->dinfo.output_scanline],
+			j->dinfo.output_height-j->dinfo.output_scanline);
+	}
+	jpeg_finish_decompress(&j->dinfo);
+	free(row_pointer);
+	return 0;
+}
+
+
+// General
+
+DLLEXPORT char* DLLCALL tjGetErrorStr(void)
+{
+ return lasterror; // file-scope static buffer shared by every handle; initially "No error"
+}
+
+DLLEXPORT int DLLCALL tjDestroy(tjhandle h)
+{
+ checkhandle(h); // returns -1 with "Invalid handle" when h is NULL
+ if(setjmp(j->jerr.jb)) return -1; // a libjpeg error during teardown returns -1; j is not freed on this path
+ if(j->initc) jpeg_destroy_compress(&j->cinfo); // only tear down the codec(s) this handle created
+ if(j->initd) jpeg_destroy_decompress(&j->dinfo);
+ free(j);
+ return 0;
+}
diff --git a/jconfig.vc b/win/jconfig.h
similarity index 97%
rename from jconfig.vc
rename to win/jconfig.h
index 7e291c7..7987ba6 100644
--- a/jconfig.vc
+++ b/win/jconfig.h
@@ -21,6 +21,7 @@
#endif
#define HAVE_BOOLEAN /* prevent jmorecfg.h from redefining it */
+#define inline __inline
#ifdef JPEG_INTERNALS